In [27]:
import pandas as pd
import re

# Load the tweet export into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_json('DubNation_2022_11_03_10ktweet_cleaned.json')
# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,Date,User,Tweet
0,2022-10-28 01:44:26,StephenCurry30,Lock in! #DubNation
1,2022-10-28 04:23:49,warriors,All love for #DubNation 💛 https://t.co/r4QZHpdkKd
2,2022-10-28 12:45:30,NBA,Steph had the Chase Center rocking as he dropp...
3,2022-10-29 23:59:44,AustinBrother84,"RT @warriors: #DubNation, you ready?\n\nStream..."
4,2022-10-29 23:59:31,bathie39,RT @StephenCurry30: Lock in! #DubNation


In [28]:
# Start the cleaning pipeline from a lower-cased copy of the raw text;
# the original 'Tweet' column is left as-is so both can be compared.
df['clean_text'] = df['Tweet'].str.lower()
df.head()

Unnamed: 0,Date,User,Tweet,clean_text
0,2022-10-28 01:44:26,StephenCurry30,Lock in! #DubNation,lock in! #dubnation
1,2022-10-28 04:23:49,warriors,All love for #DubNation 💛 https://t.co/r4QZHpdkKd,all love for #dubnation 💛 https://t.co/r4qzhpdkkd
2,2022-10-28 12:45:30,NBA,Steph had the Chase Center rocking as he dropp...,steph had the chase center rocking as he dropp...
3,2022-10-29 23:59:44,AustinBrother84,"RT @warriors: #DubNation, you ready?\n\nStream...","rt @warriors: #dubnation, you ready?\n\nstream..."
4,2022-10-29 23:59:31,bathie39,RT @StephenCurry30: Lock in! #DubNation,rt @stephencurry30: lock in! #dubnation


In [29]:
def remove_special_chars(text):
    """Strip markup, URLs, hashtags, mentions and non-letters from a tweet.

    Parameters
    ----------
    text : str
        Tweet text to clean.

    Returns
    -------
    str
        The remaining ASCII-letter words separated by single spaces, with
        no leading or trailing whitespace ('' if nothing is left).
    """
    # Drop anything that looks like an HTML/XML tag.
    text = re.sub(r'<.*?>', '', text)
    # URLs are removed before mentions/hashtags: a link such as
    # https://medium.com/@author/post would otherwise be split at the '@'
    # and its tail would survive as ordinary words.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # \w+ also consumes digits, so '#nba75th' and '@user1name' are removed
    # whole instead of leaking the part after the first digit.
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub(r'@\w+', ' ', text)
    # Everything that is not an ASCII letter (digits, punctuation, emoji)
    # becomes a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Collapse whitespace runs (raw string avoids the invalid '\s' escape).
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [30]:
# Run the regex clean-up over every tweet; the function is handed to
# apply() directly, so no wrapper lambda is needed.
df['clean_text'] = df['clean_text'].apply(remove_special_chars)
df.head()

Unnamed: 0,Date,User,Tweet,clean_text
0,2022-10-28 01:44:26,StephenCurry30,Lock in! #DubNation,lock in
1,2022-10-28 04:23:49,warriors,All love for #DubNation 💛 https://t.co/r4QZHpdkKd,all love for
2,2022-10-28 12:45:30,NBA,Steph had the Chase Center rocking as he dropp...,steph had the chase center rocking as he dropp...
3,2022-10-29 23:59:44,AustinBrother84,"RT @warriors: #DubNation, you ready?\n\nStream...",rt you ready stream the game live
4,2022-10-29 23:59:31,bathie39,RT @StephenCurry30: Lock in! #DubNation,rt lock in


In [31]:
from nltk.corpus import stopwords
# Show NLTK's English stop-word list as one comma-separated string.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally — confirm on a fresh environment.
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [32]:
# Set of English stop words, built once so membership checks are cheap.
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in
    STOPWORDS dropped; surviving tokens are joined by single spaces."""
    kept = []
    for token in text.split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

In [33]:
# Drop stop words from every cleaned tweet; the function is passed to
# apply() directly rather than through a lambda.
df['clean_text'] = df['clean_text'].apply(remove_stopwords)
df.head()

Unnamed: 0,Date,User,Tweet,clean_text
0,2022-10-28 01:44:26,StephenCurry30,Lock in! #DubNation,lock
1,2022-10-28 04:23:49,warriors,All love for #DubNation 💛 https://t.co/r4QZHpdkKd,love
2,2022-10-28 12:45:30,NBA,Steph had the Chase Center rocking as he dropp...,steph chase center rocking dropped pts second ...
3,2022-10-29 23:59:44,AustinBrother84,"RT @warriors: #DubNation, you ready?\n\nStream...",rt ready stream game live
4,2022-10-29 23:59:31,bathie39,RT @StephenCurry30: Lock in! #DubNation,rt lock


In [34]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# First letter of a pos_tag() tag -> WordNet part-of-speech constant.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}

def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text`.

    Tokens are POS-tagged first; the tag's first letter selects the WordNet
    part of speech via `wordnet_map`, falling back to noun for any other
    tag. Returns the lemmas joined by single spaces.
    """
    lemmas = []
    for token, tag in pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

In [35]:
# Lemmatize every tweet; the function is passed to apply() directly
# rather than through a lambda.
df['clean_text'] = df['clean_text'].apply(lemmatize_words)
df.head()

Unnamed: 0,Date,User,Tweet,clean_text
0,2022-10-28 01:44:26,StephenCurry30,Lock in! #DubNation,lock
1,2022-10-28 04:23:49,warriors,All love for #DubNation 💛 https://t.co/r4QZHpdkKd,love
2,2022-10-28 12:45:30,NBA,Steph had the Chase Center rocking as he dropp...,steph chase center rock drop pt second half le...
3,2022-10-29 23:59:44,AustinBrother84,"RT @warriors: #DubNation, you ready?\n\nStream...",rt ready stream game live
4,2022-10-29 23:59:31,bathie39,RT @StephenCurry30: Lock in! #DubNation,rt lock


In [36]:
# Persist the frame, including the 'clean_text' column, as JSON in the
# notebook's working directory.
df.to_json('Example_cleaned.json')