Imports

In [11]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
df = pd.read_csv('tweet_data.csv')
df.head()

Unnamed: 0,textID,tweet_text,sentiment
0,1956967666,Layin n bed with a headache ughhhh...waitin o...,negative
1,1956967696,Funeral ceremony...gloomy friday...,negative
2,1956967789,wants to hang out with friends SOON!,positive
3,1956968477,Re-pinging @ghostridah14: why didn't you go to...,negative
4,1956968636,Hmmm. http://www.djhero.com/ is down,negative


In [13]:
df['tweet_text'] = df['tweet_text'].str.lower()
df['sentiment'] = df['sentiment'].str.lower()
df.head()

Unnamed: 0,textID,tweet_text,sentiment
0,1956967666,layin n bed with a headache ughhhh...waitin o...,negative
1,1956967696,funeral ceremony...gloomy friday...,negative
2,1956967789,wants to hang out with friends soon!,positive
3,1956968477,re-pinging @ghostridah14: why didn't you go to...,negative
4,1956968636,hmmm. http://www.djhero.com/ is down,negative


In [14]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')

In [15]:
df['tweet_text'] = df['tweet_text'].str.replace(r'http\S+', '', regex=True)
df['tweet_text'] = df['tweet_text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
df['tweet_text'] = df['tweet_text'].str.replace(r'\s+', ' ', regex=True)
df['tweet_text'] = df['tweet_text'].str.replace(r'\d+', '', regex=True)
df['tweet_text'] = df['tweet_text'].str.strip()
df.head()

Unnamed: 0,textID,tweet_text,sentiment
0,1956967666,layin n bed with a headache ughhhhwaitin on yo...,negative
1,1956967696,funeral ceremonygloomy friday,negative
2,1956967789,wants to hang out with friends soon,positive
3,1956968477,repinging ghostridah why didnt you go to prom ...,negative
4,1956968636,hmmm is down,negative


In [16]:
df['tokens'] = df['tweet_text'].apply(word_tokenize)
df.head()

Unnamed: 0,textID,tweet_text,sentiment,tokens
0,1956967666,layin n bed with a headache ughhhhwaitin on yo...,negative,"[layin, n, bed, with, a, headache, ughhhhwaiti..."
1,1956967696,funeral ceremonygloomy friday,negative,"[funeral, ceremonygloomy, friday]"
2,1956967789,wants to hang out with friends soon,positive,"[wants, to, hang, out, with, friends, soon]"
3,1956968477,repinging ghostridah why didnt you go to prom ...,negative,"[repinging, ghostridah, why, didnt, you, go, t..."
4,1956968636,hmmm is down,negative,"[hmmm, is, down]"


In [17]:
df['tokens'] = df['tokens'].apply(lambda x: [item for item in x if item not in stopwords])
df.head()

Unnamed: 0,textID,tweet_text,sentiment,tokens
0,1956967666,layin n bed with a headache ughhhhwaitin on yo...,negative,"[layin, n, bed, headache, ughhhhwaitin, call]"
1,1956967696,funeral ceremonygloomy friday,negative,"[funeral, ceremonygloomy, friday]"
2,1956967789,wants to hang out with friends soon,positive,"[wants, hang, friends, soon]"
3,1956968477,repinging ghostridah why didnt you go to prom ...,negative,"[repinging, ghostridah, didnt, go, prom, bc, b..."
4,1956968636,hmmm is down,negative,[hmmm]


In [18]:
df['stemmed'] = df['tokens'].apply(lambda x: [stemmer.stem(y) for y in x])
df.head()

Unnamed: 0,textID,tweet_text,sentiment,tokens,stemmed
0,1956967666,layin n bed with a headache ughhhhwaitin on yo...,negative,"[layin, n, bed, headache, ughhhhwaitin, call]","[layin, n, bed, headach, ughhhhwaitin, call]"
1,1956967696,funeral ceremonygloomy friday,negative,"[funeral, ceremonygloomy, friday]","[funer, ceremonygloomi, friday]"
2,1956967789,wants to hang out with friends soon,positive,"[wants, hang, friends, soon]","[want, hang, friend, soon]"
3,1956968477,repinging ghostridah why didnt you go to prom ...,negative,"[repinging, ghostridah, didnt, go, prom, bc, b...","[reping, ghostridah, didnt, go, prom, bc, bf, ..."
4,1956968636,hmmm is down,negative,[hmmm],[hmmm]


In [19]:
df['lemmatized'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x])
df.head()

Unnamed: 0,textID,tweet_text,sentiment,tokens,stemmed,lemmatized
0,1956967666,layin n bed with a headache ughhhhwaitin on yo...,negative,"[layin, n, bed, headache, ughhhhwaitin, call]","[layin, n, bed, headach, ughhhhwaitin, call]","[layin, n, bed, headache, ughhhhwaitin, call]"
1,1956967696,funeral ceremonygloomy friday,negative,"[funeral, ceremonygloomy, friday]","[funer, ceremonygloomi, friday]","[funeral, ceremonygloomy, friday]"
2,1956967789,wants to hang out with friends soon,positive,"[wants, hang, friends, soon]","[want, hang, friend, soon]","[want, hang, friend, soon]"
3,1956968477,repinging ghostridah why didnt you go to prom ...,negative,"[repinging, ghostridah, didnt, go, prom, bc, b...","[reping, ghostridah, didnt, go, prom, bc, bf, ...","[repinging, ghostridah, didnt, go, prom, bc, b..."
4,1956968636,hmmm is down,negative,[hmmm],[hmmm],[hmmm]


In [None]:
cv = CountVectorizer(ngram_range=(2, 2))
bigrams = cv.fit_transform(df['tweet_text'])
bigrams = bigrams.toarray()
bigrams = pd.DataFrame(bigrams, columns=cv.get_feature_names_out())
bigrams = bigrams.sum().sort_values(ascending=False)
bigrams.head()


mothers day      671
in the           541
happy mothers    538
going to         476
to be            390
dtype: Sparse[int64, 0]

In [21]:
cv = CountVectorizer(ngram_range=(3, 3))
trigrams = cv.fit_transform(df['tweet_text'])
trigrams = pd.DataFrame.sparse.from_spmatrix(trigrams, columns=cv.get_feature_names_out())
trigrams = trigrams.sum().sort_values(ascending=False)
trigrams.head()

happy mothers day    524
mothers day to       196
day to all           151
to go to             114
thanks for the        94
dtype: Sparse[int64, 0]

In [22]:
#bag of words
cv = CountVectorizer()
cv.fit(df['tweet_text'])
X = cv.transform(df['tweet_text'])
X = pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out())
X.head()


Unnamed: 0,aaaa,aaaaa,aaaaaaaa,aaaaaaaaaahhhhhhhh,aaaaaaaaaamazing,aaaaaaaafternoon,aaaaaalcohol,aaaaahhhh,aaaaall,aaaaaoouoouoouu,...,zurieventsllc,zuzu,zwriter,zykloid,zyrtec,zzwhitejd,zzzz,zzzzy,zzzzzzzgoodnight,zzzzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
#tf-idf
tfidf = TfidfVectorizer(max_features=1000)  # Limiting the number of features to 1000
tfidf.fit(df['tweet_text'])
X = tfidf.transform(df['tweet_text'])
X = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())
X.head()

Unnamed: 0,able,about,absolutely,account,actually,afraid,after,afternoon,again,ago,...,youll,your,youre,yours,yourself,youtube,youve,yr,yum,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.339441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
