In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from fastai.text.all import *
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from gensim import models
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xinyifang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xinyifang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/xinyifang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 1. Data
The two datasets that we used for this project were found from Kaggle:

In [2]:
path = Path('/Users/xinyifang/Desktop/sentiment-analysis-main/datasets/')
tweets_covid_all_vaccination = pd.read_csv(path/'tweets_covid_all_vaccination.csv')
tweets_extraction = pd.read_csv(path/'tweets_extraction.csv')

In [3]:
# Remove Emojis Helper
def remove_emojis(str):
    return str.encode('ascii', 'ignore').decode('ascii')

# Remove URLs, Hashtags, handles, and Emojis
def remove(ts, idx='text'):
    ts['orig_text'] = ts[idx]
    ts[idx] = ts[idx].apply(lambda x:re.sub('@[^\s]+','',x))
    ts[idx] = ts[idx].apply(lambda x:re.sub(r"http\S+", "", x))
    ts[idx] = ts[idx].apply(remove_emojis)
    ts[idx] = ts[idx].apply(lambda x:re.sub(r'\B#\S+','',x))
    return ts[ts[idx]!='']

tweets_covid_all_vaccination['sentiment'] = np.nan
tweets_covid_all_vaccination = remove(tweets_covid_all_vaccination)
tweets_extraction = tweets_extraction[['old_text', 'new_sentiment']].rename(columns={'old_text':'text', 'new_sentiment':'sentiment'})
tweets_extraction = remove(tweets_extraction)
tweets_merge = tweets_extraction[['text', 'sentiment']].append(tweets_covid_all_vaccination[['text', 'sentiment']])
tweets = tweets_merge.dropna(subset=['sentiment'])

In [4]:
tweets

Unnamed: 0,text,sentiment
1,Layin n bed with a headache ughhhh...waitin on your call...,negative
2,Funeral ceremony...gloomy friday...,negative
3,wants to hang out with friends SOON!,positive
4,"We want to trade with someone who has Houston tickets, but no one will.",neutral
5,Re-pinging why didn't you go to prom? BC my bf didn't like my friends,negative
...,...,...
39994,Succesfully following Tayla!!,positive
39996,Happy Mothers Day All my love,positive
39997,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",positive
39998,WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,neutral


In [5]:
tweets['sentiment'] = tweets['sentiment'].apply(lambda x: np.where(tweets['sentiment'].unique()== x )[0][0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['sentiment'] = tweets['sentiment'].apply(lambda x: np.where(tweets['sentiment'].unique()== x )[0][0])


In [6]:
tweets

Unnamed: 0,text,sentiment
1,Layin n bed with a headache ughhhh...waitin on your call...,0
2,Funeral ceremony...gloomy friday...,0
3,wants to hang out with friends SOON!,1
4,"We want to trade with someone who has Houston tickets, but no one will.",2
5,Re-pinging why didn't you go to prom? BC my bf didn't like my friends,0
...,...,...
39994,Succesfully following Tayla!!,1
39996,Happy Mothers Day All my love,1
39997,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",1
39998,WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,2


In [7]:
stopwords_list = stopwords.words('english')
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
tweets["text"].shape

(31323,)

In [9]:
tweets["text"][:10]

1                 Layin n bed with a headache  ughhhh...waitin on your call...
2                                          Funeral ceremony...gloomy friday...
3                                         wants to hang out with friends SOON!
4      We want to trade with someone who has Houston tickets, but no one will.
5       Re-pinging  why didn't you go to prom? BC my bf didn't like my friends
7                                                               Hmmm.  is down
8                                                 Charlene my love. I miss you
9                                             I'm sorry  at least it's Friday?
10                                                            cant fall asleep
11                                                     Choked on her retainers
Name: text, dtype: object

In [10]:
all_text_tokens = []
for sentence in tweets["text"]:
    content_token = nltk.word_tokenize(sentence)

    lower_token = []
    for token in content_token:
        lower_token.append(token.lower())

    punctuation_token = []
    for token in lower_token:
        punctuation_token.append(re.sub(r'[^\w\s]+', '', token))

    small_token = []
    for token in punctuation_token:
        if len(token)>1:
            small_token.append(token)

    stop_token = []
    for token in small_token:
        if token not in stopwords_list:
            stop_token.append(token)

    lemmatization_token = []
    for token in stop_token:
        lemmatization_token.append(WordNetLemmatizer().lemmatize(token))

    stemming_token = []
    for token in lemmatization_token:
        stemming_token.append(PorterStemmer().stem(token))
    
    all_text_tokens.append(stemming_token)
print(all_text_tokens[:10])

[['layin', 'bed', 'headach', 'ughhhh', 'waitin', 'call'], ['funer', 'ceremoni', 'gloomi', 'friday'], ['want', 'hang', 'friend', 'soon'], ['want', 'trade', 'someon', 'houston', 'ticket', 'one'], ['reping', 'nt', 'go', 'prom', 'bc', 'bf', 'nt', 'like', 'friend'], ['hmmm'], ['charlen', 'love', 'miss'], ['sorri', 'least', 'friday'], ['cant', 'fall', 'asleep'], ['choke', 'retain']]


In [11]:
tweets['text_final'] = [' '.join(text) for text in all_text_tokens]
tweets['text_tokens'] = all_text_tokens
tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['text_final'] = [' '.join(text) for text in all_text_tokens]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['text_tokens'] = all_text_tokens


Unnamed: 0,text,sentiment,text_final,text_tokens
1,Layin n bed with a headache ughhhh...waitin on your call...,0,layin bed headach ughhhh waitin call,"[layin, bed, headach, ughhhh, waitin, call]"
2,Funeral ceremony...gloomy friday...,0,funer ceremoni gloomi friday,"[funer, ceremoni, gloomi, friday]"
3,wants to hang out with friends SOON!,1,want hang friend soon,"[want, hang, friend, soon]"
4,"We want to trade with someone who has Houston tickets, but no one will.",2,want trade someon houston ticket one,"[want, trade, someon, houston, ticket, one]"
5,Re-pinging why didn't you go to prom? BC my bf didn't like my friends,0,reping nt go prom bc bf nt like friend,"[reping, nt, go, prom, bc, bf, nt, like, friend]"
...,...,...,...,...
39994,Succesfully following Tayla!!,1,succes follow tayla,"[succes, follow, tayla]"
39996,Happy Mothers Day All my love,1,happi mother day love,"[happi, mother, day, love]"
39997,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",1,happi mother day mommi woman man long momma someon day,"[happi, mother, day, mommi, woman, man, long, momma, someon, day]"
39998,WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,2,wassup beauti follow peep new hit singl wwwmyspacecomipsohot def wat video,"[wassup, beauti, follow, peep, new, hit, singl, wwwmyspacecomipsohot, def, wat, video]"


# 2. Vaccine Tweets Sentiment Analysis
## 1) Baseline Model - Naive Bayes

In [12]:
X = tweets["text_final"]
y = tweets["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=410)

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((25058,), (6265,), (25058,), (6265,))

In [14]:
vector_nb = TfidfVectorizer(stop_words = stopwords.words('english')).fit(X)
X_train_nb = vector_nb.transform(X_train)
X_test_nb = vector_nb.transform(X_test)

In [15]:
my_naive_bayes = MultinomialNB()
my_naive_bayes.fit(X_train_nb, y_train)

MultinomialNB()

In [16]:
pred = my_naive_bayes.predict(X_test_nb)
accuracy_score = metrics.accuracy_score(pred, y_test)
print(f"Naive Bayes accuary: {accuracy_score}")

Naive Bayes accuary: 0.6260175578611333


## 2) Baseline Model - XGBoost

In [17]:
vector1 = TfidfVectorizer(stop_words = stopwords.words('english')).fit(X)
X_train1 = vector1.transform(X_train)
X_test1 = vector1.transform(X_test)

XGB_model_1 = XGBClassifier(random_state=410)
XGB_model_1.fit(X_train1, y_train)

print(f"XGBoost train accuary: {XGB_model_1.score(X_train1, y_train)}")
print(f"XGBoost test accuary: {XGB_model_1.score(X_test1, y_test)}")



XGBoost train accuary: 0.76678106792242
XGBoost test accuary: 0.6916201117318436


In [18]:
vector2 = TfidfVectorizer(stop_words = stopwords.words('english'), ngram_range=(1, 2)).fit(X)
X_train2 = vector2.transform(X_train)
X_test2 = vector2.transform(X_test)

XGB_model_2 = XGBClassifier(random_state=410)
XGB_model_2.fit(X_train2, y_train)

print(f"XGBoost train accuary: {XGB_model_2.score(X_train2, y_train)}")
print(f"XGBoost test accuary: {XGB_model_2.score(X_test2, y_test)}")



XGBoost train accuary: 0.7686567164179104
XGBoost test accuary: 0.6927374301675978


In [19]:
vector3 = TfidfVectorizer(stop_words = stopwords.words('english'), ngram_range=(1, 3)).fit(X)
X_train3 = vector3.transform(X_train)
X_test3 = vector3.transform(X_test)

XGB_model_3 = XGBClassifier(random_state=410)
XGB_model_3.fit(X_train3, y_train)

print(f"XGBoost train accuary: {XGB_model_3.score(X_train3, y_train)}")
print(f"XGBoost test accuary: {XGB_model_3.score(X_test3, y_test)}")



XGBoost train accuary: 0.7692154202250778
XGBoost test accuary: 0.6964086193136473


In [20]:
vector4 = TfidfVectorizer(stop_words = stopwords.words('english'), ngram_range=(1, 4)).fit(X)
X_train4 = vector4.transform(X_train)
X_test4 = vector4.transform(X_test)

XGB_model_4 = XGBClassifier(random_state=410)
XGB_model_4.fit(X_train4, y_train)

print(f"XGBoost train accuary: {XGB_model_4.score(X_train4, y_train)}")
print(f"XGBoost test accuary: {XGB_model_4.score(X_test4, y_test)}")



XGBoost train accuary: 0.76797829036635
XGBoost test accuary: 0.6951316839584996


## 3) Deep-Learning-based Model - CNN

In [21]:
neg = []
pos = []
netrl = []

for senti in tweets['sentiment']:
    # negative
    if senti == 0:
        neg.append(1)
        pos.append(0)
        netrl.append(0)
    # positive
    elif senti == 1:
        neg.append(0)
        pos.append(1)
        netrl.append(0)    
    # neutral
    else:
        neg.append(0)
        pos.append(0)
        netrl.append(1)
        
tweets['neg']= neg
tweets['pos']= pos
tweets['netrl'] = netrl
tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['neg']= neg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['pos']= pos
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets['netrl'] = netrl


Unnamed: 0,text,sentiment,text_final,text_tokens,neg,pos,netrl
1,Layin n bed with a headache ughhhh...waitin on your call...,0,layin bed headach ughhhh waitin call,"[layin, bed, headach, ughhhh, waitin, call]",1,0,0
2,Funeral ceremony...gloomy friday...,0,funer ceremoni gloomi friday,"[funer, ceremoni, gloomi, friday]",1,0,0
3,wants to hang out with friends SOON!,1,want hang friend soon,"[want, hang, friend, soon]",0,1,0
4,"We want to trade with someone who has Houston tickets, but no one will.",2,want trade someon houston ticket one,"[want, trade, someon, houston, ticket, one]",0,0,1
5,Re-pinging why didn't you go to prom? BC my bf didn't like my friends,0,reping nt go prom bc bf nt like friend,"[reping, nt, go, prom, bc, bf, nt, like, friend]",1,0,0
...,...,...,...,...,...,...,...
39994,Succesfully following Tayla!!,1,succes follow tayla,"[succes, follow, tayla]",0,1,0
39996,Happy Mothers Day All my love,1,happi mother day love,"[happi, mother, day, love]",0,1,0
39997,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!",1,happi mother day mommi woman man long momma someon day,"[happi, mother, day, mommi, woman, man, long, momma, someon, day]",0,1,0
39998,WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!,2,wassup beauti follow peep new hit singl wwwmyspacecomipsohot def wat video,"[wassup, beauti, follow, peep, new, hit, singl, wwwmyspacecomipsohot, def, wat, video]",0,0,1


In [22]:
data = tweets[['text_final', 'text_tokens', 'sentiment', 'neg', 'pos', 'netrl']]
data.head()

Unnamed: 0,text_final,text_tokens,sentiment,neg,pos,netrl
1,layin bed headach ughhhh waitin call,"[layin, bed, headach, ughhhh, waitin, call]",0,1,0,0
2,funer ceremoni gloomi friday,"[funer, ceremoni, gloomi, friday]",0,1,0,0
3,want hang friend soon,"[want, hang, friend, soon]",1,0,1,0
4,want trade someon houston ticket one,"[want, trade, someon, houston, ticket, one]",2,0,0,1
5,reping nt go prom bc bf nt like friend,"[reping, nt, go, prom, bc, bf, nt, like, friend]",0,1,0,0


In [23]:
# Reference: https://notebook.community/rahulavadhoot/Portfolio/projects/natural%20language%20processing/Disasters%20on%20social%20media/Disasters%20on%20social%20media
train_words = [word for tokens in data["text_tokens"] for word in tokens]
train_vocs = sorted(list(set(train_words)))
print(str(len(train_words)) + " train words")
print(str(len(train_vocs)) + " train vocabularies")

223166 train words
21290 train vocabularies


In [24]:
word2vec = models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [25]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['text_tokens'].apply(lambda x: get_average_word2vec(x, vectors, generate_missing=generate_missing))
    return list(embeddings)

In [26]:
train_embeddings = get_word2vec_embeddings(word2vec, data, generate_missing=True)

In [27]:
text_final_list = data["text_final"].tolist()
tokenizer = Tokenizer(num_words=len(train_vocs), char_level=False)
tokenizer.fit_on_texts(text_final_list)
train_sequences = tokenizer.texts_to_sequences(text_final_list)
train_tokens = tokenizer.word_index
print(str(len(train_tokens)) + " unique tokens after vectorization")

21270 unique tokens after vectorization


In [36]:
MAX_SEQUENCE_LENGTH = 65
EMBEDDING_DIM = 300

In [37]:
train_cnn_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [38]:
train_embedding_weights = np.zeros((len(train_tokens)+1, EMBEDDING_DIM))
for word,index in train_tokens.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(21271, 300)


In [39]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [40]:
y_train = data[['neg', 'pos', 'netrl']].values

In [41]:
x_train = train_cnn_data
y_tr = y_train
print(y_tr)

[[1 0 0]
 [1 0 0]
 [0 1 0]
 ...
 [0 1 0]
 [0 0 1]
 [0 0 1]]


In [42]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_tokens)+1, EMBEDDING_DIM, len(list(['neg', 'pos', 'netrl'])))




Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 65, 300)      6381300     ['input_2[0][0]']                
                                                                                                  
 conv1d_5 (Conv1D)              (None, 64, 200)      120200      ['embedding_1[0][0]']            
                                                                                                  
 conv1d_6 (Conv1D)              (None, 63, 200)      180200      ['embedding_1[0][0]']            
                                                                                            

In [43]:
num_epochs = 3
batch_size = 80
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.2, shuffle=True, batch_size=batch_size)

Epoch 1/3
Epoch 2/3
Epoch 3/3
