Import All necessary modules

In [1]:
import time
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold, cross_validate
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score

#import deepcopy
from copy import deepcopy

import nltk
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [2]:
df = pd.read_csv("sentiment_analysis.csv")

In [4]:
df.shape

(550391, 3)

In [6]:
df.head(2)

Unnamed: 0,ID,text,label
0,7.68098e+17,Josh Jenkins is looking forward to TAB Breeder...,1
1,7.68098e+17,RT @MianUsmanJaved: Congratulations Pakistan o...,1


#Preprocessing  Data

convert everything to lower case

In [7]:
df['lower']=df['text'].str.lower()

remove HTML, Hashtags, Mentions, Digits, and Emojis

In [12]:
import re
# using the 're.sub' to remove urls.
def remove_urls(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www.(\w+)", "", text)
    return text
# calling function to remove all urls
df["cleaned_text"]=df["lower"].apply(remove_urls)

 Remove HTML entities

In [14]:
df['cleaned_text']=df['cleaned_text'].str.replace("&amp","")

remove stop words

In [15]:
# import nltk package to find stopwords
import nltk
from nltk.corpus import stopwords  
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Remove Punctuations

In [16]:
# using the 're.sub' to remove punctuation.
import re
def remove_punctuation(text):
    text = re.sub(r"[^\w\s]", "", text)
    return text
df["cleaned_text"] = df["cleaned_text"].apply(remove_punctuation)

remove numbers

In [18]:
def removing_numbers(text):
    return re.sub('[0-9]+', '', text)
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: removing_numbers(x))

Tokenize the data

In [20]:
import nltk
nltk.download('punkt')
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: word_tokenize(x))
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,ID,text,label,lower,cleaned_text
0,7.68098e+17,Josh Jenkins is looking forward to TAB Breeder...,1,josh jenkins is looking forward to tab breeder...,"[josh, jenkins, looking, forward, tab, breeder..."
1,7.68098e+17,RT @MianUsmanJaved: Congratulations Pakistan o...,1,rt @mianusmanjaved: congratulations pakistan o...,"[rt, mianusmanjaved, congratulations, pakistan..."
2,7.68098e+17,"RT @PEPalerts: This September, @YESmag is taki...",1,"rt @pepalerts: this september, @yesmag is taki...","[rt, pepalerts, september, yesmag, taking, mai..."
3,7.68098e+17,"RT @david_gaibis: Newly painted walls, thanks ...",1,"rt @david_gaibis: newly painted walls, thanks ...","[rt, david_gaibis, newly, painted, walls, than..."
4,7.68098e+17,RT @CedricFeschotte: Excited to announce: as o...,1,rt @cedricfeschotte: excited to announce: as o...,"[rt, cedricfeschotte, excited, announce, july,..."


Lemmatization of tokens

In [21]:
lemmatizer = nltk.stem.WordNetLemmatizer()
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df['cleaned_text'].head()

0    [josh, jenkins, looking, forward, tab, breeder...
1    [rt, mianusmanjaved, congratulation, pakistan,...
2    [rt, pepalerts, september, yesmag, taking, mai...
3    [rt, david_gaibis, newly, painted, wall, thank...
4    [rt, cedricfeschotte, excited, announce, july,...
Name: cleaned_text, dtype: object

Stemming of tokens

In [22]:
stemmer = nltk.stem.SnowballStemmer('english')
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: [stemmer.stem(word) for word in x])
df['cleaned_text'].head()

0    [josh, jenkin, look, forward, tab, breeder, cr...
1    [rt, mianusmanjav, congratul, pakistan, becom,...
2    [rt, pepalert, septemb, yesmag, take, main, me...
3    [rt, david_gaibi, newli, paint, wall, thank, m...
4    [rt, cedricfeschott, excit, announc, juli, fes...
Name: cleaned_text, dtype: object

Update the text column

In [24]:
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: " ".join(x))
df['cleaned_text'].head()

0    josh jenkin look forward tab breeder crown sup...
1    rt mianusmanjav congratul pakistan becom notes...
2    rt pepalert septemb yesmag take main mendoza s...
3    rt david_gaibi newli paint wall thank million ...
4    rt cedricfeschott excit announc juli feschott ...
Name: cleaned_text, dtype: object

## feature extraction with TF_IDF vectorizer

In [25]:
## Tuned parameters
scale_pos_weight=df['label'].value_counts()[0]/df['label'].value_counts()[1]
# best computational cost to score ratio is at 10000 for xgboost
max_features = 10000
param_grid = {
    'max_depth' : 25,
    'n_estimators' : 100,
    'min_child_weight': 3,
    'subsample' : 0.5,
    'scale_pos_weight' : scale_pos_weight
}



In [26]:
vectorizer = TfidfVectorizer(max_features = max_features)
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']


# Training

In [27]:
xgb = XGBClassifier(**param_grid)
xgb.fit(X, y)

XGBClassifier(max_depth=25, min_child_weight=3,
              scale_pos_weight=0.482171373481517, subsample=0.5)

In [29]:
print('Accuracy: ', accuracy_score(y, y_pred))
print('F1 Score: ', f1_score(y, y_pred))

Accuracy:  0.9308818639839678
F1 Score:  0.9469351202963889


#Prediction

In [31]:
df_test=pd.read_csv('/content/aws_vader_textblob_oct_result.csv')

In [32]:
df_test

Unnamed: 0,User,Date_Created,Follows_Count,Friends_Count,Retweet_Count,Language,Date_Tweet,Number_of_Likes,Source_of_Tweet,Tweet_Id,...,Conversation_Id,In_reply_To,Coordinates,Place,Date_Tweet1,lower,cleaned_text,Sentiment,vader,textblob
0,Europaetal,2022-03-22 21:23:26+00:00,31,56,0,en,2022-10-31 23:59:45+00:00,0,Twitter Web App,1587232893734207488,...,1587231188447535105,https://twitter.com/Europaetal,,,2022-10-31 23:59:45+00:00,"""that doesn't mean[..]that #russia can't splin...",that meanthat cant splinterthe humaneconomeffe...,NEGATIVE,NEGATIVE,NEUTRAL
1,marra_ua,2022-08-08 11:41:38+00:00,44,5,1,en,2022-10-31 23:57:36+00:00,3,Twitter for Android,1587232353021214722,...,1587232353021214722,,,,2022-10-31 23:57:36+00:00,ukraine starts work on signing declarations wi...,ukraine starts work signing declarations members,NEUTRAL,NEGATIVE,NEUTRAL
2,Starbright489,2019-07-27 03:10:29+00:00,211,3265,0,en,2022-10-31 23:49:24+00:00,0,Twitter for iPad,1587230288144465922,...,1587230281777520643,https://twitter.com/Starbright489,,,2022-10-31 23:49:24+00:00,"part 2 of 2/\nnov 3, q&amp;a: #west #misconcep...",part nov qa theyll discuss still exclusive...,NEUTRAL,NEGATIVE,NEUTRAL
3,geopol_monitor,2013-08-22 09:51:47+00:00,2998,315,0,en,2022-10-31 23:49:18+00:00,0,Twitter Web App,1587230264295424001,...,1587230264295424001,,,,2022-10-31 23:49:18+00:00,"lula's top foreign policy advisor: ""lula oppos...",lulas top foreign policy advisor lula opposes ...,NEUTRAL,NEGATIVE,POSITIVE
4,PCiesa,2021-01-07 01:13:39+00:00,16,210,0,en,2022-10-31 23:46:43+00:00,1,Twitter Web App,1587229612652003328,...,1587229612652003328,,,,2022-10-31 23:46:43+00:00,what is striking is ukraine wants this to be a...,striking ukraine wants visual media war whereu...,NEGATIVE,NEGATIVE,POSITIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32647,paroliro,2022-09-12 14:14:38+00:00,11,75,1,en,2022-10-01 00:07:05+00:00,3,Twitter Web App,1576000715402907648,...,1576000715402907648,,,,2022-10-01 00:07:05+00:00,new york city's russian embassy on the upper ...,new york citys russian embassy upper east side...,NEUTRAL,NEGATIVE,POSITIVE
32648,tvmario,2009-09-02 15:23:29+00:00,2408,2963,1,en,2022-10-01 00:06:48+00:00,3,Twitter Web App,1576000642430083073,...,1576000642430083073,,,,2022-10-01 00:06:48+00:00,the war in #ukraine is still going on because ...,war still going rich western world countries f...,NEGATIVE,NEGATIVE,POSITIVE
32649,Writing_Destiny,2009-04-28 10:18:39+00:00,2704,4989,0,en,2022-10-01 00:04:03+00:00,1,Twitter Web App,1575999950495698945,...,1575999950495698945,,,,2022-10-01 00:04:03+00:00,the russian people are ashamed. the fake anne...,russian people ashamed fake annexation list th...,NEGATIVE,NEGATIVE,POSITIVE
32650,besthealthyou,2013-07-06 19:54:34+00:00,450,66,0,en,2022-10-01 00:03:35+00:00,0,Twitter Web App,1575999832719708160,...,1575999832719708160,,,,2022-10-01 00:03:35+00:00,15% of the land that supposedly was under #ukr...,land supposedly part understood correctly go...,NEGATIVE,NEGATIVE,NEGATIVE


In [41]:
df_test=df_test.dropna(subset=['cleaned_text'])

In [42]:
df_test

Unnamed: 0,User,Date_Created,Follows_Count,Friends_Count,Retweet_Count,Language,Date_Tweet,Number_of_Likes,Source_of_Tweet,Tweet_Id,...,Conversation_Id,In_reply_To,Coordinates,Place,Date_Tweet1,lower,cleaned_text,Sentiment,vader,textblob
0,Europaetal,2022-03-22 21:23:26+00:00,31,56,0,en,2022-10-31 23:59:45+00:00,0,Twitter Web App,1587232893734207488,...,1587231188447535105,https://twitter.com/Europaetal,,,2022-10-31 23:59:45+00:00,"""that doesn't mean[..]that #russia can't splin...",that meanthat cant splinterthe humaneconomeffe...,NEGATIVE,NEGATIVE,NEUTRAL
1,marra_ua,2022-08-08 11:41:38+00:00,44,5,1,en,2022-10-31 23:57:36+00:00,3,Twitter for Android,1587232353021214722,...,1587232353021214722,,,,2022-10-31 23:57:36+00:00,ukraine starts work on signing declarations wi...,ukraine starts work signing declarations members,NEUTRAL,NEGATIVE,NEUTRAL
2,Starbright489,2019-07-27 03:10:29+00:00,211,3265,0,en,2022-10-31 23:49:24+00:00,0,Twitter for iPad,1587230288144465922,...,1587230281777520643,https://twitter.com/Starbright489,,,2022-10-31 23:49:24+00:00,"part 2 of 2/\nnov 3, q&amp;a: #west #misconcep...",part nov qa theyll discuss still exclusive...,NEUTRAL,NEGATIVE,NEUTRAL
3,geopol_monitor,2013-08-22 09:51:47+00:00,2998,315,0,en,2022-10-31 23:49:18+00:00,0,Twitter Web App,1587230264295424001,...,1587230264295424001,,,,2022-10-31 23:49:18+00:00,"lula's top foreign policy advisor: ""lula oppos...",lulas top foreign policy advisor lula opposes ...,NEUTRAL,NEGATIVE,POSITIVE
4,PCiesa,2021-01-07 01:13:39+00:00,16,210,0,en,2022-10-31 23:46:43+00:00,1,Twitter Web App,1587229612652003328,...,1587229612652003328,,,,2022-10-31 23:46:43+00:00,what is striking is ukraine wants this to be a...,striking ukraine wants visual media war whereu...,NEGATIVE,NEGATIVE,POSITIVE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32647,paroliro,2022-09-12 14:14:38+00:00,11,75,1,en,2022-10-01 00:07:05+00:00,3,Twitter Web App,1576000715402907648,...,1576000715402907648,,,,2022-10-01 00:07:05+00:00,new york city's russian embassy on the upper ...,new york citys russian embassy upper east side...,NEUTRAL,NEGATIVE,POSITIVE
32648,tvmario,2009-09-02 15:23:29+00:00,2408,2963,1,en,2022-10-01 00:06:48+00:00,3,Twitter Web App,1576000642430083073,...,1576000642430083073,,,,2022-10-01 00:06:48+00:00,the war in #ukraine is still going on because ...,war still going rich western world countries f...,NEGATIVE,NEGATIVE,POSITIVE
32649,Writing_Destiny,2009-04-28 10:18:39+00:00,2704,4989,0,en,2022-10-01 00:04:03+00:00,1,Twitter Web App,1575999950495698945,...,1575999950495698945,,,,2022-10-01 00:04:03+00:00,the russian people are ashamed. the fake anne...,russian people ashamed fake annexation list th...,NEGATIVE,NEGATIVE,POSITIVE
32650,besthealthyou,2013-07-06 19:54:34+00:00,450,66,0,en,2022-10-01 00:03:35+00:00,0,Twitter Web App,1575999832719708160,...,1575999832719708160,,,,2022-10-01 00:03:35+00:00,15% of the land that supposedly was under #ukr...,land supposedly part understood correctly go...,NEGATIVE,NEGATIVE,NEGATIVE


In [40]:

df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: word_tokenize(x))
df.head()

TypeError: ignored

In [43]:
X_test = vectorizer.transform(df_test['cleaned_text'])

In [44]:
y_pred = xgb.predict(X_test)

In [48]:
len(y_pred)

31650

In [49]:
len(df_test)

31650

In [46]:
df_test['xgb']=y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['xgb']=y_pred


In [47]:
df_

Unnamed: 0,User,Date_Created,Follows_Count,Friends_Count,Retweet_Count,Language,Date_Tweet,Number_of_Likes,Source_of_Tweet,Tweet_Id,...,In_reply_To,Coordinates,Place,Date_Tweet1,lower,cleaned_text,Sentiment,vader,textblob,xgb
0,Europaetal,2022-03-22 21:23:26+00:00,31,56,0,en,2022-10-31 23:59:45+00:00,0,Twitter Web App,1587232893734207488,...,https://twitter.com/Europaetal,,,2022-10-31 23:59:45+00:00,"""that doesn't mean[..]that #russia can't splin...",that meanthat cant splinterthe humaneconomeffe...,NEGATIVE,NEGATIVE,NEUTRAL,0
1,marra_ua,2022-08-08 11:41:38+00:00,44,5,1,en,2022-10-31 23:57:36+00:00,3,Twitter for Android,1587232353021214722,...,,,,2022-10-31 23:57:36+00:00,ukraine starts work on signing declarations wi...,ukraine starts work signing declarations members,NEUTRAL,NEGATIVE,NEUTRAL,0
2,Starbright489,2019-07-27 03:10:29+00:00,211,3265,0,en,2022-10-31 23:49:24+00:00,0,Twitter for iPad,1587230288144465922,...,https://twitter.com/Starbright489,,,2022-10-31 23:49:24+00:00,"part 2 of 2/\nnov 3, q&amp;a: #west #misconcep...",part nov qa theyll discuss still exclusive...,NEUTRAL,NEGATIVE,NEUTRAL,0
3,geopol_monitor,2013-08-22 09:51:47+00:00,2998,315,0,en,2022-10-31 23:49:18+00:00,0,Twitter Web App,1587230264295424001,...,,,,2022-10-31 23:49:18+00:00,"lula's top foreign policy advisor: ""lula oppos...",lulas top foreign policy advisor lula opposes ...,NEUTRAL,NEGATIVE,POSITIVE,0
4,PCiesa,2021-01-07 01:13:39+00:00,16,210,0,en,2022-10-31 23:46:43+00:00,1,Twitter Web App,1587229612652003328,...,,,,2022-10-31 23:46:43+00:00,what is striking is ukraine wants this to be a...,striking ukraine wants visual media war whereu...,NEGATIVE,NEGATIVE,POSITIVE,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32647,paroliro,2022-09-12 14:14:38+00:00,11,75,1,en,2022-10-01 00:07:05+00:00,3,Twitter Web App,1576000715402907648,...,,,,2022-10-01 00:07:05+00:00,new york city's russian embassy on the upper ...,new york citys russian embassy upper east side...,NEUTRAL,NEGATIVE,POSITIVE,0
32648,tvmario,2009-09-02 15:23:29+00:00,2408,2963,1,en,2022-10-01 00:06:48+00:00,3,Twitter Web App,1576000642430083073,...,,,,2022-10-01 00:06:48+00:00,the war in #ukraine is still going on because ...,war still going rich western world countries f...,NEGATIVE,NEGATIVE,POSITIVE,0
32649,Writing_Destiny,2009-04-28 10:18:39+00:00,2704,4989,0,en,2022-10-01 00:04:03+00:00,1,Twitter Web App,1575999950495698945,...,,,,2022-10-01 00:04:03+00:00,the russian people are ashamed. the fake anne...,russian people ashamed fake annexation list th...,NEGATIVE,NEGATIVE,POSITIVE,0
32650,besthealthyou,2013-07-06 19:54:34+00:00,450,66,0,en,2022-10-01 00:03:35+00:00,0,Twitter Web App,1575999832719708160,...,,,,2022-10-01 00:03:35+00:00,15% of the land that supposedly was under #ukr...,land supposedly part understood correctly go...,NEGATIVE,NEGATIVE,NEGATIVE,0


In [50]:
df_test.reset_index()

Unnamed: 0,index,User,Date_Created,Follows_Count,Friends_Count,Retweet_Count,Language,Date_Tweet,Number_of_Likes,Source_of_Tweet,...,In_reply_To,Coordinates,Place,Date_Tweet1,lower,cleaned_text,Sentiment,vader,textblob,xgb
0,0,Europaetal,2022-03-22 21:23:26+00:00,31,56,0,en,2022-10-31 23:59:45+00:00,0,Twitter Web App,...,https://twitter.com/Europaetal,,,2022-10-31 23:59:45+00:00,"""that doesn't mean[..]that #russia can't splin...",that meanthat cant splinterthe humaneconomeffe...,NEGATIVE,NEGATIVE,NEUTRAL,0
1,1,marra_ua,2022-08-08 11:41:38+00:00,44,5,1,en,2022-10-31 23:57:36+00:00,3,Twitter for Android,...,,,,2022-10-31 23:57:36+00:00,ukraine starts work on signing declarations wi...,ukraine starts work signing declarations members,NEUTRAL,NEGATIVE,NEUTRAL,0
2,2,Starbright489,2019-07-27 03:10:29+00:00,211,3265,0,en,2022-10-31 23:49:24+00:00,0,Twitter for iPad,...,https://twitter.com/Starbright489,,,2022-10-31 23:49:24+00:00,"part 2 of 2/\nnov 3, q&amp;a: #west #misconcep...",part nov qa theyll discuss still exclusive...,NEUTRAL,NEGATIVE,NEUTRAL,0
3,3,geopol_monitor,2013-08-22 09:51:47+00:00,2998,315,0,en,2022-10-31 23:49:18+00:00,0,Twitter Web App,...,,,,2022-10-31 23:49:18+00:00,"lula's top foreign policy advisor: ""lula oppos...",lulas top foreign policy advisor lula opposes ...,NEUTRAL,NEGATIVE,POSITIVE,0
4,4,PCiesa,2021-01-07 01:13:39+00:00,16,210,0,en,2022-10-31 23:46:43+00:00,1,Twitter Web App,...,,,,2022-10-31 23:46:43+00:00,what is striking is ukraine wants this to be a...,striking ukraine wants visual media war whereu...,NEGATIVE,NEGATIVE,POSITIVE,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31645,32647,paroliro,2022-09-12 14:14:38+00:00,11,75,1,en,2022-10-01 00:07:05+00:00,3,Twitter Web App,...,,,,2022-10-01 00:07:05+00:00,new york city's russian embassy on the upper ...,new york citys russian embassy upper east side...,NEUTRAL,NEGATIVE,POSITIVE,0
31646,32648,tvmario,2009-09-02 15:23:29+00:00,2408,2963,1,en,2022-10-01 00:06:48+00:00,3,Twitter Web App,...,,,,2022-10-01 00:06:48+00:00,the war in #ukraine is still going on because ...,war still going rich western world countries f...,NEGATIVE,NEGATIVE,POSITIVE,0
31647,32649,Writing_Destiny,2009-04-28 10:18:39+00:00,2704,4989,0,en,2022-10-01 00:04:03+00:00,1,Twitter Web App,...,,,,2022-10-01 00:04:03+00:00,the russian people are ashamed. the fake anne...,russian people ashamed fake annexation list th...,NEGATIVE,NEGATIVE,POSITIVE,0
31648,32650,besthealthyou,2013-07-06 19:54:34+00:00,450,66,0,en,2022-10-01 00:03:35+00:00,0,Twitter Web App,...,,,,2022-10-01 00:03:35+00:00,15% of the land that supposedly was under #ukr...,land supposedly part understood correctly go...,NEGATIVE,NEGATIVE,NEGATIVE,0


In [52]:
df_test.to_csv("aws_vader_textblob_xgb" + "_" + "oct" + "_" + "result" + ".csv", index=False)