# PROJECT-1 SENTIMENT ANALYSIS OF TWITTER TWEETS

#### IMPORTING NLTK FOR NLP

In [44]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
import re

#### LOADING THE TRAINING AND TESTING DATASETS

In [45]:
df_train = pd.read_csv("twitter_training.csv", names=['Tweet ID','Game Name','Label','Text'])
df_test = pd.read_csv("twitter_validation.csv", names=['Tweet ID','Game Name','Label','Text'])

In [46]:
df_train.head()

Unnamed: 0,Tweet ID,Game Name,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [47]:
df_test.head()

Unnamed: 0,Tweet ID,Game Name,Label,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [48]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Tweet ID   74682 non-null  int64 
 1   Game Name  74682 non-null  object
 2   Label      74682 non-null  object
 3   Text       73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [49]:
df_train.isna().sum()

Tweet ID       0
Game Name      0
Label          0
Text         686
dtype: int64

In [50]:
df_test.isna().sum()

Tweet ID     0
Game Name    0
Label        0
Text         0
dtype: int64

#### DROPPING NA VALUES

In [51]:
# it is not possible to fill up empty tweets, so lets drop these rows
df_train.dropna(axis=0, inplace=True)
df_train.isna().sum()

Tweet ID     0
Game Name    0
Label        0
Text         0
dtype: int64

In [52]:
df_train.shape

(73996, 4)

In [53]:
df_test.shape

(1000, 4)

#### DROPPING USELESS COLUMNS

In [54]:
# DROP USELESS COLUMNS
df_train.drop(columns=['Tweet ID','Game Name'], inplace=True)
df_test.drop(columns=['Tweet ID','Game Name'], inplace=True)

#### GROUPING THE DATA ON LABEL VALUES

##### FOR TRAINING DATA

In [55]:
df_train['Label'].value_counts()

Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: Label, dtype: int64

In [56]:
# SINCE WE REGARD IRRELEVANT CLASSES AS NEUTRAL, LETS CHANGE SENTIMENT TO NEUTRAL FOR THEM
df_train['Label'].replace(['Irrelevant'],['Neutral'], inplace=True)
df_train['Label'].value_counts()

Neutral     30983
Negative    22358
Positive    20655
Name: Label, dtype: int64

##### FOR TESTING DATA

In [57]:
df_test['Label'].value_counts()

Neutral       285
Positive      277
Negative      266
Irrelevant    172
Name: Label, dtype: int64

In [58]:
# REPLACING IRRELEVANT WITH NEUTRAL
df_test['Label'].replace(['Irrelevant'],['Neutral'], inplace=True)
df_test['Label'].value_counts()

Neutral     457
Positive    277
Negative    266
Name: Label, dtype: int64

#### CREATING A FUNCTION TO PREPROCESS TEXT IN TEXT COLUMN

In [59]:
def preprocess(text):
    # string and regex preprocessing

    # converting text to lower
    text = text.lower()

    # removing only digits
    text = re.sub('\d','',text)

    # removing @ words
    text = re.sub(r'@\s?\w+','',text)

    # removing links from text
    text = re.sub('https?://\S+|www\.\S+','',text)
    text = re.sub(r'\spic\.twitter\.com\s*/\s*\S+','',text)
    text = re.sub(r"dlvr\.it\s*/\s*\S+|dfr\.it\s*/\s*\S+", "", text)
    text = re.sub(r'\spic\.wikipedia\.org\s*/\s*\S+','',text)
    
    # removing special characters
    text = re.sub('\[.*?\,]','',text)

    # remove punctuation
    text = re.sub('[^a-zA-Z0-9\s]+', '', text)

    # remove b2c type words (words containing numbers)
    text = re.sub('\w\d\w','',text)

    # nltk preprocessing
    words = word_tokenize(text)
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    words_cleaned = [word for word in words if not word in stop_words]

    for i in range(0,len(words_cleaned)-1):
        #stemming the words
        words_cleaned[i]= ps.stem(words_cleaned[i])

    text = ' '.join(words_cleaned)  
    
    # removing extra whitespaces if any
    text = re.sub('\s+',' ',text).strip()
    return text

## PREPROCESSING THE DATA

In [60]:
df_train['Text'] = df_train['Text'].apply(preprocess)
df_test['Text'] = df_test['Text'].apply(preprocess)

In [61]:
df_train.head()

Unnamed: 0,Label,Text
0,Positive,im get borderland murder
1,Positive,come border kill
2,Positive,im get borderland kill
3,Positive,im come borderland murder
4,Positive,im get borderland murder


In [62]:
df_test.head()

Unnamed: 0,Label,Text
0,Neutral,mention facebook struggl motiv go run day tran...
1,Neutral,bbc news amazon boss jeff bezo reject claim co...
2,Negative,pay word function poorli chromebook
3,Negative,csgo matchmak full closet hack truli aw game
4,Neutral,presid slap american face realli commit unlaw ...


#### ONE HOT ENCODING MANUALLY FOR THE LABEL VALUES

In [63]:
df_train['Label'].replace(['Neutral','Positive','Negative'],[0, 1, -1], inplace=True)
df_test['Label'].replace(['Neutral','Positive','Negative'],[0, 1, -1], inplace=True)

In [64]:
df_train["Label"].unique()

array([ 1,  0, -1], dtype=int64)

In [65]:
df_test["Label"].unique()

array([ 0, -1,  1], dtype=int64)

#### IMPORTING NECESSARY LIBRARIES

In [66]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import sklearn
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#### DEFINING THE TRAINING AND TESTING DATASETS

In [67]:
X = df_train['Text']
y = df_train["Label"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.05, random_state=1)

X_testfinal = df_test['Text']
y_testfinal = df_test['Label']

#### CONVERTING TEXT COLUMN INTO NUMERICAL FORMAT FOR MACHINE LEARNING

In [68]:
vector = TfidfVectorizer().fit(X)

X_train_vectorized = vector.transform(X_train)
X_test_vectorized = vector.transform(X_test)
X_testfinal_vectorized = vector.transform(X_testfinal)

## CREATING A PIPELINE AND SCORING METHOD

In [69]:
pipe = Pipeline([
    ('model', None)
])
score = ['accuracy']

### LOGISTIC REGRESSION MODEL

In [None]:
param_grid_lr = [{
    'model' : [LogisticRegression()],
    'model__penalty': ['l1', 'l2'],  
    'model__C': [10,20,30,40,100],  
    'model__solver': ['liblinear'] 
}]

grid_search_lr = GridSearchCV(pipe, param_grid=param_grid_lr, scoring=score, cv=30, refit='accuracy')
grid_search_lr.fit(X_train_vectorized, y_train)

In [73]:
print(grid_search_lr.best_estimator_)
print(grid_search_lr.best_score_)
print(grid_search_lr.best_params_)
pd.DataFrame(grid_search_lr.cv_results_).head()

Pipeline(steps=[('model', LogisticRegression(C=100, solver='liblinear'))])
0.847487881749767
{'model': LogisticRegression(C=100, solver='liblinear'), 'model__C': 100, 'model__penalty': 'l2', 'model__solver': 'liblinear'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__C,param_model__penalty,param_model__solver,params,split0_test_accuracy,...,split23_test_accuracy,split24_test_accuracy,split25_test_accuracy,split26_test_accuracy,split27_test_accuracy,split28_test_accuracy,split29_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,1.816621,0.181421,0.001623,0.001212,"LogisticRegression(C=100, solver='liblinear')",10,l1,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.834471,...,0.8344,0.840376,0.832693,0.840802,0.833547,0.840802,0.828852,0.839763,0.006572,5
1,3.076898,0.537444,0.002126,0.002747,"LogisticRegression(C=100, solver='liblinear')",10,l2,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.826365,...,0.838242,0.837388,0.827145,0.833547,0.829279,0.837815,0.820316,0.834301,0.007705,9
2,2.127244,0.274367,0.001031,0.003066,"LogisticRegression(C=100, solver='liblinear')",20,l1,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.836604,...,0.828852,0.84251,0.836961,0.839095,0.840376,0.833547,0.83184,0.839166,0.006201,6
3,4.32061,0.578385,0.001886,0.00338,"LogisticRegression(C=100, solver='liblinear')",20,l2,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.830205,...,0.843363,0.844217,0.835254,0.841229,0.840376,0.839522,0.833547,0.842367,0.007717,4
4,2.353102,0.278512,0.001228,0.000951,"LogisticRegression(C=100, solver='liblinear')",30,l1,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.836604,...,0.828852,0.845924,0.830986,0.839522,0.83312,0.827998,0.832693,0.837814,0.006168,7


### NAIVE BAYES MODEL

In [None]:
param_grid_nb = [{
    'model': [BernoulliNB()],
    'model__alpha': [0.1, 0.5, 1.0, 10.0, 100.0],
    'model__binarize': [None, 0.0, 0.5],
    'model__fit_prior': [True, False],
}]

grid_search_nb = GridSearchCV(pipe, param_grid=param_grid_nb, scoring=score, cv=50, refit='accuracy')
grid_search_nb.fit(X_train_vectorized, y_train)

In [71]:
print(grid_search_nb.best_estimator_)
print(grid_search_nb.best_score_)
print(grid_search_nb.best_params_)
pd.DataFrame(grid_search_nb.cv_results_).head()

Pipeline(steps=[('model', BernoulliNB(alpha=0.1, binarize=None))])
0.799917111717449
{'model': BernoulliNB(alpha=0.1, binarize=None), 'model__alpha': 0.1, 'model__binarize': None, 'model__fit_prior': True}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__alpha,param_model__binarize,param_model__fit_prior,params,split0_test_accuracy,...,split43_test_accuracy,split44_test_accuracy,split45_test_accuracy,split46_test_accuracy,split47_test_accuracy,split48_test_accuracy,split49_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,0.026347,0.005082,0.003196,0.001535,"BernoulliNB(alpha=0.1, binarize=None)",0.1,,True,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.778094,...,0.798009,0.793741,0.788051,0.790747,0.7879,0.796441,0.797865,0.799917,0.009211,1
1,0.020741,0.00191,0.002662,0.000621,"BernoulliNB(alpha=0.1, binarize=None)",0.1,,False,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.776671,...,0.792319,0.777383,0.770982,0.777936,0.785765,0.775801,0.793594,0.785237,0.009587,2
2,0.029128,0.003167,0.002681,0.000562,"BernoulliNB(alpha=0.1, binarize=None)",0.1,0.0,True,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.766003,...,0.774538,0.763158,0.758179,0.762989,0.770819,0.767972,0.779359,0.771125,0.009195,5
3,0.027492,0.000963,0.002773,0.000544,"BernoulliNB(alpha=0.1, binarize=None)",0.1,0.0,False,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.753201,...,0.767425,0.761024,0.751778,0.75516,0.767972,0.758007,0.773665,0.7635,0.009317,7
4,0.027358,0.000992,0.002763,0.000587,"BernoulliNB(alpha=0.1, binarize=None)",0.1,0.5,True,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.593172,...,0.592461,0.616643,0.59744,0.603559,0.572954,0.600712,0.607829,0.599693,0.012567,13


### RANDOM FOREST CLASSIFIER

In [None]:
param_grid_rf = [{
    'model': [RandomForestClassifier()],
    'model__n_estimators': [None, 100,200],
    'model__max_depth': [None],
}]

grid_search_rf = GridSearchCV(pipe, param_grid=param_grid_rf, scoring=score, cv=2,refit='accuracy')
grid_search_rf.fit(X_train_vectorized, y_train)

In [75]:
print(grid_search_rf.best_estimator_)
print(grid_search_rf.best_score_)
print(grid_search_rf.best_params_)
pd.DataFrame(grid_search_rf.cv_results_).head()

Pipeline(steps=[('model', RandomForestClassifier(n_estimators=200))])
0.8715005121201775
{'model': RandomForestClassifier(n_estimators=200), 'model__max_depth': None, 'model__n_estimators': 200}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__max_depth,param_model__n_estimators,params,split0_test_accuracy,split1_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,386.265844,46.662919,3.674854,0.094804,RandomForestClassifier(n_estimators=200),,100,{'model': RandomForestClassifier(n_estimators=...,0.867987,0.866621,0.867304,0.000683,2
1,718.792021,5.883594,7.687275,0.180541,RandomForestClassifier(n_estimators=200),,200,{'model': RandomForestClassifier(n_estimators=...,0.872226,0.870775,0.871501,0.000726,1
2,3.045245,0.103063,0.28396,0.009419,RandomForestClassifier(n_estimators=200),5.0,100,{'model': RandomForestClassifier(n_estimators=...,0.420337,0.418175,0.419256,0.001081,3
3,5.654099,0.062845,0.541819,0.036461,RandomForestClassifier(n_estimators=200),5.0,200,{'model': RandomForestClassifier(n_estimators=...,0.419682,0.418317,0.419,0.000683,4


### GRADIENT BOOSTING CLASSIFIER

In [None]:
param_grid_gb = [{
    'model': [GradientBoostingClassifier()],
    'model__learning_rate': [0.1, 0.05],
    'model__n_estimators': [100, 200]
}]

grid_search_gb = GridSearchCV(pipe, param_grid=param_grid_gb, scoring=score, cv=2, refit='accuracy')
grid_search_gb.fit(X_train_vectorized, y_train)

In [78]:
print(grid_search_gb.best_estimator_)
print(grid_search_gb.best_score_)
print(grid_search_gb.best_params_)
pd.DataFrame(grid_search_gb.cv_results_).head()

Pipeline(steps=[('model', GradientBoostingClassifier(n_estimators=200))])
0.6089677933310573
{'model': GradientBoostingClassifier(n_estimators=200), 'model__learning_rate': 0.1, 'model__n_estimators': 200}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__learning_rate,param_model__n_estimators,params,split0_test_accuracy,split1_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,239.863303,0.696869,0.121361,0.005039,GradientBoostingClassifier(n_estimators=200),0.1,100,{'model': GradientBoostingClassifier(n_estimat...,0.578667,0.573774,0.576221,0.002447,2
1,497.805838,8.94846,0.303553,0.003573,GradientBoostingClassifier(n_estimators=200),0.1,200,{'model': GradientBoostingClassifier(n_estimat...,0.609793,0.608143,0.608968,0.000825,1
2,251.128856,2.699944,0.126179,0.001179,GradientBoostingClassifier(n_estimators=200),0.05,100,{'model': GradientBoostingClassifier(n_estimat...,0.543047,0.539433,0.54124,0.001807,4
3,524.144358,25.61556,0.291317,0.003318,GradientBoostingClassifier(n_estimators=200),0.05,200,{'model': GradientBoostingClassifier(n_estimat...,0.575424,0.573347,0.574385,0.001038,3


#### DICTIONARY FOR MODEL NAME WITH MODEL

In [80]:
model = {
    'LogisticRegression' : grid_search_lr,
    'Naive Bayes' : grid_search_nb,
    'Random Forest Classifier' : grid_search_rf,
    'Gradient Boosting Classifier' : grid_search_gb
}

#### PREDICTING ON THE TESTING DATA

In [83]:
print("Accuracy for different models on Test Data from splitting is :")
print('-'*70)
for grid_type, grid in model.items():
    y_predict = grid.predict(X_test_vectorized)
    print(f"Accuracy of {grid_type} is {accuracy_score(y_test, y_predict)*100} %")
print('-'*70)

Accuracy for different models on Test Data from splitting is :
----------------------------------------------------------------------
Accuracy of LogisticRegression is 84.70270270270271 %
Accuracy of Naive Bayes is 78.91891891891892 %
Accuracy of Random Forest Classifier is 92.7027027027027 %
Accuracy of Gradient Boosting Classifier is 60.75675675675676 %
----------------------------------------------------------------------


In [84]:
print("Accuracy for different models on Actual Testing Data is :")
print('-'*70)
for grid_type, grid in model.items():
    y_predict = grid.predict(X_testfinal_vectorized)
    print(f"Accuracy of {grid_type} is {accuracy_score(y_testfinal, y_predict)*100} %")
print('-'*70)

Accuracy for different models on Actual Testing Data is :
----------------------------------------------------------------------
Accuracy of LogisticRegression is 93.5 %
Accuracy of Naive Bayes is 87.0 %
Accuracy of Random Forest Classifier is 97.1 %
Accuracy of Gradient Boosting Classifier is 65.60000000000001 %
----------------------------------------------------------------------


#### THE FINAL HIGHEST ACCURACY IS 97.1 % FOR RANDOM FOREST CLASSIFIER MODEL 
#### ALTHOUGH AT TRAINING, IT ONLY HAD 87 % ACCURACY, BUT AS THE PREDICTION DATASET IS VERY SMALL, THE ACCURACY HAS BECOME HIGHER