# PROJECT-1 SENTIMENT ANALYSIS OF TWITTER TWEETS

#### IMPORTING NLTK FOR NLP

In [63]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
import re

#### LOADING THE TRAINING AND TESTING DATASETS

In [64]:
df_train = pd.read_csv("twitter_training.csv", names=['Tweet ID','Game Name','Label','Text'])
df_test = pd.read_csv("twitter_validation.csv", names=['Tweet ID','Game Name','Label','Text'])

In [65]:
df_train.head()

Unnamed: 0,Tweet ID,Game Name,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [66]:
df_test.head()

Unnamed: 0,Tweet ID,Game Name,Label,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [67]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Tweet ID   74682 non-null  int64 
 1   Game Name  74682 non-null  object
 2   Label      74682 non-null  object
 3   Text       73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [68]:
df_train.isna().sum()

Tweet ID       0
Game Name      0
Label          0
Text         686
dtype: int64

In [69]:
df_test.isna().sum()

Tweet ID     0
Game Name    0
Label        0
Text         0
dtype: int64

#### DROPPING NA VALUES

In [70]:
# it is not possible to fill up empty tweets, so lets drop these rows
df_train.dropna(axis=0, inplace=True)
df_train.isna().sum()

Tweet ID     0
Game Name    0
Label        0
Text         0
dtype: int64

In [71]:
df_train.shape

(73996, 4)

In [72]:
df_test.shape

(1000, 4)

#### DROPPING USELESS COLUMNS

In [73]:
# DROP USELESS COLUMNS
df_train.drop(columns=['Tweet ID','Game Name'], inplace=True)
df_test.drop(columns=['Tweet ID','Game Name'], inplace=True)

#### GROUPING THE DATA ON LABEL VALUES

##### FOR TRAINING DATA

In [74]:
df_train['Label'].value_counts()

Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: Label, dtype: int64

In [75]:
# SINCE WE REGARD IRRELEVANT CLASSES AS NEUTRAL, LETS CHANGE SENTIMENT TO NEUTRAL FOR THEM
df_train['Label'].replace(['Irrelevant'],['Neutral'], inplace=True)
df_train['Label'].value_counts()

Neutral     30983
Negative    22358
Positive    20655
Name: Label, dtype: int64

##### FOR TESTING DATA

In [76]:
df_test['Label'].value_counts()

Neutral       285
Positive      277
Negative      266
Irrelevant    172
Name: Label, dtype: int64

In [77]:
# REPLACING IRRELEVANT WITH NEUTRAL
df_test['Label'].replace(['Irrelevant'],['Neutral'], inplace=True)
df_test['Label'].value_counts()

Neutral     457
Positive    277
Negative    266
Name: Label, dtype: int64

#### CREATING A FUNCTION TO PREPROCESS TEXT IN TEXT COLUMN

In [78]:
def preprocess(text):
    # string and regex preprocessing

    # converting text to lower
    text = text.lower()

    # removing only digits
    text = re.sub('\d','',text)

    # removing @ words
    text = re.sub(r'@\s?\w+','',text)

    # removing links from text
    text = re.sub(r'\spic\.twitter\.com\s*/\s*\S+','',text)
    text = re.sub(r"dlvr\.it\s*/\s*\S+|dfr\.it\s*/\s*\S+", "", text)
    text = re.sub(r'\spic\.wikipedia\.org\s*/\s*\S+','',text)
    text = re.sub('https?://\S+|www\.\S+','',text)

    # removing special characters
    text = re.sub('\[.*?\,]','',text)

    # remove punctuation
    text = re.sub('[^a-zA-Z0-9\s]+', '', text)

    # remove a9c type words (words containing numbers)
    text = re.sub('\w\d\w','',text)

    # nltk preprocessing
    words = word_tokenize(text)

    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    for w in words:
        if w not in stop_words:
            # stemming the words
            w = ps.stem(w)
            text = ' '.join(words)
        
    # removing extra whitespaces if any
    text = re.sub('\s+',' ',text).strip()

    return text

## PREPROCESSING THE DATA

In [79]:
df_train['Text'] = df_train['Text'].apply(preprocess)
df_test['Text'] = df_test['Text'].apply(preprocess)

In [80]:
df_train.head()

Unnamed: 0,Label,Text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,i am coming to the borders and i will kill you...
2,Positive,im getting on borderlands and i will kill you all
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands and i will murder yo...


In [81]:
df_test.head()

Unnamed: 0,Label,Text
0,Neutral,i mentioned on facebook that i was struggling ...
1,Neutral,bbc news amazon boss jeff bezos rejects claims...
2,Negative,why do i pay for word when it functions so poo...
3,Negative,csgo matchmaking is so full of closet hacking ...
4,Neutral,now the president is slapping americans in the...


#### ONE HOT ENCODING MANUALLY FOR THE LABEL VALUES

In [82]:
df_train['Label'].replace(['Neutral','Positive','Negative'],[0, 1, -1], inplace=True)
df_test['Label'].replace(['Neutral','Positive','Negative'],[0, 1, -1], inplace=True)

In [83]:
df_train["Label"].unique()

array([ 1,  0, -1], dtype=int64)

In [84]:
df_test["Label"].unique()

array([ 0, -1,  1], dtype=int64)

#### IMPORTING NECESSARY LIBRARIES

In [85]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import sklearn
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#### DEFINING THE TRAINING AND TESTING DATASETS

In [86]:
X = df_train['Text']
y = df_train["Label"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.05, random_state=1)

X_testfinal = df_test['Text']
y_testfinal = df_test['Label']

#### CONVERTING TEXT COLUMN INTO NUMERICAL FORMAT FOR MACHINE LEARNING

In [87]:
vector = TfidfVectorizer().fit(X)

X_train_vectorized = vector.transform(X_train)
X_test_vectorized = vector.transform(X_test)
X_testfinal_vectorized = vector.transform(X_testfinal)

## CREATING A PIPELINE AND SCORING METHOD

In [88]:
pipe = Pipeline([
    ('model', None)
])
score = ['accuracy']

### LOGISTIC REGRESSION MODEL

In [59]:
param_grid_lr = [{
    'model' : [LogisticRegression()],
    'model__penalty': ['l1', 'l2'],  
    'model__C': [10,20,30,40,100],  
    'model__solver': ['liblinear'] 
}]

grid_search_lr = GridSearchCV(pipe, param_grid=param_grid_lr, scoring=score, cv=30, refit='accuracy')
grid_search_lr.fit(X_train_vectorized, y_train)

In [60]:
print(grid_search_lr.best_estimator_)
print(grid_search_lr.best_score_)
print(grid_search_lr.best_params_)
pd.DataFrame(grid_search_lr.cv_results_).head()

Pipeline(steps=[('model', LogisticRegression(C=100, solver='liblinear'))])
0.8641885494370712
{'model': LogisticRegression(C=100, solver='liblinear'), 'model__C': 100, 'model__penalty': 'l2', 'model__solver': 'liblinear'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__C,param_model__penalty,param_model__solver,params,split0_test_accuracy,...,split23_test_accuracy,split24_test_accuracy,split25_test_accuracy,split26_test_accuracy,split27_test_accuracy,split28_test_accuracy,split29_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,2.856523,0.29388,0.000853,0.000748,"LogisticRegression(C=100, solver='liblinear')",10,l1,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.847696,...,0.847204,0.849338,0.852326,0.855314,0.854033,0.849338,0.850619,0.855482,0.005696,5
1,4.96431,0.43156,0.002391,0.002601,"LogisticRegression(C=100, solver='liblinear')",10,l2,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.836604,...,0.851472,0.850192,0.851472,0.854887,0.84251,0.848058,0.843363,0.851727,0.006783,8
2,3.482688,0.397545,0.001565,0.00123,"LogisticRegression(C=100, solver='liblinear')",20,l1,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.850683,...,0.846351,0.84507,0.851046,0.852753,0.853606,0.849765,0.846351,0.854885,0.005895,6
3,5.895729,0.532427,0.002264,0.004685,"LogisticRegression(C=100, solver='liblinear')",20,l2,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.848123,...,0.855314,0.862143,0.859155,0.860862,0.85446,0.856594,0.853606,0.859693,0.005945,4
4,3.784069,0.561873,0.001072,0.001291,"LogisticRegression(C=100, solver='liblinear')",30,l1,liblinear,"{'model': LogisticRegression(C=100, solver='li...",0.853242,...,0.839522,0.84251,0.850619,0.848912,0.849765,0.847204,0.846778,0.853135,0.006472,7


### NAIVE BAYES MODEL

In [57]:
param_grid_nb = [{
    'model': [BernoulliNB()],
    'model__alpha': [0.1, 0.5, 1.0, 10.0, 100.0],
    'model__binarize': [None, 0.0, 0.5],
    'model__fit_prior': [True, False],
}]

grid_search_nb = GridSearchCV(pipe, param_grid=param_grid_nb, scoring=score, cv=50, refit='accuracy')
grid_search_nb.fit(X_train_vectorized, y_train)

In [58]:
print(grid_search_nb.best_estimator_)
print(grid_search_nb.best_score_)
print(grid_search_nb.best_params_)
pd.DataFrame(grid_search_nb.cv_results_).head()

Pipeline(steps=[('model', BernoulliNB(alpha=0.1, binarize=None))])
0.8120801142029838
{'model': BernoulliNB(alpha=0.1, binarize=None), 'model__alpha': 0.1, 'model__binarize': None, 'model__fit_prior': True}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__alpha,param_model__binarize,param_model__fit_prior,params,split0_test_accuracy,...,split43_test_accuracy,split44_test_accuracy,split45_test_accuracy,split46_test_accuracy,split47_test_accuracy,split48_test_accuracy,split49_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy
0,0.028659,0.004174,0.003849,0.001202,"BernoulliNB(alpha=0.1, binarize=None)",0.1,,True,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.799431,...,0.815789,0.818634,0.807966,0.809253,0.814235,0.804982,0.803559,0.81208,0.008758,1
1,0.028776,0.004473,0.004196,0.001939,"BernoulliNB(alpha=0.1, binarize=None)",0.1,,False,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.791607,...,0.802276,0.799431,0.782361,0.802135,0.804982,0.785053,0.792883,0.79757,0.008655,2
2,0.043633,0.007755,0.00416,0.001097,"BernoulliNB(alpha=0.1, binarize=None)",0.1,0.0,True,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.773826,...,0.780228,0.766714,0.751067,0.772954,0.778648,0.755872,0.754448,0.768863,0.010746,6
3,0.039206,0.00578,0.004041,0.002987,"BernoulliNB(alpha=0.1, binarize=None)",0.1,0.0,False,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.762447,...,0.773115,0.761735,0.741821,0.768683,0.777224,0.747331,0.748043,0.761764,0.010356,8
4,0.031765,0.006625,0.006776,0.015193,"BernoulliNB(alpha=0.1, binarize=None)",0.1,0.5,True,"{'model': BernoulliNB(alpha=0.1, binarize=None...",0.554054,...,0.55192,0.581081,0.554054,0.56726,0.545907,0.570107,0.570819,0.562607,0.012427,13


### RANDOM FOREST CLASSIFIER

In [90]:
param_grid_rf = [{
    'model': [RandomForestClassifier()],
    #'model__n_estimators': [100, 200, 500],
    #'model__max_depth': [None, 5, 10],
}]

grid_search_rf = GridSearchCV(pipe, param_grid=param_grid_rf, scoring=score, cv=5, verbose=3,refit='accuracy')
grid_search_rf.fit(X_train_vectorized, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END model=RandomForestClassifier(); accuracy: (test=0.908) total time= 9.5min
[CV 2/5] END model=RandomForestClassifier(); accuracy: (test=0.915) total time= 9.5min
[CV 3/5] END model=RandomForestClassifier(); accuracy: (test=0.910) total time=10.0min
[CV 4/5] END model=RandomForestClassifier(); accuracy: (test=0.910) total time= 9.3min


In [None]:
print(grid_search_rf.best_estimator_)
print(grid_search_rf.best_score_)
print(grid_search_rf.best_params_)
pd.DataFrame(grid_search_rf.cv_results_).head()

### GRADIENT BOOSTING CLASSIFIER

In [None]:
param_grid_gb = [{
    'model': [GradientBoostingClassifier()],
    'model__learning_rate': [0.1, 0.05],
    'model__n_estimators': [100, 200, 500]
}]

grid_search_gb = GridSearchCV(pipe, param_grid=param_grid_gb, scoring=score, cv=30, refit='accuracy')
grid_search_gb.fit(X_train_vectorized, y_train)

In [None]:
print(grid_search_gb.best_estimator_)
print(grid_search_gb.best_score_)
print(grid_search_gb.best_params_)
pd.DataFrame(grid_search_gb.cv_results_).head()

#### DICTIONARY FOR MODEL NAME WITH MODEL

In [None]:
model = {
    'LogisticRegression' : grid_search_lr,
    'Support Vector Machine' : grid_search_svm,
    'Naive Bayes' : grid_search_nb,
    'Random Forest Classifier' : grid_search_rf,
    'Gradient Boosting Classifier' : grid_search_gb
}

#### PREDICTING ON THE TESTING DATA

In [None]:
print("Accuracy for different models is :")
print('-'*30)
for grid_type, grid in model.items():
    y_predict = grid.predict(X_test)
    print(f"Accuracy of {grid_type} is {accuracy_score(y_test, y_predict)*100} %")
print('-'*30)