In [220]:
import pandas as pd 
import numpy as np
import re
import preprocessor.api as p
from contractions import contractions_dict
from nltk.corpus import wordnet
import pattern
from pattern.en import suggest, lemma
import wordninja
from numpy import nan
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
from stopword import stopwords
from nltk.corpus import words
from tqdm._tqdm_notebook import tqdm_notebook
from spellchecker import SpellChecker
from ast import literal_eval
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors

In [259]:
def preprocessFeatures(data):
    data["Hashtags"] = data["OriginalTweet"].apply(lambda x: [', '.join(map(str, re.findall(r"#(\w+)", x))).lower()])
    data["HashtagsCount"] = data["OriginalTweet"].apply(lambda x: [len(re.findall(r"#(\w+)", x))])
    data["Mentions"] = data["OriginalTweet"].apply(lambda x: [len(re.findall(r"@(\w+)", x))])
    data["URLs"] = data["OriginalTweet"].apply(lambda x: [len(re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', x))])
    data["UpperCaseWords"] = data["OriginalTweet"].apply(lambda x: [len(re.findall(r"(\b[A-Z][A-Z]+|\b[A-Z]\b)", x))])
    data["PunctPeriod"] = data["OriginalTweet"].apply(lambda x: [len(re.findall(r"\.", x))])
    data["PunctExclamation"] = data["OriginalTweet"].apply(lambda x: [len(re.findall(r"\!", x))])
    data["PunctQuestion"] = data["OriginalTweet"].apply(lambda x: [len(re.findall(r"\?", x))])
    return data

# Word2Vec and general features
Predicting sentiment for tweets based on word2vec model and general features.

## Prepare trainingset

In [260]:
data = pd.read_csv("data/preprocess_train.csv", encoding = 'latin1') 
data = data.dropna()
data['tokens'] = data['tokens'].apply(literal_eval)

corpus = []
for item in data['tokens']:
    corpus.append(' '.join(item))

word2vec = KeyedVectors.load_word2vec_format('../MLworkshop Jeopardy/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=10 ** 5)
#build tf-idf vector
tfidf = TfidfVectorizer(min_df=3)
tfidf.fit(corpus)
feature_names = tfidf.get_feature_names()

def get_ifidf_for_words(text):
    tfidf_matrix= tfidf.transform([text]).todense()
    feature_index = tfidf_matrix[0,:].nonzero()[1]
    tfidf_scores = zip([feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index])
    return dict(tfidf_scores)

def w2vmean(l):
    text = ' '.join(l)
    tf_idf = get_ifidf_for_words(text)
    X1 = np.zeros((300,))
    for x in l:
        if x in word2vec and x in tf_idf:
            X1 += word2vec[x] * tf_idf[x]
    return X1

data['tokens'] = data['tokens'].apply(lambda x: w2vmean(x))
data = preprocessFeatures(data)

In [261]:
data.head()

Unnamed: 0,OriginalTweet,Sentiment,tokens,Hashtags,HashtagsCount,Mentions,URLs,UpperCaseWords,PunctPeriod,PunctExclamation,PunctQuestion
1,advice Talk to your neighbours family to excha...,Positive,"[-0.045217821723781526, -0.12957120189093985, ...",[],[0],[0],[0],[1],[0],[0],[0]
2,Coronavirus Australia: Woolworths to give elde...,Positive,"[0.1090496089309454, 0.11484042098163627, 0.10...",[],[0],[0],[1],[1],[1],[0],[0]
3,My food stock is not the only one which is emp...,Positive,"[0.18580312211997807, -0.07340790703892708, 0....","[covid19france, covid_19, covid19, coronavirus...",[7],[0],[1],[11],[6],[0],[0]
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"[0.27922972617670894, -0.03418381605297327, 0....","[covid19, coronavirus, coronavirusfrance, rest...",[6],[0],[1],[2],[7],[0],[0]
5,As news of the regionÃÂs first confirmed COV...,Positive,"[0.04783004254568368, 0.059689379995688796, -0...",[],[0],[1],[1],[1],[1],[0],[0]


In [262]:
X = []
y = [] 

for i in range(len(data)):
    tok = data['tokens'].iloc[i]
    has = data['HashtagsCount'].iloc[i]
    men = data['Mentions'].iloc[i]
    url = data['URLs'].iloc[i]
    upc = data['UpperCaseWords'].iloc[i]
    per = data['PunctPeriod'].iloc[i]
    exc = data['PunctExclamation'].iloc[i]
    que = data['PunctQuestion'].iloc[i]
    row = np.concatenate([tok, has, men, url, upc, per, exc, que])
    X.append(row)
    y.append(data['Sentiment'].iloc[i])
    
print(X[0])
print(y[0])

[-0.04521782 -0.1295712  -0.03599979 -0.03202558 -0.13110606 -0.0385706
  0.06007127 -0.19903495  0.48167129  0.26953838 -0.06201406 -0.1218441
 -0.13611413  0.12963183 -0.67325786  0.39216037  0.19548158  0.2820546
 -0.01657487 -0.2635157   0.54904745  0.11781592 -0.13391594  0.15330124
 -0.01461942 -0.07108847 -0.5629061   0.31965716  0.09751459 -0.02628797
 -0.19790317 -0.36990661 -0.30469168 -0.36872784 -0.28543786 -0.28384049
  0.35927944  0.00246197 -0.04935208  0.29575096  0.02854061 -0.03263297
  0.06597441 -0.14169426 -0.29342194 -0.32867927 -0.23560393  0.13801583
 -0.35742032 -0.11407956 -0.14610504 -0.12540382 -0.05014944 -0.14147434
 -0.03050272 -0.02898542 -0.09951725 -0.15503716 -0.16825065 -0.12854678
 -0.17628692 -0.17272498 -0.31981235  0.11415113 -0.0162946   0.37618858
 -0.54127505  0.42038736  0.02666528  0.2927442   0.03846314  0.01600092
  0.63625876  0.11625102 -0.58021663  0.05137449  0.23431435  0.44318428
  0.24013518 -0.03901059  0.23346466 -0.1976378   0.10

## Prepare testset

In [263]:
testdata = pd.read_csv("data/preprocess_test_final.csv", encoding = 'latin1') 
testdata = testdata.dropna()
testdata['tokens'] = testdata['tokens'].apply(literal_eval)
testdata['tokens'] = testdata['tokens'].apply(lambda x: w2vmean(x))
testdata = preprocessFeatures(testdata)

In [264]:
testdata.head()

Unnamed: 0,OriginalTweet,Sentiment,sent_token,tokens,Hashtags,HashtagsCount,Mentions,URLs,UpperCaseWords,PunctPeriod,PunctExclamation,PunctQuestion
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,['TRENDING: New Yorkers encounter empty superm...,"[0.1975125118624419, 0.16406913893297315, -0.0...",[coronavirus],[1],[0],[2],[1],[2],[0],[0]
1,When I couldn't find hand sanitizer at Fred Me...,Positive,"[""When I couldn't find hand sanitizer at Fred ...","[0.012639568885788321, 0.09953883814159781, -0...","[amazon, coronavirus]",[2],[0],[1],[2],[4],[2],[2]
2,Find out how you can protect yourself and love...,Extremely Positive,['Find out how you can protect yourself and lo...,"[-0.01927915937267244, -0.0742998868227005, -0...",[coronavirus],[1],[0],[0],[0],[1],[0],[1]
3,#Panic buying hits #NewYork City as anxious sh...,Negative,['#Panic buying hits #NewYork City as anxious ...,"[-0.16155412141233683, 0.04214186384342611, -0...","[panic, newyork, healthcare, bigapple, coronav...",[11],[0],[2],[6],[2],[0],[1]
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,['#toiletpaper #dunnypaper #coronavirus #coron...,"[0.13522900408133864, -0.1849645283073187, -0....","[toiletpaper, dunnypaper, coronavirus, coronav...",[11],[0],[1],[0],[2],[0],[0]


In [265]:
X_test = []
y_test = [] 

for i in range(len(testdata)):
    tok = testdata['tokens'].iloc[i]
    has = testdata['HashtagsCount'].iloc[i]
    men = testdata['Mentions'].iloc[i]
    url = testdata['URLs'].iloc[i]
    upc = testdata['UpperCaseWords'].iloc[i]
    per = testdata['PunctPeriod'].iloc[i]
    exc = testdata['PunctExclamation'].iloc[i]
    que = testdata['PunctQuestion'].iloc[i]
    row = np.concatenate([tok, has, men, url, upc, per, exc, que])
    X_test.append(row)
    
#     HashtagsCount	Mentions	URLs	UpperCaseWords	PunctPeriod	PunctExclamation	PunctQuestion
    y_test.append(testdata['Sentiment'].iloc[i])
    
print(X_test[0])
print(y_test[0])

[ 0.19751251  0.16406914 -0.08077186  0.24873493 -0.2236984  -0.02667778
  0.03970695 -0.09416564  0.15056637  0.49147954 -0.18753959 -0.05954439
 -0.076067    0.14337692 -0.5960996   0.31317267 -0.04269212  0.1886496
  0.0173678  -0.46386353  0.24728703  0.14169849 -0.01813464 -0.04459166
 -0.05791089  0.01703206 -0.01445083  0.08936925  0.21961409 -0.13089258
 -0.02144561  0.07451568 -0.25947715 -0.14026356 -0.05995768  0.08417742
  0.14595186 -0.1311665   0.17901262  0.10232114  0.14483824  0.17584943
  0.21348609  0.03781949 -0.2827316  -0.25911544 -0.06880158  0.05365289
 -0.17094703  0.05027327  0.07664106  0.14587476 -0.19888585 -0.12783226
  0.12424986 -0.19563302  0.00774756 -0.18017244  0.2547958  -0.08037268
 -0.10337515  0.01341996 -0.24838854  0.14635911 -0.30849758  0.02157698
 -0.10823765  0.15720974  0.10605738  0.12602015 -0.07108232  0.17590671
  0.21743952  0.20612701 -0.39072273  0.02816121  0.13147974  0.32887982
  0.08348509  0.17350902  0.0487698  -0.27896093  0.

## Checking shapes of Train and Test set

In [266]:
print(len(X[0]))
print(len(X_test[0]))
print(len(X))
print(len(X_test))

307
307
41106
3795


## Model Training
Considering tf-idf
- Random forest
- Logistic Regression
- KNN


In [267]:
# devide data to training data (80%) and validation data (20%)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X_test = scaler.fit_transform(X_test)
pca = PCA(.95)
pca.fit(scaled_X)
new_X = pca.transform(scaled_X)
X_test = pca.transform(scaled_X_test)
encoder = LabelEncoder()
encoder.fit(y)
scaled_y = encoder.transform(y)
y_test = encoder.transform(y_test)
print(encoder.classes_)
print(encoder.transform(encoder.classes_))
train, cross, label_train, label_cross = train_test_split(new_X, scaled_y, test_size=0.2, random_state=43)


#tuning hyperparameters
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

NB = MultinomialNB()
RForest = RandomForestClassifier(random_state=1, n_jobs=-1) #naive bayes
logistic = LogisticRegression(random_state=0, solver='saga',n_jobs = 1)
knn = KNeighborsClassifier()
rf_params = {'n_estimators': [50,200,500],
              'max_depth':[10, 30, 50]}
nb_params = {'alpha': [0,0.001,0.01,0.1,1,10,100]}
log_params = {'C':[0.0001,0.001,0.01,0.1,1]}
knn_params = {'n_neighbors':[3,5,7,10,15]}

def tune_param(esti, param, alg):
    eva = GridSearchCV(estimator=esti, 
                     param_grid=param, 
                     cv=5,
                     verbose=1, 
                     scoring='accuracy')
    eva.fit(cross, label_cross)
    print("Cross Validation of "+alg)
    print("Best parameters set found on development set:")
    print(eva.best_params_)
    print("Best parameters set found on development set:")
    print(eva.cv_results_['mean_test_score'])
    print("Best score: ")
    print(eva.best_score_)
    print()

#tune_param(NB, nb_params, "Navie Bayes")
tune_param(RForest, rf_params, "Random Forest")
tune_param(logistic, log_params, "logistic regression")
tune_param(knn, knn_params, "KNN")

['Extremely Negative' 'Extremely Positive' 'Negative' 'Neutral' 'Positive']
[0 1 2 3 4]
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  3.9min finished


Cross Validation of Random Forest
Best parameters set found on development set:
{'max_depth': 50, 'n_estimators': 500}
Best parameters set found on development set:
[0.36742885 0.37873997 0.37873997 0.35429336 0.38165896 0.38798346
 0.35587448 0.38068596 0.38968621]
Best score: 
0.3896862077353442

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.3min finished


Cross Validation of logistic regression
Best parameters set found on development set:
{'C': 0.01}
Best parameters set found on development set:
[0.39211871 0.45451228 0.45524203 0.45171491 0.45183654]
Best score: 
0.4552420335684748

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Cross Validation of KNN
Best parameters set found on development set:
{'n_neighbors': 15}
Best parameters set found on development set:
[0.33799562 0.3623206  0.36755047 0.38032109 0.38567259]
Best score: 
0.3856725857455607



[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 11.1min finished


## Model testing
Considering tf-idf

In [268]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
final_RF4 = RandomForestClassifier(max_depth=30,n_estimators=500, random_state=1, n_jobs=-1)
final_log4 = LogisticRegression(random_state=0, solver='saga',n_jobs = 1, C=0.1)
final_knn4 = KNeighborsClassifier(n_neighbors=15)
def test_result(model,name):
    model.fit(train, label_train)
    y_predict = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, y_predict))
    print(classification_report(y_test, y_predict))
    print()
test_result(final_RF4,"Random Forest")
test_result(final_log4,"Logistic Regression")
test_result(final_knn4,"KNN")

Random Forest
0.3965744400527009
             precision    recall  f1-score   support

          0       0.69      0.16      0.26       592
          1       0.71      0.21      0.33       599
          2       0.39      0.36      0.37      1041
          3       0.54      0.45      0.49       616
          4       0.32      0.67      0.43       947

avg / total       0.49      0.40      0.38      3795






Logistic Regression
0.4658761528326746
             precision    recall  f1-score   support

          0       0.56      0.44      0.49       592
          1       0.60      0.54      0.57       599
          2       0.44      0.34      0.38      1041
          3       0.47      0.66      0.55       616
          4       0.38      0.45      0.41       947

avg / total       0.47      0.47      0.46      3795


KNN
0.397364953886693
             precision    recall  f1-score   support

          0       0.50      0.37      0.43       592
          1       0.52      0.35      0.42       599
          2       0.38      0.36      0.37      1041
          3       0.39      0.61      0.48       616
          4       0.33      0.35      0.34       947

avg / total       0.41      0.40      0.40      3795




# General features only
Sentiment prediction of tweets based on general features only

## Prepare trainingset

In [269]:
X = []
y = [] 

for i in range(len(data)):
    has = data['HashtagsCount'].iloc[i]
    men = data['Mentions'].iloc[i]
    url = data['URLs'].iloc[i]
    upc = data['UpperCaseWords'].iloc[i]
    per = data['PunctPeriod'].iloc[i]
    exc = data['PunctExclamation'].iloc[i]
    que = data['PunctQuestion'].iloc[i]
    row = np.concatenate([has, men, url, upc, per, exc, que])
    X.append(row)
    y.append(data['Sentiment'].iloc[i])
    
print(X[0])
print(y[0])

[0 0 0 1 0 0 0]
Positive


## Prepare testset

In [270]:
X_test = []
y_test = [] 

for i in range(len(testdata)):
    has = testdata['HashtagsCount'].iloc[i]
    men = testdata['Mentions'].iloc[i]
    url = testdata['URLs'].iloc[i]
    upc = testdata['UpperCaseWords'].iloc[i]
    per = testdata['PunctPeriod'].iloc[i]
    exc = testdata['PunctExclamation'].iloc[i]
    que = testdata['PunctQuestion'].iloc[i]
    row = np.concatenate([has, men, url, upc, per, exc, que])
    X_test.append(row)
    
#     HashtagsCount	Mentions	URLs	UpperCaseWords	PunctPeriod	PunctExclamation	PunctQuestion
    y_test.append(testdata['Sentiment'].iloc[i])
    
print(X_test[0])
print(y_test[0])

[1 0 2 1 2 0 0]
Extremely Negative


## Check shape of Training and Test set

In [271]:
print(len(X[0]))
print(len(X_test[0]))
print(len(X))
print(len(X_test))

7
7
41106
3795


## Model training

In [272]:
# devide data to training data (80%) and validation data (20%)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X_test = scaler.fit_transform(X_test)
pca = PCA(.95)
pca.fit(scaled_X)
new_X = pca.transform(scaled_X)
X_test = pca.transform(scaled_X_test)
encoder = LabelEncoder()
encoder.fit(y)
scaled_y = encoder.transform(y)
y_test = encoder.transform(y_test)
print(encoder.classes_)
print(encoder.transform(encoder.classes_))
train, cross, label_train, label_cross = train_test_split(new_X, scaled_y, test_size=0.2, random_state=43)


#tuning hyperparameters
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

NB = MultinomialNB()
RForest = RandomForestClassifier(random_state=1, n_jobs=-1) #naive bayes
logistic = LogisticRegression(random_state=0, solver='saga',n_jobs = 1)
knn = KNeighborsClassifier()
rf_params = {'n_estimators': [50,200,500],
              'max_depth':[10, 30, 50]}
nb_params = {'alpha': [0,0.001,0.01,0.1,1,10,100]}
log_params = {'C':[0.0001,0.001,0.01,0.1,1]}
knn_params = {'n_neighbors':[3,5,7,10,15]}

def tune_param(esti, param, alg):
    eva = GridSearchCV(estimator=esti, 
                     param_grid=param, 
                     cv=5,
                     verbose=1, 
                     scoring='accuracy')
    eva.fit(cross, label_cross)
    print("Cross Validation of "+alg)
    print("Best parameters set found on development set:")
    print(eva.best_params_)
    print("Best parameters set found on development set:")
    print(eva.cv_results_['mean_test_score'])
    print("Best score: ")
    print(eva.best_score_)
    print()

#tune_param(NB, nb_params, "Navie Bayes")
tune_param(RForest, rf_params, "Random Forest")
tune_param(logistic, log_params, "logistic regression")
tune_param(knn, knn_params, "KNN")

['Extremely Negative' 'Extremely Positive' 'Negative' 'Neutral' 'Positive']
[0 1 2 3 4]
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   52.2s finished


Cross Validation of Random Forest
Best parameters set found on development set:
{'max_depth': 10, 'n_estimators': 200}
Best parameters set found on development set:
[0.27402092 0.28010216 0.27864267 0.25942593 0.26064218 0.26198005
 0.25711506 0.25966918 0.26185843]
Best score: 
0.2801021649233763

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    7.1s finished


Cross Validation of logistic regression
Best parameters set found on development set:
{'C': 0.1}
Best parameters set found on development set:
[0.27985892 0.28229141 0.28484554 0.28581854 0.28545366]
Best score: 
0.28581853563609827

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Cross Validation of KNN
Best parameters set found on development set:
{'n_neighbors': 15}
Best parameters set found on development set:
[0.21053272 0.22768183 0.22451958 0.2368037  0.25103381]
Best score: 
0.25103381172464123



[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    4.2s finished


In [273]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
final_RF4 = RandomForestClassifier(max_depth=30,n_estimators=500, random_state=1, n_jobs=-1)
final_log4 = LogisticRegression(random_state=0, solver='saga',n_jobs = 1, C=0.1)
final_knn4 = KNeighborsClassifier(n_neighbors=15)
def test_result(model,name):
    model.fit(train, label_train)
    y_predict = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, y_predict))
    print(classification_report(y_test, y_predict))
    print()
test_result(final_RF4,"Random Forest")
test_result(final_log4,"Logistic Regression")
test_result(final_knn4,"KNN")

Random Forest
0.26455862977602107
             precision    recall  f1-score   support

          0       0.22      0.04      0.07       592
          1       0.21      0.08      0.12       599
          2       0.30      0.19      0.24      1041
          3       0.30      0.29      0.30       616
          4       0.25      0.58      0.35       947

avg / total       0.26      0.26      0.23      3795


Logistic Regression
0.25691699604743085
             precision    recall  f1-score   support

          0       0.62      0.01      0.02       592
          1       0.18      0.02      0.03       599
          2       0.00      0.00      0.00      1041
          3       0.31      0.18      0.23       616
          4       0.25      0.90      0.39       947

avg / total       0.24      0.26      0.14      3795




  'precision', 'predicted', average, warn_for)


KNN
0.2545454545454545
             precision    recall  f1-score   support

          0       0.20      0.13      0.16       592
          1       0.22      0.17      0.19       599
          2       0.27      0.30      0.28      1041
          3       0.29      0.24      0.26       616
          4       0.26      0.34      0.29       947

avg / total       0.25      0.25      0.25      3795




In [274]:
# Normalize sentiment

In [282]:
data_norm = data
data_norm.loc[data_norm.Sentiment == "Extremely Negative", "Sentiment"] = "Negative"
data_norm.loc[data_norm.Sentiment == "Extremely Positive", "Sentiment"] = "Positive"
display(data_norm.head())

testdata_norm = testdata
testdata_norm.loc[testdata_norm.Sentiment == "Extremely Negative", "Sentiment"] = "Negative"
testdata_norm.loc[testdata_norm.Sentiment == "Extremely Positive", "Sentiment"] = "Positive"
display(testdata_norm.head())

Unnamed: 0,OriginalTweet,Sentiment,tokens,Hashtags,HashtagsCount,Mentions,URLs,UpperCaseWords,PunctPeriod,PunctExclamation,PunctQuestion
1,advice Talk to your neighbours family to excha...,Positive,"[-0.045217821723781526, -0.12957120189093985, ...",[],[0],[0],[0],[1],[0],[0],[0]
2,Coronavirus Australia: Woolworths to give elde...,Positive,"[0.1090496089309454, 0.11484042098163627, 0.10...",[],[0],[0],[1],[1],[1],[0],[0]
3,My food stock is not the only one which is emp...,Positive,"[0.18580312211997807, -0.07340790703892708, 0....","[covid19france, covid_19, covid19, coronavirus...",[7],[0],[1],[11],[6],[0],[0]
4,"Me, ready to go at supermarket during the #COV...",Negative,"[0.27922972617670894, -0.03418381605297327, 0....","[covid19, coronavirus, coronavirusfrance, rest...",[6],[0],[1],[2],[7],[0],[0]
5,As news of the regionÃÂs first confirmed COV...,Positive,"[0.04783004254568368, 0.059689379995688796, -0...",[],[0],[1],[1],[1],[1],[0],[0]


Unnamed: 0,OriginalTweet,Sentiment,sent_token,tokens,Hashtags,HashtagsCount,Mentions,URLs,UpperCaseWords,PunctPeriod,PunctExclamation,PunctQuestion
0,TRENDING: New Yorkers encounter empty supermar...,Negative,['TRENDING: New Yorkers encounter empty superm...,"[0.1975125118624419, 0.16406913893297315, -0.0...",[coronavirus],[1],[0],[2],[1],[2],[0],[0]
1,When I couldn't find hand sanitizer at Fred Me...,Positive,"[""When I couldn't find hand sanitizer at Fred ...","[0.012639568885788321, 0.09953883814159781, -0...","[amazon, coronavirus]",[2],[0],[1],[2],[4],[2],[2]
2,Find out how you can protect yourself and love...,Positive,['Find out how you can protect yourself and lo...,"[-0.01927915937267244, -0.0742998868227005, -0...",[coronavirus],[1],[0],[0],[0],[1],[0],[1]
3,#Panic buying hits #NewYork City as anxious sh...,Negative,['#Panic buying hits #NewYork City as anxious ...,"[-0.16155412141233683, 0.04214186384342611, -0...","[panic, newyork, healthcare, bigapple, coronav...",[11],[0],[2],[6],[2],[0],[1]
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,['#toiletpaper #dunnypaper #coronavirus #coron...,"[0.13522900408133864, -0.1849645283073187, -0....","[toiletpaper, dunnypaper, coronavirus, coronav...",[11],[0],[1],[0],[2],[0],[0]


In [283]:
X = []
y = [] 

for i in range(len(data_norm)):
    tok = data_norm['tokens'].iloc[i]
    has = data_norm['HashtagsCount'].iloc[i]
    men = data_norm['Mentions'].iloc[i]
    url = data_norm['URLs'].iloc[i]
    upc = data_norm['UpperCaseWords'].iloc[i]
    per = data_norm['PunctPeriod'].iloc[i]
    exc = data_norm['PunctExclamation'].iloc[i]
    que = data_norm['PunctQuestion'].iloc[i]
    row = np.concatenate([tok, has, men, url, upc, per, exc, que])
    X.append(row)
    y.append(data_norm['Sentiment'].iloc[i])
    
print(X[0])
print(y[0])

[-0.04521782 -0.1295712  -0.03599979 -0.03202558 -0.13110606 -0.0385706
  0.06007127 -0.19903495  0.48167129  0.26953838 -0.06201406 -0.1218441
 -0.13611413  0.12963183 -0.67325786  0.39216037  0.19548158  0.2820546
 -0.01657487 -0.2635157   0.54904745  0.11781592 -0.13391594  0.15330124
 -0.01461942 -0.07108847 -0.5629061   0.31965716  0.09751459 -0.02628797
 -0.19790317 -0.36990661 -0.30469168 -0.36872784 -0.28543786 -0.28384049
  0.35927944  0.00246197 -0.04935208  0.29575096  0.02854061 -0.03263297
  0.06597441 -0.14169426 -0.29342194 -0.32867927 -0.23560393  0.13801583
 -0.35742032 -0.11407956 -0.14610504 -0.12540382 -0.05014944 -0.14147434
 -0.03050272 -0.02898542 -0.09951725 -0.15503716 -0.16825065 -0.12854678
 -0.17628692 -0.17272498 -0.31981235  0.11415113 -0.0162946   0.37618858
 -0.54127505  0.42038736  0.02666528  0.2927442   0.03846314  0.01600092
  0.63625876  0.11625102 -0.58021663  0.05137449  0.23431435  0.44318428
  0.24013518 -0.03901059  0.23346466 -0.1976378   0.10

In [284]:
X_test = []
y_test = [] 

for i in range(len(testdata_norm)):
    tok = testdata_norm['tokens'].iloc[i]
    has = testdata_norm['HashtagsCount'].iloc[i]
    men = testdata_norm['Mentions'].iloc[i]
    url = testdata_norm['URLs'].iloc[i]
    upc = testdata_norm['UpperCaseWords'].iloc[i]
    per = testdata_norm['PunctPeriod'].iloc[i]
    exc = testdata_norm['PunctExclamation'].iloc[i]
    que = testdata_norm['PunctQuestion'].iloc[i]
    row = np.concatenate([tok, has, men, url, upc, per, exc, que])
    X_test.append(row)
    
#     HashtagsCount	Mentions	URLs	UpperCaseWords	PunctPeriod	PunctExclamation	PunctQuestion
    y_test.append(testdata_norm['Sentiment'].iloc[i])
    
print(X_test[0])
print(y_test[0])

[ 0.19751251  0.16406914 -0.08077186  0.24873493 -0.2236984  -0.02667778
  0.03970695 -0.09416564  0.15056637  0.49147954 -0.18753959 -0.05954439
 -0.076067    0.14337692 -0.5960996   0.31317267 -0.04269212  0.1886496
  0.0173678  -0.46386353  0.24728703  0.14169849 -0.01813464 -0.04459166
 -0.05791089  0.01703206 -0.01445083  0.08936925  0.21961409 -0.13089258
 -0.02144561  0.07451568 -0.25947715 -0.14026356 -0.05995768  0.08417742
  0.14595186 -0.1311665   0.17901262  0.10232114  0.14483824  0.17584943
  0.21348609  0.03781949 -0.2827316  -0.25911544 -0.06880158  0.05365289
 -0.17094703  0.05027327  0.07664106  0.14587476 -0.19888585 -0.12783226
  0.12424986 -0.19563302  0.00774756 -0.18017244  0.2547958  -0.08037268
 -0.10337515  0.01341996 -0.24838854  0.14635911 -0.30849758  0.02157698
 -0.10823765  0.15720974  0.10605738  0.12602015 -0.07108232  0.17590671
  0.21743952  0.20612701 -0.39072273  0.02816121  0.13147974  0.32887982
  0.08348509  0.17350902  0.0487698  -0.27896093  0.

In [285]:
print(len(X[0]))
print(len(X_test[0]))
print(len(X))
print(len(X_test))

307
307
41106
3795


In [286]:
# devide data to training data (80%) and validation data (20%)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X_test = scaler.fit_transform(X_test)
pca = PCA(.95)
pca.fit(scaled_X)
new_X = pca.transform(scaled_X)
X_test = pca.transform(scaled_X_test)
encoder = LabelEncoder()
encoder.fit(y)
scaled_y = encoder.transform(y)
y_test = encoder.transform(y_test)
print(encoder.classes_)
print(encoder.transform(encoder.classes_))
train, cross, label_train, label_cross = train_test_split(new_X, scaled_y, test_size=0.2, random_state=43)


#tuning hyperparameters
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

NB = MultinomialNB()
RForest = RandomForestClassifier(random_state=1, n_jobs=-1) #naive bayes
logistic = LogisticRegression(random_state=0, solver='saga',n_jobs = 1)
knn = KNeighborsClassifier()
rf_params = {'n_estimators': [50,200,500],
              'max_depth':[10, 30, 50]}
nb_params = {'alpha': [0,0.001,0.01,0.1,1,10,100]}
log_params = {'C':[0.0001,0.001,0.01,0.1,1]}
knn_params = {'n_neighbors':[3,5,7,10,15]}

def tune_param(esti, param, alg):
    eva = GridSearchCV(estimator=esti, 
                     param_grid=param, 
                     cv=5,
                     verbose=1, 
                     scoring='accuracy')
    eva.fit(cross, label_cross)
    print("Cross Validation of "+alg)
    print("Best parameters set found on development set:")
    print(eva.best_params_)
    print("Best parameters set found on development set:")
    print(eva.cv_results_['mean_test_score'])
    print("Best score: ")
    print(eva.best_score_)
    print()

#tune_param(NB, nb_params, "Navie Bayes")
tune_param(RForest, rf_params, "Random Forest")
tune_param(logistic, log_params, "logistic regression")
tune_param(knn, knn_params, "KNN")

['Negative' 'Neutral' 'Positive']
[0 1 2]
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  3.7min finished


Cross Validation of Random Forest
Best parameters set found on development set:
{'max_depth': 50, 'n_estimators': 500}
Best parameters set found on development set:
[0.57115057 0.58197519 0.58659694 0.5649477  0.59365118 0.59681343
 0.5649477  0.59243493 0.59754318]
Best score: 
0.5975431768426174

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.1min finished


Cross Validation of logistic regression
Best parameters set found on development set:
{'C': 0.01}
Best parameters set found on development set:
[0.61092192 0.65166626 0.65872051 0.65774751 0.65799076]
Best score: 
0.6587205059596205

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Cross Validation of KNN
Best parameters set found on development set:
{'n_neighbors': 15}
Best parameters set found on development set:
[0.54183897 0.55631233 0.57650207 0.58270494 0.59413768]
Best score: 
0.5941376793967404



[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 11.0min finished


In [287]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
final_RF4 = RandomForestClassifier(max_depth=30,n_estimators=500, random_state=1, n_jobs=-1)
final_log4 = LogisticRegression(random_state=0, solver='saga',n_jobs = 1, C=0.1)
final_knn4 = KNeighborsClassifier(n_neighbors=15)
def test_result(model,name):
    model.fit(train, label_train)
    y_predict = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, y_predict))
    print(classification_report(y_test, y_predict))
    print()
test_result(final_RF4,"Random Forest")
test_result(final_log4,"Logistic Regression")
test_result(final_knn4,"KNN")

Random Forest
0.6184453227931489
             precision    recall  f1-score   support

          0       0.71      0.56      0.63      1633
          1       0.72      0.25      0.37       616
          2       0.56      0.82      0.66      1546

avg / total       0.65      0.62      0.60      3795






Logistic Regression
0.6693017127799736
             precision    recall  f1-score   support

          0       0.73      0.64      0.68      1633
          1       0.57      0.54      0.56       616
          2       0.65      0.75      0.70      1546

avg / total       0.67      0.67      0.67      3795


KNN
0.6050065876152833
             precision    recall  f1-score   support

          0       0.66      0.62      0.64      1633
          1       0.46      0.54      0.49       616
          2       0.62      0.62      0.62      1546

avg / total       0.61      0.61      0.61      3795




In [288]:
X = []
y = [] 

for i in range(len(data_norm)):
    has = data_norm['HashtagsCount'].iloc[i]
    men = data_norm['Mentions'].iloc[i]
    url = data_norm['URLs'].iloc[i]
    upc = data_norm['UpperCaseWords'].iloc[i]
    per = data_norm['PunctPeriod'].iloc[i]
    exc = data_norm['PunctExclamation'].iloc[i]
    que = data_norm['PunctQuestion'].iloc[i]
    row = np.concatenate([has, men, url, upc, per, exc, que])
    X.append(row)
    y.append(data_norm['Sentiment'].iloc[i])
    
print(X[0])
print(y[0])

[0 0 0 1 0 0 0]
Positive


In [289]:
X_test = []
y_test = [] 

for i in range(len(testdata_norm)):
    has = testdata_norm['HashtagsCount'].iloc[i]
    men = testdata_norm['Mentions'].iloc[i]
    url = testdata_norm['URLs'].iloc[i]
    upc = testdata_norm['UpperCaseWords'].iloc[i]
    per = testdata_norm['PunctPeriod'].iloc[i]
    exc = testdata_norm['PunctExclamation'].iloc[i]
    que = testdata_norm['PunctQuestion'].iloc[i]
    row = np.concatenate([has, men, url, upc, per, exc, que])
    X_test.append(row)
    
#     HashtagsCount	Mentions	URLs	UpperCaseWords	PunctPeriod	PunctExclamation	PunctQuestion
    y_test.append(testdata_norm['Sentiment'].iloc[i])
    
print(X_test[0])
print(y_test[0])

[1 0 2 1 2 0 0]
Negative


In [290]:
print(len(X[0]))
print(len(X_test[0]))
print(len(X))
print(len(X_test))

7
7
41106
3795


In [291]:
# devide data to training data (80%) and validation data (20%)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X_test = scaler.fit_transform(X_test)
pca = PCA(.95)
pca.fit(scaled_X)
new_X = pca.transform(scaled_X)
X_test = pca.transform(scaled_X_test)
encoder = LabelEncoder()
encoder.fit(y)
scaled_y = encoder.transform(y)
y_test = encoder.transform(y_test)
print(encoder.classes_)
print(encoder.transform(encoder.classes_))
train, cross, label_train, label_cross = train_test_split(new_X, scaled_y, test_size=0.2, random_state=43)


#tuning hyperparameters
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

NB = MultinomialNB()
RForest = RandomForestClassifier(random_state=1, n_jobs=-1) #naive bayes
logistic = LogisticRegression(random_state=0, solver='saga',n_jobs = 1)
knn = KNeighborsClassifier()
rf_params = {'n_estimators': [50,200,500],
              'max_depth':[10, 30, 50]}
nb_params = {'alpha': [0,0.001,0.01,0.1,1,10,100]}
log_params = {'C':[0.0001,0.001,0.01,0.1,1]}
knn_params = {'n_neighbors':[3,5,7,10,15]}

def tune_param(esti, param, alg):
    eva = GridSearchCV(estimator=esti, 
                     param_grid=param, 
                     cv=5,
                     verbose=1, 
                     scoring='accuracy')
    eva.fit(cross, label_cross)
    print("Cross Validation of "+alg)
    print("Best parameters set found on development set:")
    print(eva.best_params_)
    print("Best parameters set found on development set:")
    print(eva.cv_results_['mean_test_score'])
    print("Best score: ")
    print(eva.best_score_)
    print()

#tune_param(NB, nb_params, "Navie Bayes")
tune_param(RForest, rf_params, "Random Forest")
tune_param(logistic, log_params, "logistic regression")
tune_param(knn, knn_params, "KNN")

['Negative' 'Neutral' 'Positive']
[0 1 2]
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   48.6s finished


Cross Validation of Random Forest
Best parameters set found on development set:
{'max_depth': 10, 'n_estimators': 500}
Best parameters set found on development set:
[0.43006568 0.43298468 0.43529555 0.41778156 0.41997081 0.42483581
 0.41875456 0.42264656 0.42422768]
Best score: 
0.4352955485283386

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    4.3s finished


Cross Validation of logistic regression
Best parameters set found on development set:
{'C': 0.01}
Best parameters set found on development set:
[0.4374848  0.43772805 0.44003892 0.44003892 0.44003892]
Best score: 
0.44003891997081

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Cross Validation of KNN
Best parameters set found on development set:
{'n_neighbors': 15}
Best parameters set found on development set:
[0.3979567  0.38774021 0.4003892  0.40452445 0.41838969]
Best score: 
0.41838968620773537



[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    3.9s finished


In [292]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
final_RF4 = RandomForestClassifier(max_depth=30,n_estimators=500, random_state=1, n_jobs=-1)
final_log4 = LogisticRegression(random_state=0, solver='saga',n_jobs = 1, C=0.1)
final_knn4 = KNeighborsClassifier(n_neighbors=15)
def test_result(model,name):
    model.fit(train, label_train)
    y_predict = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, y_predict))
    print(classification_report(y_test, y_predict))
    print()
test_result(final_RF4,"Random Forest")
test_result(final_log4,"Logistic Regression")
test_result(final_knn4,"KNN")

Random Forest
0.4276679841897233
             precision    recall  f1-score   support

          0       0.45      0.35      0.39      1633
          1       0.34      0.09      0.14       616
          2       0.42      0.64      0.51      1546

avg / total       0.42      0.43      0.40      3795


Logistic Regression
0.40869565217391307
             precision    recall  f1-score   support

          0       0.54      0.00      0.01      1633
          1       0.40      0.04      0.07       616
          2       0.41      0.98      0.58      1546

avg / total       0.46      0.41      0.25      3795


KNN
0.42845849802371544
             precision    recall  f1-score   support

          0       0.44      0.48      0.46      1633
          1       0.38      0.12      0.18       616
          2       0.43      0.50      0.46      1546

avg / total       0.42      0.43      0.41      3795


