In [7]:
import pandas as pd 

data = pd.read_csv("Corona_NLP_train.csv", encoding = 'latin1') 
test_data = pd.read_csv("Corona_NLP_test.csv", encoding = 'latin1') 
test_data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [8]:
#data = data.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis = 1)
test_data = test_data.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis = 1)
test_data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Extremely Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [12]:
import re
import preprocessor.api as p
from contractions import contractions_dict
from nltk.corpus import wordnet
import pattern
from pattern.en import suggest, lemma
import wordninja
from numpy import nan
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
from stopword import stopwords
from nltk.corpus import words
from tqdm._tqdm_notebook import tqdm_notebook
from spellchecker import SpellChecker

def remove_nonASCII(text):
    #remove non english alphabet
    return re.sub(r'[^\x00-\x7f]',"", text)

def remove_repeated_characters(word):
    pattern = re.compile(r"(\w*)(\w)\2(\w*)")
    substitution_pattern = r"\1\2\3"
    while True:
        if wordnet.synsets(word):
            return word
        new_word = pattern.sub(substitution_pattern,word)
        if new_word != word:
            word = new_word
            continue
        else:
            return new_word

def expand_contractions(text):
    pattern = re.compile("({})".format("|".join(contractions_dict.keys())),flags = re.DOTALL| re.IGNORECASE)
    
    def replace_text(t):
        txt = t.group(0)
        if txt in contractions_dict.keys():
            return contractions_dict[txt]
        
    expand_text = pattern.sub(replace_text,text)
    return expand_text 

def spelling_checker(word):
    checker = suggest(word)
    return checker[0][0]

def remove_stopwords(w):
    text = []
    for word in w:
        if word not in stopwords:
              text.append(word)
    return text

def split_words(words):
    #split words which are not separated by space
    wlist = []
    for word in words:
        # 12 is decided based on distribution of word length in English
        if len(word)> 12:
            wlist = wlist + wordninja.split(word)
        else:
            wlist.append(word)
    return wlist


def preprocess_unit(text):
    text = remove_nonASCII(text)
    text = re.sub('\S+@\S*\s?', ' ', text) #remove email
    text = p.clean(text)    # remove URL, hashtag, @-mention, emojis
    text = re.sub(r'[^\w\s]', ' ', text) #remove punctuations
    text = re.sub(r'\_','',text) #remove underscore
    text = re.sub(r'[0-9]*', '', text) #remove digits
    text = text.lower() #to lower case
    text = expand_contractions(text) #expand contraction
    Words = word_tokenize(text) #tokenization
    filtered_words = remove_stopwords(Words) #remove stopwords
    if len(filtered_words) > 0:
        """
        for word in filtered_words:
            if word in words.words():
                output.append(word)
            else:
                output.append(remove_repeated_characters(word))
        """
        #output = split_words(output)
        #output = [spelling_checker(s) for s in filtered_words]
        output = [lemma(s) for s in filtered_words]
        return output
    else:
        return nan

def sent_preprocess(sentences):
    result = []
    for sentence in sentences:
        #s = TextBlob(sentence)
        #sentence = s.correct()
        temp = preprocess_unit(sentence)
        if temp is not nan:
            result.append(temp)
    #print(result)
    return result

#tqdm_notebook.pandas()
"""
data['sent_token'] = data['OriginalTweet'].apply(sent_tokenize)
data['toks'] = data['sent_token'].progress_apply(sent_preprocess)
data.head()
"""
#test_data['sent_token'] = test_data['OriginalTweet'].apply(sent_tokenize)
test_data['tokens'] = test_data['OriginalTweet'].apply(preprocess_unit)
#test_data.head()
#data['tokens'] = data['OriginalTweet'].progress_apply(preprocess_unit)
#data.head()
test_data.to_csv("preprocess_test_final.csv", index = False, header = True)

# training self-word2vec model

In [5]:

import pandas as pd 
from ast import literal_eval
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import numpy as np

data = pd.read_csv("preprocess_train_final.csv", encoding = 'latin1') 
data['toks'] = data['toks'].apply(literal_eval)
data = data[data['toks'].map(len)>0]
# define training data
sentences = []
for item in data['toks']:
    for sub in item:
        sentences.append(sub)

model_2 = Word2Vec(size=300, min_count=1)
model_2.build_vocab(sentences)
total_examples = model_2.corpus_count
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True, limit=10 ** 5)
model_2.build_vocab([list(model.vocab.keys())], update=True)
model_2.intersect_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True, lockf=1.0)
model_2.train(sentences, total_examples=total_examples, epochs=model_2.iter)
#print(model_2['corona'])
model_2.save('model.bin')

# load model
#new_model = Word2Vec.load('model.bin')
#print(new_model)


'\nimport pandas as pd \nfrom ast import literal_eval\nfrom gensim.models import Word2Vec\nfrom gensim.models import KeyedVectors\nimport numpy as np\n\ndata = pd.read_csv("preprocess_train_final.csv", encoding = \'latin1\') \ndata[\'toks\'] = data[\'toks\'].apply(literal_eval)\ndata = data[data[\'toks\'].map(len)>0]\n# define training data\nsentences = []\nfor item in data[\'toks\']:\n    for sub in item:\n        sentences.append(sub)\n\nmodel_2 = Word2Vec(size=300, min_count=1)\nmodel_2.build_vocab(sentences)\ntotal_examples = model_2.corpus_count\nmodel = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True, limit=10 ** 5)\nmodel_2.build_vocab([list(model.vocab.keys())], update=True)\nmodel_2.intersect_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True, lockf=1.0)\nmodel_2.train(sentences, total_examples=total_examples, epochs=model_2.iter)\n#print(model_2[\'corona\'])\nmodel_2.save(\'model.bin\')\n\n# load model\nnew_model = Word

# extract vectors for tweets (without considering tfidf)
number of tweets: 41106

In [24]:
import pandas as pd 
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
import numpy as np

data = pd.read_csv("preprocess_train.csv", encoding = 'latin1') 
data = data.dropna()
data['tokens'] = data['tokens'].apply(literal_eval)
corpus = []
for item in data['tokens']:
    corpus.append(' '.join(item))
    
#convert sentiment to numerical data
def sentiment_to_numeric(tweet):
    conversion = {'Extremely Positive': 1, 'Positive':2, 'Neutral':3, 'Negative':4, 'Extremely Negative': 5}
    tweet = conversion[tweet]
    return tweet
#data['sentiment_numer'] = data['Sentiment'].apply(sentiment_to_numeric)
#data.head()

word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=10 ** 5)
#build tf-idf vector
#tfidf = TfidfVectorizer(min_df=3)
#tfidf.fit(corpus)
#feature_names = tfidf.get_feature_names()

def get_ifidf_for_words(text):
    tfidf_matrix= tfidf.transform([text]).todense()
    feature_index = tfidf_matrix[0,:].nonzero()[1]
    tfidf_scores = zip([feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index])
    return dict(tfidf_scores)
"""
def w2vmean(l):
    #text = ' '.join(l)
    #tf_idf = get_ifidf_for_words(text)
    #print(tf_idf)
    X1 = np.zeros((300,))
    for x in l:
        if x in word2vec and x in tf_idf:
            X1 += word2vec[x] * tf_idf[x]
    return X1
"""
def w2vmean(l):
    X = np.zeros((300,))
    for x in l:
        if x in word2vec:
            X += word2vec[x]
    return X/len(l)

X = []
y = []
for i in range(len(data)):
    X.append(w2vmean(data['tokens'].iloc[i]))
    y.append(data['Sentiment'].iloc[i])


# extract word vectors for test data

In [25]:
import pandas as pd 
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
import numpy as np

data = pd.read_csv("preprocess_test_final.csv", encoding = 'latin1') 
data = data.dropna()
data['tokens'] = data['tokens'].apply(literal_eval)
corpus = []
for item in data['tokens']:
    corpus.append(' '.join(item))

word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=10 ** 5)
#build tf-idf vector
#tfidf = TfidfVectorizer(min_df=3)
#tfidf.fit(corpus)
#feature_names = tfidf.get_feature_names()

def get_ifidf_for_words(text):
    tfidf_matrix= tfidf.transform([text]).todense()
    feature_index = tfidf_matrix[0,:].nonzero()[1]
    tfidf_scores = zip([feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index])
    return dict(tfidf_scores)
"""
def w2vmean(l):
    #text = ' '.join(l)
    #tf_idf = get_ifidf_for_words(text)
    #print(tf_idf)
    X1 = np.zeros((300,))
    for x in l:
        if x in word2vec and x in tf_idf:
            X1 += word2vec[x] * tf_idf[x]
    return X1
"""
def w2vmean(l):
    X = np.zeros((300,))
    for x in l:
        if x in word2vec:
            X += word2vec[x]
    return X/len(l)

X_test = []
y_test = []
for i in range(len(data)):
    X_test.append(w2vmean(data['tokens'].iloc[i]))
    y_test.append(data['Sentiment'].iloc[i])

# Model Training (pre-trained one without considering TFIDF)
- Random forest
- Logistic Regression
- KNN

In [26]:
# devide data to training data (80%) and validation data (20%)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X_test = scaler.fit_transform(X_test)
pca = PCA(.95)
pca.fit(scaled_X)
new_X = pca.transform(scaled_X)
X_test = pca.transform(scaled_X_test)
encoder = LabelEncoder()
encoder.fit(y)
scaled_y = encoder.transform(y)
y_test = encoder.transform(y_test)
print(encoder.classes_)
print(encoder.transform(encoder.classes_))
train, cross, label_train, label_cross = train_test_split(new_X, scaled_y, test_size=0.2, random_state=43)


#tuning hyperparameters
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

NB = MultinomialNB()
RForest = RandomForestClassifier(random_state=1, n_jobs=-1) #naive bayes
logistic = LogisticRegression(random_state=0, solver='saga',n_jobs = 1)
knn = KNeighborsClassifier()
rf_params = {'n_estimators': [50,200,500],
              'max_depth':[10, 30, 50]}
nb_params = {'alpha': [0,0.001,0.01,0.1,1,10,100]}
log_params = {'C':[0.0001,0.001,0.01,0.1,1]}
knn_params = {'n_neighbors':[3,5,7,10,15]}

def tune_param(esti, param, alg):
    eva = GridSearchCV(estimator=esti, 
                     param_grid=param, 
                     cv=5,
                     verbose=1, 
                     scoring='accuracy')
    eva.fit(cross, label_cross)
    print("Cross Validation of "+alg)
    print("Best parameters set found on development set:")
    print(eva.best_params_)
    print("Best parameters set found on development set:")
    print(eva.cv_results_['mean_test_score'])
    print("Best score: ")
    print(eva.best_score_)
    print()

#tune_param(NB, nb_params, "Navie Bayes")
tune_param(RForest, rf_params, "Random Forest")
tune_param(logistic, log_params, "logistic regression")
tune_param(knn, knn_params, "KNN")

['Extremely Negative' 'Extremely Positive' 'Negative' 'Neutral' 'Positive']
[0 1 2 3 4]
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  4.4min finished


Cross Validation of Random Forest
Best parameters set found on development set:
{'max_depth': 50, 'n_estimators': 500}
Best parameters set found on development set:
[0.35696911 0.36803697 0.37594259 0.36706397 0.39284846 0.40647045
 0.37168572 0.39710533 0.40720019]
Best score: 
0.40720019459985407

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.2min finished


Cross Validation of logistic regression
Best parameters set found on development set:
{'C': 0.01}
Best parameters set found on development set:
[0.3628071  0.42203843 0.4340793  0.43371442 0.4335928 ]
Best score: 
0.4340792994405254

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross Validation of KNN
Best parameters set found on development set:
{'n_neighbors': 15}
Best parameters set found on development set:
[0.2983459  0.32802238 0.33130625 0.34541474 0.35100949]
Best score: 
0.35100948674288496



[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.1min finished


# Model Test

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
final_RF = RandomForestClassifier(max_depth=50,n_estimators=500, random_state=1, n_jobs=-1)
final_log = LogisticRegression(random_state=0, solver='saga',n_jobs = 1, C=0.01)
final_knn = KNeighborsClassifier(n_neighbors=15)
def test_result(model,name):
    model.fit(train, label_train)
    y_predict = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, y_predict))
    print(classification_report(y_test, y_predict))
    print()
test_result(final_RF,"Random Forest")
test_result(final_log,"Logistic Regression")
test_result(final_knn,"KNN")

Random Forest
0.4129117259552042
              precision    recall  f1-score   support

           0       0.70      0.17      0.28       592
           1       0.78      0.23      0.35       599
           2       0.38      0.39      0.39      1041
           3       0.56      0.44      0.49       616
           4       0.34      0.68      0.45       947

    accuracy                           0.41      3795
   macro avg       0.55      0.38      0.39      3795
weighted avg       0.51      0.41      0.40      3795






Logistic Regression
0.4590250329380764
              precision    recall  f1-score   support

           0       0.56      0.35      0.43       592
           1       0.61      0.46      0.52       599
           2       0.44      0.39      0.41      1041
           3       0.47      0.61      0.53       616
           4       0.38      0.50      0.43       947

    accuracy                           0.46      3795
   macro avg       0.49      0.46      0.47      3795
weighted avg       0.48      0.46      0.46      3795


KNN
0.37602108036890647
              precision    recall  f1-score   support

           0       0.37      0.52      0.43       592
           1       0.40      0.54      0.46       599
           2       0.36      0.31      0.34      1041
           3       0.67      0.21      0.32       616
           4       0.32      0.36      0.34       947

    accuracy                           0.38      3795
   macro avg       0.43      0.39      0.38      3795
weighted avg 

# extracting word vector considering tf-idf

In [33]:
import pandas as pd 
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
import numpy as np

data2 = pd.read_csv("preprocess_train.csv", encoding = 'latin1') 
data2_test = pd.read_csv("preprocess_test_final.csv", encoding = 'latin1') 
data2 = data2.dropna()
data2_test = data2_test.dropna()
data2['tokens'] = data2['tokens'].apply(literal_eval)
data2_test['tokens'] = data2_test['tokens'].apply(literal_eval)
corpus = []
for item in data['tokens']:
    corpus.append(' '.join(item))

word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=10 ** 5)
#build tf-idf vector
tfidf = TfidfVectorizer(min_df=3)
tfidf.fit(corpus)
feature_names = tfidf.get_feature_names()

def get_ifidf_for_words(text):
    tfidf_matrix= tfidf.transform([text]).todense()
    feature_index = tfidf_matrix[0,:].nonzero()[1]
    tfidf_scores = zip([feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index])
    return dict(tfidf_scores)

def w2vmean(l):
    text = ' '.join(l)
    tf_idf = get_ifidf_for_words(text)
    X1 = np.zeros((300,))
    for x in l:
        if x in word2vec and x in tf_idf:
            X1 += word2vec[x] * tf_idf[x]
    return X1

X2 = []
X2_test = []
y2 = []
y2_test = []
for i in range(len(data2)):
    X2.append(w2vmean(data2['tokens'].iloc[i]))
    y2.append(data2['Sentiment'].iloc[i])
for i in range(len(data2_test)):
    X2_test.append(w2vmean(data2_test['tokens'].iloc[i]))
    y2_test.append(data2_test['Sentiment'].iloc[i])

# Model Training (considering tf-idf)
- Random Forest
- Logistic Regression
- KNN

In [34]:
# devide data to training data (80%) and validation data (20%)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X2)
scaled_X_test = scaler.fit_transform(X2_test)
pca = PCA(.95)
pca.fit(scaled_X)
new_X = pca.transform(scaled_X)
X_test = pca.transform(scaled_X_test)
encoder = LabelEncoder()
encoder.fit(y2)
scaled_y = encoder.transform(y2)
y_test = encoder.transform(y2_test)
print(encoder.classes_)
print(encoder.transform(encoder.classes_))
train, cross, label_train, label_cross = train_test_split(new_X, scaled_y, test_size=0.2, random_state=43)


#tuning hyperparameters
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

NB = MultinomialNB()
RForest = RandomForestClassifier(random_state=1, n_jobs=-1) #naive bayes
logistic = LogisticRegression(random_state=0, solver='saga',n_jobs = 1)
knn = KNeighborsClassifier()
rf_params = {'n_estimators': [50,200,500],
              'max_depth':[10, 30, 50]}
nb_params = {'alpha': [0,0.001,0.01,0.1,1,10,100]}
log_params = {'C':[0.0001,0.001,0.01,0.1,1]}
knn_params = {'n_neighbors':[3,5,7,10,15]}

def tune_param(esti, param, alg):
    eva = GridSearchCV(estimator=esti, 
                     param_grid=param, 
                     cv=5,
                     verbose=1, 
                     scoring='accuracy')
    eva.fit(cross, label_cross)
    print("Cross Validation of "+alg)
    print("Best parameters set found on development set:")
    print(eva.best_params_)
    print("Best parameters set found on development set:")
    print(eva.cv_results_['mean_test_score'])
    print("Best score: ")
    print(eva.best_score_)
    print()

#tune_param(NB, nb_params, "Navie Bayes")
tune_param(RForest, rf_params, "Random Forest")
tune_param(logistic, log_params, "logistic regression")
tune_param(knn, knn_params, "KNN")

['Extremely Negative' 'Extremely Positive' 'Negative' 'Neutral' 'Positive']
[0 1 2 3 4]
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  4.6min finished


Cross Validation of Random Forest
Best parameters set found on development set:
{'max_depth': 30, 'n_estimators': 500}
Best parameters set found on development set:
[0.35915836 0.37971297 0.38676721 0.35417173 0.39272683 0.40391632
 0.35721236 0.38749696 0.40306495]
Best score: 
0.40391632206275846

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.2min finished


Cross Validation of logistic regression
Best parameters set found on development set:
{'C': 0.01}
Best parameters set found on development set:
[0.39260521 0.45341766 0.45755291 0.45475553 0.45439066]
Best score: 
0.45755290683531985

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross Validation of KNN
Best parameters set found on development set:
{'n_neighbors': 15}
Best parameters set found on development set:
[0.331063   0.35052299 0.35964486 0.36973972 0.37837509]
Best score: 
0.37837509121868157



[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.9min finished


# Model Testing (considering tf-idf)

In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
final_RF2 = RandomForestClassifier(max_depth=30,n_estimators=500, random_state=1, n_jobs=-1)
final_log2 = LogisticRegression(random_state=0, solver='saga',n_jobs = 1, C=0.01)
final_knn2 = KNeighborsClassifier(n_neighbors=15)
def test_result(model,name):
    model.fit(train, label_train)
    y_predict = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, y_predict))
    print(classification_report(y_test, y_predict))
    print()
test_result(final_RF2,"Random Forest")
test_result(final_log2,"Logistic Regression")
test_result(final_knn2,"KNN")

Random Forest
0.4137022397891963
              precision    recall  f1-score   support

           0       0.72      0.16      0.27       592
           1       0.73      0.27      0.39       599
           2       0.40      0.38      0.39      1041
           3       0.56      0.45      0.50       616
           4       0.32      0.67      0.44       947

    accuracy                           0.41      3795
   macro avg       0.55      0.39      0.40      3795
weighted avg       0.51      0.41      0.40      3795






Logistic Regression
0.47299077733860345
              precision    recall  f1-score   support

           0       0.58      0.44      0.50       592
           1       0.59      0.54      0.56       599
           2       0.45      0.35      0.39      1041
           3       0.48      0.66      0.56       616
           4       0.39      0.46      0.42       947

    accuracy                           0.47      3795
   macro avg       0.50      0.49      0.49      3795
weighted avg       0.48      0.47      0.47      3795


KNN
0.38735177865612647
              precision    recall  f1-score   support

           0       0.44      0.31      0.37       592
           1       0.53      0.33      0.40       599
           2       0.37      0.37      0.37      1041
           3       0.38      0.62      0.47       616
           4       0.34      0.33      0.34       947

    accuracy                           0.39      3795
   macro avg       0.41      0.39      0.39      3795
weighted avg

# using self-trained word2vec 

In [37]:
import pandas as pd 
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
import numpy as np

new_model = Word2Vec.load('model.bin')

data3 = pd.read_csv("preprocess_train.csv", encoding = 'latin1') 
data3_test = pd.read_csv("preprocess_test_final.csv", encoding = 'latin1') 
data3 = data3.dropna()
data3_test = data3_test.dropna()
data3['tokens'] = data3['tokens'].apply(literal_eval)
data3_test['tokens'] = data3_test['tokens'].apply(literal_eval)

def w2vmean(l):
    X = np.zeros((300,))
    for x in l:
        if x in new_model:
            X += new_model[x]
    return X/len(l)

X3 = []
X3_test = []
y3 = []
y3_test = []
for i in range(len(data3)):
    X3.append(w2vmean(data3['tokens'].iloc[i]))
    y3.append(data3['Sentiment'].iloc[i])
for i in range(len(data3_test)):
    X3_test.append(w2vmean(data3_test['tokens'].iloc[i]))
    y3_test.append(data3_test['Sentiment'].iloc[i])
print(len(X3))
print(len(X3_test))




41106
3795


# Model Training (self-defined word2vec)
- Random Forest
- Logistic Regression
- KNN

In [38]:
# devide data to training data (80%) and validation data (20%)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X3)
scaled_X_test = scaler.fit_transform(X3_test)
pca = PCA(.95)
pca.fit(scaled_X)
new_X = pca.transform(scaled_X)
X_test = pca.transform(scaled_X_test)
encoder = LabelEncoder()
encoder.fit(y3)
scaled_y = encoder.transform(y3)
y_test = encoder.transform(y3_test)
print(encoder.classes_)
print(encoder.transform(encoder.classes_))
train, cross, label_train, label_cross = train_test_split(new_X, scaled_y, test_size=0.2, random_state=43)


#tuning hyperparameters
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

NB = MultinomialNB()
RForest = RandomForestClassifier(random_state=1, n_jobs=-1) #naive bayes
logistic = LogisticRegression(random_state=0, solver='saga',n_jobs = 1)
knn = KNeighborsClassifier()
rf_params = {'n_estimators': [50,200,500],
              'max_depth':[10, 30, 50]}
nb_params = {'alpha': [0,0.001,0.01,0.1,1,10,100]}
log_params = {'C':[0.0001,0.001,0.01,0.1,1]}
knn_params = {'n_neighbors':[3,5,7,10,15]}

def tune_param(esti, param, alg):
    eva = GridSearchCV(estimator=esti, 
                     param_grid=param, 
                     cv=5,
                     verbose=1, 
                     scoring='accuracy')
    eva.fit(cross, label_cross)
    print("Cross Validation of "+alg)
    print("Best parameters set found on development set:")
    print(eva.best_params_)
    print("Best parameters set found on development set:")
    print(eva.cv_results_['mean_test_score'])
    print("Best score: ")
    print(eva.best_score_)
    print()

#tune_param(NB, nb_params, "Navie Bayes")
tune_param(RForest, rf_params, "Random Forest")
tune_param(logistic, log_params, "logistic regression")
tune_param(knn, knn_params, "KNN")

['Extremely Negative' 'Extremely Positive' 'Negative' 'Neutral' 'Positive']
[0 1 2 3 4]
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  3.4min finished


Cross Validation of Random Forest
Best parameters set found on development set:
{'max_depth': 50, 'n_estimators': 500}
Best parameters set found on development set:
[0.37387497 0.38810508 0.39175383 0.37569934 0.40221357 0.40586232
 0.37764534 0.40245682 0.40853807]
Best score: 
0.40853806859644853

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   46.9s finished


Cross Validation of logistic regression
Best parameters set found on development set:
{'C': 1}
Best parameters set found on development set:
[0.36341523 0.40744344 0.42313306 0.42398443 0.42459256]
Best score: 
0.4245925565555826

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross Validation of KNN
Best parameters set found on development set:
{'n_neighbors': 15}
Best parameters set found on development set:
[0.29408903 0.32802238 0.32571151 0.34067137 0.34869861]
Best score: 
0.3486986134760399



[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   35.6s finished


# Model Test (self-defined model)

In [39]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
final_RF3 = RandomForestClassifier(max_depth=30,n_estimators=500, random_state=1, n_jobs=-1)
final_log3 = LogisticRegression(random_state=0, solver='saga',n_jobs = 1, C=1)
final_knn3 = KNeighborsClassifier(n_neighbors=15)
def test_result(model,name):
    model.fit(train, label_train)
    y_predict = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, y_predict))
    print(classification_report(y_test, y_predict))
    print()
test_result(final_RF3,"Random Forest")
test_result(final_log3,"Logistic Regression")
test_result(final_knn3,"KNN")

Random Forest
0.4036890645586298
              precision    recall  f1-score   support

           0       0.62      0.14      0.23       592
           1       0.63      0.25      0.36       599
           2       0.42      0.37      0.39      1041
           3       0.51      0.48      0.49       616
           4       0.32      0.65      0.43       947

    accuracy                           0.40      3795
   macro avg       0.50      0.38      0.38      3795
weighted avg       0.47      0.40      0.39      3795






Logistic Regression
0.40948616600790516
              precision    recall  f1-score   support

           0       0.49      0.27      0.35       592
           1       0.55      0.39      0.46       599
           2       0.40      0.35      0.38      1041
           3       0.43      0.58      0.49       616
           4       0.33      0.46      0.39       947

    accuracy                           0.41      3795
   macro avg       0.44      0.41      0.41      3795
weighted avg       0.43      0.41      0.41      3795


KNN
0.37760210803689065
              precision    recall  f1-score   support

           0       0.38      0.43      0.40       592
           1       0.39      0.50      0.44       599
           2       0.37      0.34      0.35      1041
           3       0.57      0.27      0.37       616
           4       0.33      0.38      0.35       947

    accuracy                           0.38      3795
   macro avg       0.41      0.38      0.38      3795
weighted avg

# Merging Categories
- negative
- positive
- neutral

In [43]:
import pandas as pd 
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
import numpy as np

data4 = pd.read_csv("preprocess_train.csv", encoding = 'latin1') 
data4_test = pd.read_csv("preprocess_test_final.csv", encoding = 'latin1') 
data4.loc[data4.Sentiment == "Extremely Negative", "Sentiment"] = "Negative"
data4.loc[data4.Sentiment == "Extremely Positive", "Sentiment"] = "Positive"
data4_test.loc[data4_test.Sentiment == "Extremely Negative", "Sentiment"] = "Negative"
data4_test.loc[data4_test.Sentiment == "Extremely Positive", "Sentiment"] = "Positive"
data4 = data4.dropna()
data4_test = data4_test.dropna()
data4['tokens'] = data4['tokens'].apply(literal_eval)
data4_test['tokens'] = data4_test['tokens'].apply(literal_eval)


corpus = []
for item in data4['tokens']:
    corpus.append(' '.join(item))

word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=10 ** 5)
#build tf-idf vector
tfidf = TfidfVectorizer(min_df=3)
tfidf.fit(corpus)
feature_names = tfidf.get_feature_names()

def get_ifidf_for_words(text):
    tfidf_matrix= tfidf.transform([text]).todense()
    feature_index = tfidf_matrix[0,:].nonzero()[1]
    tfidf_scores = zip([feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index])
    return dict(tfidf_scores)

def w2vmean(l):
    text = ' '.join(l)
    tf_idf = get_ifidf_for_words(text)
    X1 = np.zeros((300,))
    for x in l:
        if x in word2vec and x in tf_idf:
            X1 += word2vec[x] * tf_idf[x]
    return X1

X4 = []
X4_test = []
y4 = []
y4_test = []
for i in range(len(data4)):
    X4.append(w2vmean(data4['tokens'].iloc[i]))
    y4.append(data4['Sentiment'].iloc[i])
for i in range(len(data4_test)):
    X4_test.append(w2vmean(data4_test['tokens'].iloc[i]))
    y4_test.append(data4_test['Sentiment'].iloc[i])

# Model Training (considering tf-idf)
- Random Forest
- Logistic Regression
- KNN

In [44]:
# devide data to training data (80%) and validation data (20%)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X4)
scaled_X_test = scaler.fit_transform(X4_test)
pca = PCA(.95)
pca.fit(scaled_X)
new_X = pca.transform(scaled_X)
X_test = pca.transform(scaled_X_test)
encoder = LabelEncoder()
encoder.fit(y4)
scaled_y = encoder.transform(y4)
y_test = encoder.transform(y4_test)
print(encoder.classes_)
print(encoder.transform(encoder.classes_))
train, cross, label_train, label_cross = train_test_split(new_X, scaled_y, test_size=0.2, random_state=43)


#tuning hyperparameters
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

NB = MultinomialNB()
RForest = RandomForestClassifier(random_state=1, n_jobs=-1) #naive bayes
logistic = LogisticRegression(random_state=0, solver='saga',n_jobs = 1)
knn = KNeighborsClassifier()
rf_params = {'n_estimators': [50,200,500],
              'max_depth':[10, 30, 50]}
nb_params = {'alpha': [0,0.001,0.01,0.1,1,10,100]}
log_params = {'C':[0.0001,0.001,0.01,0.1,1]}
knn_params = {'n_neighbors':[3,5,7,10,15]}

def tune_param(esti, param, alg):
    eva = GridSearchCV(estimator=esti, 
                     param_grid=param, 
                     cv=5,
                     verbose=1, 
                     scoring='accuracy')
    eva.fit(cross, label_cross)
    print("Cross Validation of "+alg)
    print("Best parameters set found on development set:")
    print(eva.best_params_)
    print("Best parameters set found on development set:")
    print(eva.cv_results_['mean_test_score'])
    print("Best score: ")
    print(eva.best_score_)
    print()

#tune_param(NB, nb_params, "Navie Bayes")
tune_param(RForest, rf_params, "Random Forest")
tune_param(logistic, log_params, "logistic regression")
tune_param(knn, knn_params, "KNN")

['Negative' 'Neutral' 'Positive']
[0 1 2]
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  4.5min finished


Cross Validation of Random Forest
Best parameters set found on development set:
{'max_depth': 30, 'n_estimators': 500}
Best parameters set found on development set:
[0.57236682 0.58209681 0.58890781 0.57017757 0.59121868 0.60094867
 0.56616395 0.59085381 0.59924593]
Best score: 
0.6009486742884943

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.1min finished


Cross Validation of logistic regression
Best parameters set found on development set:
{'C': 0.1}
Best parameters set found on development set:
[0.61104354 0.65300414 0.65774751 0.65872051 0.65774751]
Best score: 
0.6587205059596205

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross Validation of KNN
Best parameters set found on development set:
{'n_neighbors': 15}
Best parameters set found on development set:
[0.54098759 0.55923133 0.57406957 0.58246169 0.59754318]
Best score: 
0.5975431768426174



[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.0min finished


# Model Test (Merging)

In [45]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
final_RF4 = RandomForestClassifier(max_depth=30,n_estimators=500, random_state=1, n_jobs=-1)
final_log4 = LogisticRegression(random_state=0, solver='saga',n_jobs = 1, C=0.1)
final_knn4 = KNeighborsClassifier(n_neighbors=15)
def test_result(model,name):
    model.fit(train, label_train)
    y_predict = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, y_predict))
    print(classification_report(y_test, y_predict))
    print()
test_result(final_RF4,"Random Forest")
test_result(final_log4,"Logistic Regression")
test_result(final_knn4,"KNN")

Random Forest
0.6200263504611331
              precision    recall  f1-score   support

           0       0.72      0.56      0.63      1633
           1       0.71      0.26      0.38       616
           2       0.56      0.83      0.67      1546

    accuracy                           0.62      3795
   macro avg       0.66      0.55      0.56      3795
weighted avg       0.65      0.62      0.60      3795






Logistic Regression
0.6671936758893281
              precision    recall  f1-score   support

           0       0.73      0.64      0.68      1633
           1       0.57      0.55      0.56       616
           2       0.65      0.75      0.70      1546

    accuracy                           0.67      3795
   macro avg       0.65      0.64      0.64      3795
weighted avg       0.67      0.67      0.67      3795


KNN
0.6005270092226614
              precision    recall  f1-score   support

           0       0.66      0.61      0.63      1633
           1       0.47      0.53      0.50       616
           2       0.61      0.62      0.61      1546

    accuracy                           0.60      3795
   macro avg       0.58      0.59      0.58      3795
weighted avg       0.60      0.60      0.60      3795


