<a href="https://colab.research.google.com/github/KosukhaOlexandr/reactions_prediction/blob/main/reaction_prediction_lemmatized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive/ (needs the Colab runtime); the data
# files read below live under its MyDrive folder.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import pandas as pd

# All input files live in one Drive folder; keep the path in a single place.
DATA_DIR = '/content/drive/MyDrive/reaction_prediction'

# Let pandas open the files itself and pin the encoding: the text is Cyrillic,
# and a bare open() would decode it with whatever the platform locale says.
data = pd.read_csv(DATA_DIR + '/concatenated_pos_neg.csv', index_col=0, encoding='utf-8')
stopwords = pd.read_table(DATA_DIR + '/stopwords.txt', header=None, encoding='utf-8')
lemma_table = pd.read_csv(DATA_DIR + '/word2lem.csv', index_col=0, encoding='utf-8')

# Plain dict used by the lemmatizer: the CSV index is the lookup key and
# column '0' holds the value it maps to.
word_to_lemma = lemma_table['0'].to_dict()

In [None]:
# Preview the first rows of the frame before lemmatization.
data.head()

Unnamed: 0,msg_text,reaction_type
0,чий борщ у п’ятницю липня юнеско вирішить долю...,1
1,зруйнований міст у демидові на київщині планую...,1
2,безперечно головне слово сьогодні зміїний мабу...,1
3,прокремлівське видання общественная служба нов...,1
4,продовжується відвантаження боєприпасів продов...,1


In [None]:
def lemmatize(text, lemma_map=None, unknown=''):
  """Replace every whitespace-separated word of `text` with its lemma.

  Args:
    text: string to lemmatize; it is tokenized with `str.split()`.
    lemma_map: optional `{word: lemma}` dict. When omitted, the notebook-level
      `word_to_lemma` is used, so `lemmatize(text)` behaves exactly as before.
    unknown: replacement for words missing from the map. The default `''`
      keeps the original behaviour: the word is blanked out, which leaves
      consecutive spaces in the result.

  Returns:
    The replacements joined by single spaces.
  """
  if lemma_map is None:
    lemma_map = word_to_lemma
  return ' '.join(lemma_map.get(w, unknown) for w in text.split())

In [None]:
# Keep the untouched text in `msg_text_raw` the first time this cell runs and
# always lemmatize from that copy. Re-running the cell is then harmless;
# feeding already-lemmatized text back through `lemmatize` would blank out
# every lemma that is not itself a key of the word->lemma map.
if 'msg_text_raw' not in data.columns:
  data['msg_text_raw'] = data['msg_text']
data['msg_text'] = data['msg_text_raw'].apply(lemmatize)

In [None]:
# Same preview after lemmatization, for comparison with the earlier one.
data.head()

Unnamed: 0,msg_text,reaction_type
0,чий борщ у п’ятниця липень юнеско вирішити дол...,1
1,зруйнований місто у демид на київщина плануват...,1
2,безперечно головне слово сьогодні зміїний мабу...,1
3,прокремлівський видання общественная служба но...,1
4,продовжуватися відвантаження боєприпас продовж...,1


## Naive Bayes

### Count Vectorizer

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Target labels and bag-of-words term counts for every message.
Y = data['reaction_type']

count_vec = CountVectorizer()
bow = count_vec.fit_transform(data['msg_text'])
X = bow

In [None]:
from sklearn.model_selection import train_test_split
# Split the raw text first and learn the vocabulary from the training part
# only. Fitting the vectorizer on the full corpus before splitting (as `bow`
# above does) lets test-set words leak into the feature space.
# test_size / stratify / random_state are unchanged, so the rows that land in
# each split are the same as before.
text_train_cv, text_test_cv, Y_train_cv, Y_test_cv = train_test_split(
    data.msg_text, data.reaction_type,
    test_size=0.20, stratify=data.reaction_type, random_state=42)

count_vec = CountVectorizer()
X_train_cv = count_vec.fit_transform(text_train_cv)
X_test_cv = count_vec.transform(text_test_cv)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes on the count features (smoothing alpha = 0.01).
# The trailing semicolon keeps the cell silent, as the one-liner was.
nb_cv = MultinomialNB(alpha=0.01)
nb_cv.fit(X_train_cv, Y_train_cv);

In [None]:
# Predict the test split first, then the train split, so both can be scored.
Y_pred_test, Y_pred_train = map(nb_cv.predict, (X_test_cv, X_train_cv))

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on each split, from the predictions stored in Y_pred_test / Y_pred_train.
print('Test set accuracy:', accuracy_score(y_true=Y_test_cv, y_pred=Y_pred_test))
print('Train set accuracy:', accuracy_score(y_true=Y_train_cv, y_pred=Y_pred_train))

Test set accuracy: 0.8003779825183085
Train set accuracy: 0.868786297863963


In [None]:
from sklearn.metrics import f1_score

# F1 score on each split, from the predictions stored in Y_pred_test / Y_pred_train.
print('Test set F1 score:', f1_score(y_true=Y_test_cv, y_pred=Y_pred_test))
print('Train set F1 score:', f1_score(y_true=Y_train_cv, y_pred=Y_pred_train))

Test set F1 score: 0.8253771440380244
Train set F1 score: 0.8845587598510435


### TF-IDF Vectorizer

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features for every message. The bare name on the last line makes the
# notebook display the resulting matrix.
tf_idf_vec = TfidfVectorizer()
bow_ti = tf_idf_vec.fit_transform(data['msg_text'])
bow_ti

<63494x60394 sparse matrix of type '<class 'numpy.float64'>'
	with 2354288 stored elements in Compressed Sparse Row format>

In [None]:
# Split the raw text first, then fit TF-IDF on the training part only: fitting
# on the full corpus (as `bow_ti` above does) bakes test-set vocabulary and
# document frequencies into the features the model is evaluated on.
# test_size / stratify / random_state are unchanged, so each split holds the
# same rows as before.
X = bow_ti  # corpus-wide matrix, kept under its old name for any later use
Y = data.reaction_type

text_train_ti, text_test_ti, Y_train_ti, Y_test_ti = train_test_split(
    data.msg_text, Y, test_size=0.20, stratify=Y, random_state=42)

tf_idf_vec = TfidfVectorizer()
X_train_ti = tf_idf_vec.fit_transform(text_train_ti)
X_test_ti = tf_idf_vec.transform(text_test_ti)

# Multinomial Naive Bayes on the TF-IDF features (smoothing alpha = 0.01).
nb_tf_idf = MultinomialNB(alpha=0.01).fit(X_train_ti, Y_train_ti)

In [None]:
# Predict the test split first, then the train split, so both can be scored.
Y_pred_test, Y_pred_train = map(nb_tf_idf.predict, (X_test_ti, X_train_ti))

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on each TF-IDF split, from the predictions stored in Y_pred_test / Y_pred_train.
print('Test set accuracy:', accuracy_score(y_true=Y_test_ti, y_pred=Y_pred_test))
print('Train set accuracy:', accuracy_score(y_true=Y_train_ti, y_pred=Y_pred_train))

Test set accuracy: 0.8143160878809355
Train set accuracy: 0.8809725366669948


In [None]:
from sklearn.metrics import f1_score

# F1 score on each TF-IDF split, from the predictions stored in Y_pred_test / Y_pred_train.
print('Test set F1 score:', f1_score(y_true=Y_test_ti, y_pred=Y_pred_test))
print('Train set F1 score:', f1_score(y_true=Y_train_ti, y_pred=Y_pred_train))

Test set F1 score: 0.8468035343035343
Train set F1 score: 0.9000628119937849


## SVC

### TF-IDF

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

Y = data['reaction_type']
X = bow_ti

# Scale each feature without centering (with_mean=False), then fit an SVC on
# the TF-IDF training split. The bare fit call stays last so the notebook
# still displays the fitted pipeline.
svc_tf_idf = make_pipeline(
    StandardScaler(with_mean=False),
    SVC(C=1, gamma='auto'),
)
svc_tf_idf.fit(X_train_ti, Y_train_ti)

In [None]:
# Predict the test split first, then the train split, so both can be scored.
Y_pred_test, Y_pred_train = map(svc_tf_idf.predict, (X_test_ti, X_train_ti))

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on each TF-IDF split, from the predictions stored in Y_pred_test / Y_pred_train.
print('Test set accuracy:', accuracy_score(y_true=Y_test_ti, y_pred=Y_pred_test))
print('Train set accuracy:', accuracy_score(y_true=Y_train_ti, y_pred=Y_pred_train))

Test set accuracy: 0.8210882746672966
Train set accuracy: 0.9543262132099616


In [None]:
from sklearn.metrics import f1_score

# F1 score on each TF-IDF split, from the predictions stored in Y_pred_test / Y_pred_train.
print('Test set F1 score:', f1_score(y_true=Y_test_ti, y_pred=Y_pred_test))
print('Train set F1 score:', f1_score(y_true=Y_train_ti, y_pred=Y_pred_train))

Test set F1 score: 0.8586361373817819
Train set F1 score: 0.9611598473180205


### Count Vectorizer

In [None]:
# Same scaler + SVC pipeline as the TF-IDF variant, trained on the count
# features. The bare fit call stays last so the notebook still displays the
# fitted pipeline.
svc_cv = make_pipeline(
    StandardScaler(with_mean=False),
    SVC(C=1, gamma='auto'),
)
svc_cv.fit(X_train_cv, Y_train_cv)

In [None]:
# Predict the test split first, then the train split, so both can be scored.
Y_pred_test, Y_pred_train = map(svc_cv.predict, (X_test_cv, X_train_cv))

In [None]:
# Accuracy, then F1, on each count-vectorizer split, from the predictions
# stored in Y_pred_test / Y_pred_train.
print('Test set accuracy:', accuracy_score(y_true=Y_test_cv, y_pred=Y_pred_test))
print('Train set accuracy:', accuracy_score(y_true=Y_train_cv, y_pred=Y_pred_train))

print('Test set F1 score:', f1_score(y_true=Y_test_cv, y_pred=Y_pred_test))
print('Train set F1 score:', f1_score(y_true=Y_train_cv, y_pred=Y_pred_train))

Test set accuracy: 0.8236869044806677
Train set accuracy: 0.941706860911507
Test set F1 score: 0.8581924124390399
Train set F1 score: 0.950239475674313
