In [1]:
%run feature_extraction.ipynb

1.6597412693274927e-110
4.2552940390803975e-123


In [2]:
import tqdm
import torch
import numpy as np
import pandas as pd
from numpy import mean, std
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, GridSearchCV

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def edit_categories(x):
    if x == 'info_news':
        return 0
    elif x == 'celebrity':
        return 1
    elif x == 'plan':
        return 2
    elif x == 'requests':
        return 3
    elif x == 'rumors':
        return 4
    elif x == 'advice':
        return 5
    elif x == 'restrictions':
        return 6
    elif x == 'personal':
        return 7
    elif x == 'unrelated':
        return 8
    elif x == 'others':
        return 9
    else:
        return -1

In [4]:
train_1=pd.read_pickle('output/train_3_original.pkl')
dev_1=pd.read_pickle('output/dev_1_original.pkl')

In [5]:
sentences_train, stance_train, category_train = train_1.loc[:,"tokens"],train_1.loc[:,"stance"],train_1.loc[:,"category"]
sentences_dev, stance_dev, category_dev = dev_1.loc[:,"tokens"],dev_1.loc[:,"stance"],dev_1.loc[:,"category"]

In [6]:
# Get CBOW & Skip Gram Features for training and dev
X_train_cbow_w2v, X_train_sg_w2v = get_word_embeddings_features(sentences_train)
X_dev_cbow_w2v, X_dev_sg_w2v = get_word_embeddings_features(sentences_dev)

In [7]:
min_x_train_cbow =abs(np.min(X_train_cbow_w2v))
min_x_dev_cbow= abs(np.min(X_dev_cbow_w2v))
min_x_train_sg = abs(np.min(X_train_sg_w2v))
min_x_dev_sg = abs(np.min(X_dev_sg_w2v))

In [9]:
feature_train=[]
feature_dev=[]
for i in range(0, len(sentences_train)):
  
    new_feature= X_train_cbow_w2v[i]+min_x_train_cbow
    new_feature = np.concatenate((new_feature,X_train_sg_w2v[i]+min_x_train_sg))
    feature_train.append(new_feature)
for i in range(0,len(sentences_dev)):
    new_feature= X_dev_cbow_w2v[i]+min_x_dev_cbow
    new_feature= np.concatenate((new_feature,X_dev_sg_w2v[i]+min_x_dev_sg))
    feature_dev.append(new_feature)

In [10]:
# x_train = X_train_cbow_w2v + X_train_sg_w2v
# x_dev = X_dev_cbow_w2v + X_dev_sg_w2v 
# here is handled for Naive Bayes
x_train =feature_train
x_dev = feature_dev

In [11]:
#Fitting The Stance Detection Model using Logistic Regression (W2v)
lr_w2v=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_w2v.fit(x_train, stance_train)  #model

#Predict y value for test dataset
stance_predict = lr_w2v.predict(x_dev)
stance_prob = lr_w2v.predict_proba(x_dev)[:,1]

#Fitting The Classification Model using Logistic Regression (W2v)
lr_w2v2=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_w2v2.fit(x_train, category_train)  #model

#Predict y value for test dataset
categ_predict = lr_w2v2.predict(x_dev)
categ_prob = lr_w2v2.predict_proba(x_dev)[:,1]

In [12]:
print(metrics.classification_report(stance_predict,stance_dev))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         0
           2       1.00      0.80      0.89       999

    accuracy                           0.80      1000
   macro avg       0.33      0.27      0.30      1000
weighted avg       1.00      0.80      0.89      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
print(metrics.classification_report(categ_predict,category_dev))

              precision    recall  f1-score   support

      advice       0.00      0.00      0.00         0
   celebrity       0.00      0.00      0.00         0
   info_news       1.00      0.55      0.71      1000
      others       0.00      0.00      0.00         0
    personal       0.00      0.00      0.00         0
        plan       0.00      0.00      0.00         0
    requests       0.00      0.00      0.00         0
restrictions       0.00      0.00      0.00         0
      rumors       0.00      0.00      0.00         0
   unrelated       0.00      0.00      0.00         0

    accuracy                           0.55      1000
   macro avg       0.10      0.05      0.07      1000
weighted avg       1.00      0.55      0.71      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Create model(naive bayes) and training. 
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(feature_train, stance_train)

In [16]:
predicted = clf.predict(feature_dev)
# Model Accuracy, how often is the classifier correct?
train_score = accuracy_score(stance_train,clf.predict(feature_train))
print("train accuarcy", train_score)
print("Accuracy:",metrics.accuracy_score(stance_dev, predicted)*100)
print(len(predicted[stance_dev==0]))
print(len(predicted[stance_dev==1]))
print(len(predicted[stance_dev==2]))

train accuarcy 0.52
Accuracy: 17.1
70
126
804


In [17]:
print(metrics.classification_report(stance_dev,predicted))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        70
           1       0.13      0.97      0.23       126
           2       0.88      0.06      0.11       804

    accuracy                           0.17      1000
   macro avg       0.33      0.34      0.11      1000
weighted avg       0.72      0.17      0.12      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Create model(naive bayes) and training. 
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(feature_train, category_train)

In [19]:
predicted = clf.predict(feature_dev)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(category_dev, predicted)*100)
train_score = accuracy_score(category_train,clf.predict(feature_train))
print("train accuarcy", train_score)

Accuracy: 54.50000000000001
train accuarcy 0.497


In [20]:
print(metrics.classification_report(category_dev,predicted))

              precision    recall  f1-score   support

      advice       0.00      0.00      0.00        10
   celebrity       0.00      0.00      0.00       145
   info_news       0.55      1.00      0.71       545
      others       0.00      0.00      0.00        17
    personal       0.00      0.00      0.00       128
        plan       0.00      0.00      0.00        82
    requests       0.00      0.00      0.00        20
restrictions       0.00      0.00      0.00         2
      rumors       0.00      0.00      0.00        15
   unrelated       0.00      0.00      0.00        36

    accuracy                           0.55      1000
   macro avg       0.05      0.10      0.07      1000
weighted avg       0.30      0.55      0.38      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
