In [18]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report as cr
import pickle

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter("ignore", FutureWarning)

# Shared 10-fold splitter reused by the cross-validation cells further down.
k_folds = KFold(n_splits=10)

In [19]:
# Read the preprocessed news dataset and keep its "text" column as a plain list.
dataset = pd.read_csv("News_dataset_preprocessed.csv")
articles = list(dataset["text"])

In [20]:
# Load the pickled CBOW word2vec vectors.
# The context manager closes the file handle as soon as loading finishes;
# the previous bare open() call left it open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle
# files that come from a trusted source.
with open("word2vec_cbow_vectors.pickle", "rb") as pickle_in:
    word_vectors_CBOW = pickle.load(pickle_in)

In [21]:
def getArticleVector(article):
    """Collapse an article into one vector by averaging its items.

    Parameters
    ----------
    article : iterable
        The items to average. Presumably one word-embedding vector per word
        of the article - TODO confirm against the pickled data.

    Returns
    -------
    The element-wise mean of the items, taken along axis 0 by ``np.mean``.
    """
    word_vectors = list(article)
    return np.mean(word_vectors, axis=0)

In [22]:
# CBOW: build one averaged feature vector per entry of word_vectors_CBOW
x = [
    getArticleVector(word_vectors_CBOW[idx])
    for idx in range(len(word_vectors_CBOW))
]

In [23]:
# Target labels: the "true" column of the loaded dataset.
y = dataset["true"]

In [24]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [25]:
# Fit a 100-tree random forest on the training split and predict the held-out split.
# random_state is pinned (same seed value as the train/test split) so that a
# Restart & Run All reproduces the same model and metrics; without it each run
# trains a differently randomised forest and the reported scores drift.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [26]:
# Hold-out accuracy, followed by the per-class precision / recall / F1 report.
score = accuracy_score(y_test, y_pred)
report = cr(y_test, y_pred)
print(score)
print(report)

0.9729398663697104
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      4709
           1       0.97      0.97      0.97      4271

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980



In [27]:
# 10-fold cross-validation - CBOW
# (folds come from the shared k_folds splitter defined with the imports)
scores = cross_val_score(estimator=rfc, X=x, y=y, cv=k_folds)

In [28]:
# 10-fold cross-validation results - CBOW
# Show the per-fold scores first, then their mean.
avg_cv_score = scores.mean()
print("CV scores: ", scores)
print("Avg CV score: ", avg_cv_score)

CV scores:  [0.97505568 0.97438753 0.97550111 0.97728285 0.97305122 0.97238307
 0.97750557 0.97639198 0.97393629 0.97527289]
Avg CV score:  0.9750768198035187


In [29]:
# Load the pickled Skip-Gram word2vec vectors.
# The context manager closes the file handle as soon as loading finishes;
# the previous bare open() call left it open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code - only load pickle
# files that come from a trusted source.
with open("word2vec_skip-gram_vectors.pickle", "rb") as pickle_in:
    word_vectors_Skip_Gram = pickle.load(pickle_in)

In [30]:
# Skip-Gram: build one averaged feature vector per entry of word_vectors_Skip_Gram
x = [
    getArticleVector(word_vectors_Skip_Gram[idx])
    for idx in range(len(word_vectors_Skip_Gram))
]

In [31]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [32]:
# Fit a 100-tree random forest on the training split and predict the held-out split.
# random_state is pinned (same seed value as the train/test split) so that a
# Restart & Run All reproduces the same model and metrics; without it each run
# trains a differently randomised forest and the reported scores drift.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [33]:
# Hold-out accuracy, followed by the per-class precision / recall / F1 report.
score = accuracy_score(y_test, y_pred)
report = cr(y_test, y_pred)
print(score)
print(report)

0.9728285077951002
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      4709
           1       0.97      0.97      0.97      4271

    accuracy                           0.97      8980
   macro avg       0.97      0.97      0.97      8980
weighted avg       0.97      0.97      0.97      8980



In [34]:
# 10-fold cross-validation - Skip-Gram
# (folds come from the shared k_folds splitter defined with the imports)
scores = cross_val_score(estimator=rfc, X=x, y=y, cv=k_folds)

In [35]:
# 10-fold cross-validation results - Skip-Gram
# Show the per-fold scores first, then their mean.
avg_cv_score = scores.mean()
print("CV scores: ", scores)
print("Avg CV score: ", avg_cv_score)

CV scores:  [0.97349666 0.97572383 0.9752784  0.97750557 0.97483296 0.97305122
 0.97394209 0.97750557 0.97727779 0.97237692]
Avg CV score:  0.9750991014412366
