In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("News_dataset_preprocessed.csv")
articles = [w for w in dataset["text"]]

In [3]:
pickle_in = open("word2vec_cbow_vectors.pickle", "rb")
word_vectors_CBOW = pickle.load(pickle_in)

In [4]:
def getArticleVector(article):
    return np.mean([word for word in article], axis = 0)

In [5]:
# CBOW
x = []
for i in range(len(word_vectors_CBOW)):
    x.append(getArticleVector(word_vectors_CBOW[i]))

In [6]:
y = dataset["true"]

In [7]:
rfc = RandomForestClassifier(n_estimators = 100)

In [8]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.97438753 0.97371938 0.97461024 0.97728285 0.97260579 0.97171492
 0.97616927 0.97639198 0.97371352 0.97549566]
Average CV accuracy score:  0.9746091137901557


In [9]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.97400993 0.97193639 0.97513293 0.97621792 0.97197299 0.97020578
 0.97617373 0.97350272 0.97374646 0.97553635]
Average CV precision score:  0.973843521344044


In [10]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.9752784  0.97171492 0.97438753 0.97706013 0.97349666 0.97104677
 0.9752784  0.97349666 0.97460459 0.97660949]
Average CV recall score:  0.9742973544338275


In [11]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.97527441 0.97304671 0.97526851 0.97816895 0.97171115 0.97371309
 0.97572041 0.97505657 0.97415009 0.97704883]
Average CV f1 score:  0.974915871012038


In [12]:
pickle_in = open("word2vec_skip-gram_vectors.pickle", "rb")
word_vectors_Skip_Gram = pickle.load(pickle_in)

In [13]:
# Skip-Gram
x = []
for i in range(len(word_vectors_Skip_Gram)):
    x.append(getArticleVector(word_vectors_Skip_Gram[i]))

In [14]:
rfc = RandomForestClassifier(n_estimators = 100)

In [15]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.97327394 0.97461024 0.97594655 0.97839644 0.97594655 0.97327394
 0.97728285 0.97928731 0.97750056 0.97215415]
Average CV accuracy score:  0.9757672528888979


In [16]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.9753054  0.97617169 0.97783193 0.97800067 0.97507896 0.97195939
 0.97728895 0.97840386 0.97574247 0.97351922]
Average CV precision score:  0.9759302545035917


In [17]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.97394209 0.97438753 0.97639198 0.9766147  0.97639198 0.9714922
 0.97728285 0.97951002 0.97616396 0.97371352]
Average CV recall score:  0.9755890841309194


In [18]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.97304636 0.97616897 0.97549094 0.97616584 0.97638994 0.97371141
 0.97661182 0.97773021 0.97660708 0.97281911]
Average CV f1 score:  0.975474169437763
