In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

warnings.simplefilter(action = 'ignore', category = FutureWarning)
k_folds = KFold(n_splits = 10)

In [2]:
dataset = pd.read_csv("News_dataset_preprocessed.csv")
dataset.drop(dataset.tail(39898).index, inplace = True)
articles = [w for w in dataset["text"]]

In [3]:
pickle_in = open("word2vec_cbow_vectors.pickle", "rb")
word_vectors_CBOW = pickle.load(pickle_in)
del word_vectors_CBOW[-39898:]

In [4]:
def getArticleVector(article):
    return np.mean([word for word in article], axis = 0)

In [5]:
# CBOW
x = []
for i in range(len(word_vectors_CBOW)):
    x.append(getArticleVector(word_vectors_CBOW[i]))

In [6]:
y = dataset["true"]

In [7]:
rfc = RandomForestClassifier(n_estimators = 100)

In [8]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.946 0.952 0.962 0.954 0.954 0.956 0.946 0.954 0.938 0.964]
Average CV accuracy score:  0.9526


In [9]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.94440374 0.94627023 0.95285443 0.9500032  0.95213415 0.96201718
 0.9484855  0.952      0.94599502 0.95270206]
Average CV precision score:  0.9506865518563755


In [10]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.944 0.95  0.96  0.954 0.946 0.958 0.944 0.946 0.94  0.96 ]
Average CV recall score:  0.9501999999999999


In [11]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.94806046 0.94801675 0.96202881 0.95200307 0.94999099 0.96200473
 0.94599806 0.95199693 0.93998555 0.94797669]
Average CV f1 score:  0.9508062032760389


In [12]:
pickle_in = open("word2vec_skip-gram_vectors.pickle", "rb")
word_vectors_Skip_Gram = pickle.load(pickle_in)
del word_vectors_Skip_Gram[-39898:]

In [13]:
# Skip-Gram
x = []
for i in range(len(word_vectors_Skip_Gram)):
    x.append(getArticleVector(word_vectors_Skip_Gram[i]))

In [14]:
rfc = RandomForestClassifier(n_estimators = 100)

In [15]:
# 10-fold cross-validation - accuracy
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'accuracy')
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", scores.mean())

CV accuracy scores:  [0.942 0.956 0.96  0.958 0.95  0.962 0.938 0.942 0.946 0.954]
Average CV accuracy score:  0.9508000000000001


In [16]:
# 10-fold cross-validation - precision
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'precision_weighted')
print("CV precision scores: ", scores)
print("Average CV precision score: ", scores.mean())

CV precision scores:  [0.94       0.94611927 0.95539063 0.94802037 0.944      0.96201718
 0.94660936 0.95035308 0.94199406 0.96411196]
Average CV precision score:  0.9498615911087182


In [17]:
# 10-fold cross-validation - recall
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'recall_weighted')
print("CV recall scores: ", scores)
print("Average CV recall score: ", scores.mean())

CV recall scores:  [0.944 0.944 0.966 0.956 0.948 0.958 0.938 0.942 0.938 0.956]
Average CV recall score:  0.9489999999999998


In [18]:
# 10-fold cross-validation - f1-score
scores = cross_val_score(rfc, x, y, cv = k_folds, scoring = 'f1_weighted')
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", scores.mean())

CV f1 scores:  [0.94205739 0.94600892 0.96203429 0.95799344 0.94800666 0.96201327
 0.936      0.93799777 0.944      0.96199164]
Average CV f1 score:  0.949810336662944
