In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
import pickle

# Silence FutureWarning messages raised by library calls in the later cells.
warnings.simplefilter("ignore", category=FutureWarning)
# Ten-way split shared by every cross-validation cell; shuffling is left at
# KFold's default (off), so the folds are consecutive slices of the data.
k_folds = KFold(n_splits=10)

In [2]:
# Read the preprocessed news CSV and copy its "text" column into a plain list.
# `articles` is not referenced again in the visible cells.
dataset = pd.read_csv("News_dataset_preprocessed.csv")
articles = list(dataset["text"])

In [3]:
# Load the pickled fastText vectors. The `with` block closes the file handle as
# soon as unpickling finishes; previously the handle was opened and never closed.
# The loaded object is later indexed by position and each entry is averaged
# along axis 0; its exact structure is not visible in this notebook.
# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only load pickle files that come from a trusted source.
with open("fasttext_vectors.pickle", "rb") as pickle_in:
    word_vectors_fastText = pickle.load(pickle_in)

In [4]:
def getArticleVector(article):
    """Return the element-wise mean of the items in ``article``.

    ``article`` is any iterable of equally-shaped numeric vectors; its items
    are collected into a list and averaged along axis 0 with ``np.mean``.

    NOTE(review): an empty ``article`` makes ``np.mean`` return ``nan`` (with
    a RuntimeWarning) instead of a vector -- confirm no article is empty.
    """
    word_vectors = list(article)
    return np.mean(word_vectors, axis=0)

In [5]:
# Feature matrix: one averaged vector per entry of word_vectors_fastText,
# looked up by position so the order matches the original container.
x = [
    getArticleVector(word_vectors_fastText[idx])
    for idx in range(len(word_vectors_fastText))
]

In [6]:
# Target values handed to cross_val_score: the "true" column of the dataset.
# NOTE(review): presumably a real-vs-fake label -- confirm its encoding.
y = dataset["true"]

In [7]:
# Random forest with 100 trees. random_state is pinned so that repeated runs,
# and the separate cross-validation cells that each refit this estimator,
# build the same forests and report reproducible scores. Without a seed every
# fit draws fresh randomness and the metrics are not comparable across cells.
rfc = RandomForestClassifier(n_estimators = 100, random_state = 42)

In [8]:
# Accuracy of the random forest, scored once per fold of k_folds.
scores = cross_val_score(estimator=rfc, X=x, y=y, scoring="accuracy", cv=k_folds)
print("CV accuracy scores: ", scores)
print("Average CV accuracy score: ", np.mean(scores))

CV accuracy scores:  [0.97193764 0.97282851 0.97371938 0.97461024 0.97171492 0.97126949
 0.97394209 0.97594655 0.97282245 0.97282245]
Average CV accuracy score:  0.973161372937857


In [9]:
# Weighted precision of the random forest, scored once per fold of k_folds.
scores = cross_val_score(estimator=rfc, X=x, y=y, scoring="precision_weighted", cv=k_folds)
print("CV precision scores: ", scores)
print("Average CV precision score: ", np.mean(scores))

CV precision scores:  [0.97131719 0.97194292 0.9740746  0.97464605 0.9688611  0.96883934
 0.97399127 0.97528198 0.9732827  0.97335561]
Average CV precision score:  0.9725592763826031


In [10]:
# Weighted recall of the random forest, scored once per fold of k_folds.
scores = cross_val_score(estimator=rfc, X=x, y=y, scoring="recall_weighted", cv=k_folds)
print("CV recall scores: ", scores)
print("Average CV recall score: ", np.mean(scores))

CV recall scores:  [0.97260579 0.97260579 0.97461024 0.97438753 0.97126949 0.9714922
 0.97260579 0.97438753 0.97304522 0.97393629]
Average CV recall score:  0.9730945875614779


In [11]:
# Weighted F1 of the random forest, scored once per fold of k_folds.
scores = cross_val_score(estimator=rfc, X=x, y=y, scoring="f1_weighted", cv=k_folds)
print("CV f1 scores: ", scores)
print("Average CV f1 score: ", np.mean(scores))

CV f1 scores:  [0.97393362 0.97126467 0.97236989 0.97593793 0.97059596 0.96880388
 0.97594293 0.97661221 0.97036453 0.9725867 ]
Average CV f1 score:  0.9728412314403082
