#Imports

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

#Load Data

In [None]:
dataset = pd.read_csv('news.csv')

#Inspecting data

In [None]:
if dataset.isnull().sum().sum():
  print("Missing values found.")
else:
  print("No missing entries in dataset.")

No missing entries in dataset.


In [None]:
Text = dataset['text']
Labels = dataset['label']

#Train-test-split

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(Text, Labels, test_size=0.2, random_state=7)

#Initialising a TF-IDF Vectoriser

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import  stopwords
stopwordslist = set(stopwords.words('english'))#removes negative n't words from the list so they are kept in the corpus
neg = set(['not'])
for word in stopwordslist:
    if 'n\'t' in word:
        neg.add(word)
        neg.add(word[:-2])

stopwordslist = stopwordslist - neg

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
TFIDF = TfidfVectorizer(stop_words=stopwordslist, max_df=0.7)#max_df will ignore terms with a document frequency higher than that threshold set
'''
look into adjusting the stop words by removing the negative ones as with the data science course.
'''
TFIDF_train = TFIDF.fit_transform(X_train)
TFIDF_test = TFIDF.transform(X_test)

#Initialising a PassiveAggressive Classifier

Overview of a passive aggressive classifier:
Learns on the fly from a large dataset of text/documents.
The amount of information is too vast to store and so it learns from it then discards it, i.e. learning from tweets -> updated step-by-step rather than batch learning.

Passive: If the prediction is correct, keep the model and do not make any changes. i.e., the data in the example is not enough to cause any changes in the model. 
Aggressive: If the prediction is incorrect, make changes to the model. i.e., some change to the model may correct it.

In [None]:
PAC = PassiveAggressiveClassifier(max_iter=100, early_stopping=True, n_iter_no_change=5)#max_iter is akin to maximum number of training epochs
'''
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html
Several parameters may improve model performance.
'''
PAC.fit(TFIDF_train,Y_train)

PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
                            early_stopping=True, fit_intercept=True,
                            loss='hinge', max_iter=100, n_iter_no_change=5,
                            n_jobs=None, random_state=None, shuffle=True,
                            tol=0.001, validation_fraction=0.1, verbose=0,
                            warm_start=False)

#Classifier predictions and score

In [None]:
Y_pred = PAC.predict(TFIDF_test)
score = accuracy_score(Y_test,Y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.98%


#Confusion Matrix

In [None]:
confusion_matrix(Y_test,Y_pred, labels=['FAKE','REAL'])

array([[596,  42],
       [ 47, 582]])