In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset: a CSV file shared on Google Drive, read directly into a DataFrame.

file_id = '1soU7OGgejkFmz6vIxEXAGJAVg-1xeN0A'
# Build the download URL from file_id so the Drive ID is defined in exactly one place.
url = f'https://drive.google.com/uc?id={file_id}'

df = pd.read_csv(url)  # requires network access; raises if the file cannot be fetched or parsed
print("Data loaded successfully from Google Drive!")

print(df.head())
df.shape  # NOTE: a bare expression in the middle of a cell is not displayed; only the cell's last expression is rendered

labels=df.label  # the 'label' column is the classification target
labels.head()

Data loaded successfully from Google Drive!
   Unnamed: 0                                              title  \
0        8476                       You Can Smell Hillary’s Fear   
1       10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2        3608        Kerry to go to Paris in gesture of sympathy   
3       10142  Bernie supporters on Twitter erupt in anger ag...   
4         875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  


Unnamed: 0,label
0,FAKE
1,FAKE
2,REAL
3,FAKE
4,REAL


In [2]:
# Split the data into training and testing sets.
#   test_size=0.3    -> 30% of the rows are held back for testing
#   random_state=45  -> fixed seed, so the same split is produced on every run
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.3,
    random_state=45,
)

In [None]:
# Set up the TF-IDF vectorizer.
#   stop_words='english' -> ignore common English words
#   max_df=0.7           -> ignore terms that appear in more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# Learn the vocabulary and IDF weights from the training text only, and vectorize it.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)

# Reuse the fitted vocabulary/IDF to convert the test text into TF-IDF vectors.
tfidf_test = tfidf_vectorizer.transform(x_test)

In [None]:
# Train a Passive-Aggressive classifier on the TF-IDF features and score it on the test set.
#   max_iter=50      -> maximum number of passes over the training data
#   random_state=45  -> the classifier shuffles the training data between passes by default;
#                       seeding it (same seed as the train/test split) makes the fitted model,
#                       and therefore the reported accuracy, reproducible across runs
pas_ag_cls=PassiveAggressiveClassifier(max_iter=50, random_state=45)
pas_ag_cls.fit(tfidf_train,y_train) # fit the classifier to the training features and labels

y_pred=pas_ag_cls.predict(tfidf_test) # predict class labels for the held-out test features
model_accuracy=accuracy_score(y_test,y_pred) # fraction of test samples predicted correctly
print(f'Accuracy: {round(model_accuracy*100,2)}%')

Accuracy: 94.84%


In [None]:
# Confusion matrix for the test-set predictions.
# Rows are the true classes and columns the predicted classes, both ordered FAKE, REAL.
confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=['FAKE','REAL'],
)

array([[904,  42],
       [ 56, 899]])