In [1]:
import pandas as pd
import numpy as np
import joblib 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier,LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


In [3]:
df=pd.read_csv('news.csv', index_col=None)

In [5]:
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [7]:
dataset = df.drop('Unnamed: 0',axis=1)

In [9]:
dataset

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [11]:
y=dataset["label"]

In [13]:
x_train, x_test, y_train, y_test = train_test_split(dataset['text'], y, test_size=0.33, random_state=53)

### Hash Vector

In [16]:
hash_vectorizer = HashingVectorizer(n_features = 2000,alternate_sign=False)  # Only positive counts
hash_train = hash_vectorizer.fit_transform(x_train)
print(hash_train)
hash_test = hash_vectorizer.transform(x_test)

  (1, 2)	0.15476464650682736
  (1, 45)	0.07738232325341368
  (1, 61)	0.15476464650682736
  (1, 70)	0.07738232325341368
  (1, 286)	0.07738232325341368
  (1, 323)	0.07738232325341368
  (1, 345)	0.07738232325341368
  (1, 410)	0.07738232325341368
  (1, 462)	0.07738232325341368
  (1, 510)	0.15476464650682736
  (1, 579)	0.15476464650682736
  (1, 689)	0.3095292930136547
  (1, 752)	0.07738232325341368
  (1, 797)	0.07738232325341368
  (1, 898)	0.15476464650682736
  (1, 953)	0.07738232325341368
  (1, 958)	0.5416762627738958
  (1, 976)	0.15476464650682736
  (1, 982)	0.07738232325341368
  (1, 1048)	0.15476464650682736
  (1, 1075)	0.15476464650682736
  (1, 1082)	0.07738232325341368
  (1, 1111)	0.07738232325341368
  (1, 1196)	0.15476464650682736
  (1, 1197)	0.3095292930136547
  :	:
  (4243, 1799)	0.014723405481714223
  (4243, 1810)	0.014723405481714223
  (4243, 1811)	0.014723405481714223
  (4243, 1812)	0.04417021644514267
  (4243, 1820)	0.014723405481714223
  (4243, 1828)	0.014723405481714223
  (424

In [17]:
print(hash_train.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.15476465 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.02534286 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


### Train ML models

In [21]:
# Dictionary to store model results
results = {}

# Multinomial Naive Bayes
MNB_classifier = MultinomialNB()
MNB_classifier.fit(hash_train, y_train)
MNB_pred = MNB_classifier.predict(hash_test)
MNB_accuracy = metrics.accuracy_score(y_test, MNB_pred)
print(f"MultinomialNB Classifier Accuracy: {MNB_accuracy}")
#confusion matrix
cm_logistic = metrics.confusion_matrix(y_test, MNB_pred, labels=['FAKE', 'REAL'])
results["MultinomialNB"] = (MNB_accuracy, MNB_classifier)

# Passive Aggressive
PAC_classifier = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
PAC_classifier.fit(hash_train, y_train)
PAC_pred = PAC_classifier.predict(hash_test)
PAC_accuracy = metrics.accuracy_score(y_test, PAC_pred)
print(f"Passive Aggressive Classifier Accuracy: {PAC_accuracy}")
#confusion matrix
cm_logistic = metrics.confusion_matrix(y_test, PAC_pred, labels=['FAKE', 'REAL'])
results["PassiveAggressive"] = (PAC_accuracy, PAC_classifier)

# Logistic Regression
logisticReg_classifier = LogisticRegression()
logisticReg_classifier.fit(hash_train, y_train)
logistic_pred = logisticReg_classifier.predict(hash_test)
logistic_accuracy = metrics.accuracy_score(y_test, logistic_pred)
print(f"Logistic Regressor Accuracy: {logistic_accuracy}")
#confusion matrix
cm_logistic = metrics.confusion_matrix(y_test, logistic_pred, labels=['FAKE', 'REAL'])
results["LogisticRegression"] = (logistic_accuracy, logisticReg_classifier)

# Random Forest
random_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_classifier.fit(hash_train, y_train)
random_pred = random_classifier.predict(hash_test)
random_accuracy = metrics.accuracy_score(y_test, random_pred)
print(f"Random Forest Accuracy: {random_accuracy}")
#confusion matrix
cm_logistic = metrics.confusion_matrix(y_test, random_pred, labels=['FAKE', 'REAL'])
results["RandomForest"] = (random_accuracy, random_classifier)


# SVM
svm_classifier = SVC(kernel='poly')
svm_classifier.fit(hash_train, y_train)
svm_pred = random_classifier.predict(hash_test)
svm_accuracy = metrics.accuracy_score(y_test, random_pred)
print(f"SVM Accuracy: {svm_accuracy}")
#confusion matrix
cm_logistic = metrics.confusion_matrix(y_test, svm_pred, labels=['FAKE', 'REAL'])
results["SVM"] = (svm_accuracy, svm_classifier)


MultinomialNB Classifier Accuracy: 0.8699186991869918
Passive Aggressive Classifier Accuracy: 0.8919177427068389
Logistic Regressor Accuracy: 0.8775705404112865
Random Forest Accuracy: 0.8718316594930655
SVM Accuracy: 0.8718316594930655


### Classification Report

In [24]:
#classification report for MultipolynomialNB
from sklearn.metrics import classification_report
print("MultipolynomialNB classification report")
print("***************************************")
MNB_report=classification_report(y_test, MNB_pred)
print(MNB_report)

#classification report for Passive Aggressive Classifier
from sklearn.metrics import classification_report
print("Passive Aggressive Classifier classification report")
print("***************************************************")
PAC_report=classification_report(y_test, PAC_pred)
print(PAC_report)

#classification report for Logistic classification
from sklearn.metrics import classification_report
print("Logistic Regressor classification report")
print("****************************************")
logistic_report=classification_report(y_test, logistic_pred)
print(logistic_report)

#classification report for Random Forest
from sklearn.metrics import classification_report
print("Random Forest Regressor classification report")
print("****************************************")
svm_report=classification_report(y_test, svm_pred)
print(svm_report)

#classification report for SVM
from sklearn.metrics import classification_report
print("SVM Regressor classification report")
print("****************************************")
svm_report=classification_report(y_test, svm_pred)
print(svm_report)

MultipolynomialNB classification report
***************************************
              precision    recall  f1-score   support

        FAKE       0.86      0.87      0.87      1008
        REAL       0.88      0.87      0.87      1083

    accuracy                           0.87      2091
   macro avg       0.87      0.87      0.87      2091
weighted avg       0.87      0.87      0.87      2091

Passive Aggressive Classifier classification report
***************************************************
              precision    recall  f1-score   support

        FAKE       0.88      0.90      0.89      1008
        REAL       0.90      0.89      0.89      1083

    accuracy                           0.89      2091
   macro avg       0.89      0.89      0.89      2091
weighted avg       0.89      0.89      0.89      2091

Logistic Regressor classification report
****************************************
              precision    recall  f1-score   support

        FAKE       0.86  

In [39]:
best_model_name = max(results, key=lambda k: results[k][0])
best_accuracy, best_model = results[best_model_name]
print(f"Best Model: {best_model_name} with accuracy {best_accuracy:.3f}")

Best Model: PassiveAggressive with accuracy 0.892


In [41]:
results[best_model_name]

(0.8919177427068389, PassiveAggressiveClassifier(random_state=42))

### Save the best model

In [62]:
from sklearn.pipeline import Pipeline

# Create a pipeline with vectorizer and classifier
pipeline = Pipeline([
    ('vectorizer', HashingVectorizer(n_features=2000)),  # or your actual vectorizer
    ('classifier', PassiveAggressiveClassifier(random_state=42))
])

# Fit the pipeline on training data
pipeline.fit(dataset["text"], dataset["label"])

# Save the whole pipeline
joblib.dump(pipeline, "best_pipeline.pkl")

['best_pipeline.pkl']

### Load the saved best model

In [65]:
pipeline = joblib.load("best_pipeline.pkl")
predictions = pipeline.predict([dataset["text"][7]])
print(predictions)

['REAL']


### For Checking 

In [51]:
x_train[[2]]

2    U.S. Secretary of State John F. Kerry said Mon...
Name: text, dtype: object

In [53]:
MNB_classifier.predict(hash_train[[2]])

array(['REAL'], dtype='<U4')

In [55]:
PAC_classifier.predict(hash_train[[2]])

array(['REAL'], dtype='<U4')

In [68]:
logisticReg_classifier.predict(hash_train[[2]])

array(['REAL'], dtype=object)