In [1]:
import pandas as pd
import numpy as np
import joblib 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier,LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report

In [3]:
# Read the raw news articles; the default RangeIndex is kept (no column is
# promoted to the index).
df = pd.read_csv("news.csv", index_col=None)

In [5]:
# Display the frame exactly as loaded from news.csv.
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [7]:
# Work on a new frame without the 'Unnamed: 0' column; `df` itself is left
# untouched so this cell can be re-run safely.
dataset = df.drop(columns=['Unnamed: 0'])

In [9]:
# Display the frame after the 'Unnamed: 0' column has been dropped.
dataset

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [11]:
# Target vector: the 'label' column of the dataset.
y = dataset.loc[:, "label"]

In [13]:
# Hold out 33% of the articles for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    dataset['text'],
    y,
    test_size=0.33,
    random_state=53,
)

### TF-IDF vectorization

In [16]:
Tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
tfidf_train = Tfidf_vectorizer.fit_transform(x_train)
tfidf_test = Tfidf_vectorizer.transform(x_test)

# Show the sparse training matrix.
print(tfidf_train)

  (1, 42470)	0.07711040274149526
  (1, 12105)	0.15008066461476866
  (1, 54177)	0.13782629144711137
  (1, 50628)	0.061296988343109586
  (1, 15924)	0.3479045460649079
  (1, 44520)	0.4973826512693341
  (1, 51896)	0.11596517664605868
  (1, 35783)	0.30902690818827977
  (1, 35256)	0.12628385718450857
  (1, 21881)	0.21271688045815978
  (1, 42534)	0.06081715886809217
  (1, 8399)	0.08729542880625335
  (1, 29531)	0.1454406205718245
  (1, 15927)	0.4973826512693341
  (1, 25686)	0.13550453594288983
  (1, 49203)	0.1672740861784377
  (1, 16814)	0.10404977746548139
  (1, 36087)	0.12648679854389897
  (1, 21568)	0.1007920919566398
  (1, 25684)	0.1030420922189754
  (1, 38823)	0.06048803110658644
  (1, 47506)	0.14539060877460044
  (1, 36831)	0.10772488937433067
  (2, 16972)	0.1606296088662543
  (2, 762)	0.48803966069171073
  :	:
  (4243, 41435)	0.02969665315895183
  (4243, 53607)	0.044665186536595916
  (4243, 659)	0.04293180970016178
  (4243, 38834)	0.037049324915825195
  (4243, 19003)	0.03443753105034290

In [17]:
# Number of feature names (vocabulary terms) the fitted vectorizer produced.
Tfidf_vectorizer.get_feature_names_out().shape[0]

56922

In [18]:
# Peek at a dense view of the first few rows only. Calling .toarray() on the
# whole matrix allocates one value for every (document, feature) pair - almost
# all of them zero - which can exhaust memory with a vocabulary of tens of
# thousands of terms, just to print a truncated block of zeros.
print(tfidf_train[:5].toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Train ML models

In [23]:
def _fit_and_score(display_name, classifier):
    """Fit ``classifier`` on the TF-IDF training data and score it on the test data.

    Uses the notebook-level ``tfidf_train`` / ``y_train`` for fitting and
    ``tfidf_test`` / ``y_test`` for evaluation, and prints one accuracy line.

    Parameters
    ----------
    display_name : str
        Human-readable model name used as the prefix of the printed line.
    classifier : estimator
        Unfitted estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    tuple
        ``(predictions, accuracy, confusion_matrix)`` where the confusion
        matrix uses the label order ``['FAKE', 'REAL']``.
    """
    classifier.fit(tfidf_train, y_train)
    predictions = classifier.predict(tfidf_test)
    accuracy = metrics.accuracy_score(y_test, predictions)
    print(f"{display_name} Accuracy: {accuracy}")
    confusion = metrics.confusion_matrix(y_test, predictions, labels=['FAKE', 'REAL'])
    return predictions, accuracy, confusion


# Dictionary to store model results: name -> (test accuracy, fitted estimator)
results = {}

# Each model keeps its own confusion matrix (cm_MNB, cm_PAC, cm_logistic,
# cm_random) so that no matrix overwrites another.

# Multinomial Naive Bayes
MNB_classifier = MultinomialNB()
MNB_pred, MNB_accuracy, cm_MNB = _fit_and_score("MultinomialNB Classifier", MNB_classifier)
results["MultinomialNB"] = (MNB_accuracy, MNB_classifier)

# Passive Aggressive
PAC_classifier = PassiveAggressiveClassifier(max_iter=1000, random_state=42)
PAC_pred, PAC_accuracy, cm_PAC = _fit_and_score("Passive Aggressive Classifier", PAC_classifier)
results["PassiveAggressive"] = (PAC_accuracy, PAC_classifier)

# Logistic Regression
logisticReg_classifier = LogisticRegression()
logistic_pred, logistic_accuracy, cm_logistic = _fit_and_score("Logistic Regressor", logisticReg_classifier)
results["LogisticRegression"] = (logistic_accuracy, logisticReg_classifier)

# Random Forest
random_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_pred, random_accuracy, cm_random = _fit_and_score("Random Forest", random_classifier)
results["RandomForest"] = (random_accuracy, random_classifier)


MultinomialNB Classifier Accuracy: 0.8565279770444764
Passive Aggressive Classifier Accuracy: 0.937350549976088
Logistic Regressor Accuracy: 0.9134385461501674
Random Forest Accuracy: 0.8971783835485414


### Classification Report

In [25]:
def _print_report(title, y_pred):
    """Print a titled classification report for ``y_pred`` and return its text.

    The report compares ``y_pred`` against the notebook-level ``y_test`` labels.
    The title is underlined with asterisks sized to the title itself.
    """
    print(title)
    print("*" * len(title))
    report = classification_report(y_test, y_pred)
    print(report)
    return report


# One report per trained model; each report's text is also kept in its own
# variable.
MNB_report = _print_report("MultinomialNB classification report", MNB_pred)
PAC_report = _print_report("Passive Aggressive Classifier classification report", PAC_pred)
logistic_report = _print_report("Logistic Regressor classification report", logistic_pred)
random_report = _print_report("Random Forest classification report", random_pred)

MultipolynomialNB classification report
***************************************
              precision    recall  f1-score   support

        FAKE       0.96      0.73      0.83      1008
        REAL       0.80      0.97      0.88      1083

    accuracy                           0.86      2091
   macro avg       0.88      0.85      0.85      2091
weighted avg       0.88      0.86      0.85      2091

Passive Aggressive Classifier classification report
***************************************************
              precision    recall  f1-score   support

        FAKE       0.92      0.95      0.94      1008
        REAL       0.95      0.93      0.94      1083

    accuracy                           0.94      2091
   macro avg       0.94      0.94      0.94      2091
weighted avg       0.94      0.94      0.94      2091

Logistic Regressor classification report
****************************************
              precision    recall  f1-score   support

        FAKE       0.88  

In [26]:
# Each results entry is (accuracy, fitted model); pick the entry whose
# accuracy is highest and unpack it in one step.
best_model_name, (best_accuracy, best_model) = max(
    results.items(),
    key=lambda entry: entry[1][0],
)

print(f"Best Model: {best_model_name} with accuracy {best_accuracy:.3f}")

Best Model: PassiveAggressive with accuracy 0.937


In [30]:
# Show the stored (accuracy, fitted model) tuple for the winning model.
results[best_model_name]

(0.937350549976088, PassiveAggressiveClassifier(random_state=42))

### Save the best model

In [33]:
# Persist the winning estimator with joblib.
# NOTE(review): only the classifier is written here. The model was trained on
# features produced by Tfidf_vectorizer, so a separate process would presumably
# need that fitted vectorizer as well - confirm whether it should be saved too.
model_path = "TF-IDF_bestmodel.pkl"
joblib.dump(best_model, model_path)
print(f"Saved {best_model_name} to disk.")

Saved PassiveAggressive to disk.


### Load the saved best model

In [36]:
# Load the model from the same file the best model was dumped to
# ("TF-IDF_bestmodel.pkl"); loading a different filename fails on a fresh run
# or silently picks up an unrelated stale file.
# NOTE: joblib files are pickles - only load files from a trusted source.
loaded_model = joblib.load("TF-IDF_bestmodel.pkl")

# Sanity check: predict the label of the first row of the TF-IDF training matrix.
predictions = loaded_model.predict(tfidf_train[[0]])

In [38]:
# Show the article the prediction was actually made on. Row 0 of tfidf_train
# was built from the first element of x_train, which after the shuffled
# train/test split is generally NOT row 0 of `dataset`.
x_train.iloc[0]



In [40]:
# Display the label predicted by the reloaded model.
predictions

array(['FAKE'], dtype='<U4')