In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the news dataset from a CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="news.csv")

In [3]:
# Dataset size as (rows, columns).
df.shape

(6335, 4)

In [4]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
# Target column: one label string per article.
labels = df["label"]

In [6]:
# Peek at the first five labels.
labels.head(n=5)

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [7]:
# Hold out 30% of the articles for evaluation; the fixed seed keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    labels,
    test_size=0.3,
    random_state=7,
)

In [8]:
# DataFlair - TF-IDF vectorizer that drops English stop words and ignores
# terms appearing in more than 70% of the documents.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# Learn the vocabulary and IDF weights from the training split only, then
# apply that same fitted vectorizer to the test split.
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [31]:
# Linear online classifier for the FAKE/REAL task.
# The training data is shuffled between epochs by default, so a fixed seed is
# needed for the fitted model (and every metric below) to be reproducible on
# Restart & Run All. The seed matches the one used for the train/test split.
pac = PassiveAggressiveClassifier(max_iter=1000, random_state=7)

In [32]:
# Train on the TF-IDF features; fit() returns the estimator, which is shown
# as the cell output.
pac.fit(X=tfidf_train, y=y_train)

PassiveAggressiveClassifier()

In [33]:
# Predicted label for every held-out article.
y_pred = pac.predict(X=tfidf_test)

In [34]:
# Fraction of held-out articles whose predicted label matches the true one.
score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [35]:
# Report accuracy as a percentage rounded to two decimals.
accuracy_pct = round(score * 100, 2)
print(f'Accuracy: {accuracy_pct}%')

Accuracy: 92.53%


In [36]:
# Confusion matrix with an explicit label order so that row/column 0 is FAKE
# and row/column 1 is REAL.
conma = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=["FAKE", "REAL"],
)

In [37]:
# Rows are the true labels and columns the predicted labels, each ordered FAKE, REAL.
conma

array([[902,  72],
       [ 70, 857]], dtype=int64)

In [38]:
# Same counts as the confusion matrix, shown as a labelled table
# (rows: actual label, columns: predicted label).
pd.crosstab(index=y_test, columns=y_pred)

col_0,FAKE,REAL
label,Unnamed: 1_level_1,Unnamed: 2_level_1
FAKE,902,72
REAL,70,857


In [39]:
from sklearn.metrics import f1_score

# Weighted F1: per-class F1 scores averaged by each class's share of y_test.
weighted_f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print('F1 score:', weighted_f1)

F1 score: 0.9253043337324658


In [45]:
from sklearn.tree import DecisionTreeClassifier

In [50]:
# Baseline comparison: a single decision tree on the same TF-IDF features.
# The fixed seed keeps the fitted tree repeatable across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(tfidf_train, y_train)
dt_pred = dt.predict(tfidf_test)

# accuracy_score's signature is (y_true, y_pred); pass the ground truth first.
# Accuracy is symmetric, so the value is unchanged, but the call now follows
# the documented argument order.
dt_score = accuracy_score(y_test, dt_pred)
dt_score

0.8122041031036297

In [51]:
from sklearn.ensemble import RandomForestClassifier

In [52]:
# Ensemble comparison: a random forest on the same TF-IDF features.
# Bootstrap sampling and per-split feature sampling are random, so a fixed
# seed is required for the score to be reproducible on Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(tfidf_train, y_train)
rf_pred = rf.predict(tfidf_test)

# accuracy_score's signature is (y_true, y_pred); pass the ground truth first.
rf_score = accuracy_score(y_test, rf_pred)
rf_score

0.9016307206733298

In [54]:
# Held-out labels; the index keeps each article's original row label from df.
y_test

3534    REAL
6265    FAKE
3123    REAL
3940    REAL
2856    REAL
        ... 
118     FAKE
3258    REAL
4521    FAKE
5926    FAKE
89      REAL
Name: label, Length: 1901, dtype: object

In [40]:
# View the predictions as an object-dtype array (a converted copy; y_pred
# itself is left unchanged).
y_pred.astype(object)

array(['REAL', 'FAKE', 'REAL', ..., 'FAKE', 'FAKE', 'REAL'], dtype=object)

In [41]:
# Test labels as a plain NumPy array (drops the pandas index).
y_test.to_numpy()

array(['REAL', 'FAKE', 'REAL', ..., 'FAKE', 'FAKE', 'REAL'], dtype=object)

In [42]:
# Side-by-side table of true and predicted labels, indexed like y_test.
# y_pred is a positional array, so it is attached row-for-row.
result = y_test.to_frame(name='Actual').assign(Predicted=y_pred)

In [43]:
# Display the actual-vs-predicted table (pandas truncates the middle rows).
result

Unnamed: 0,Actual,Predicted
3534,REAL,REAL
6265,FAKE,FAKE
3123,REAL,REAL
3940,REAL,REAL
2856,REAL,REAL
...,...,...
118,FAKE,FAKE
3258,REAL,FAKE
4521,FAKE,FAKE
5926,FAKE,FAKE
