In [46]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression



**Importing the datasets**

In [47]:
# Read the two CSV files into separate DataFrames: `fake` from Fake.csv and
# `true` from True.csv.
fake, true = map(
    pd.read_csv,
    (
        '../input/fake-and-real-news-dataset/Fake.csv',
        '../input/fake-and-real-news-dataset/True.csv',
    ),
)

In [48]:
# Attach the target label as a constant column: 0 for rows of `fake`,
# 1 for rows of `true`.
fake['class'], true['class'] = 0, 1


In [49]:
# Show (rows, columns) for each frame to check their sizes before merging.
print(*(frame.shape for frame in (fake, true)))

In [50]:
# Stack the two labelled frames row-wise into a single frame. Each frame keeps
# its own row labels here, so the combined index contains repeated values.
df_merge = pd.concat((fake, true), axis='index')
df_merge.shape

In [51]:
# List the available columns, then keep only what the models use by dropping
# the title, subject and date columns.
print(df_merge.columns)
df = df_merge.drop(columns=['title', 'subject', 'date'])

**Shuffling the rows**

In [52]:
# Shuffle every row (frac=1 returns the whole frame in random order).
# The fixed seed makes the shuffle, and therefore everything computed
# downstream from it, repeat exactly across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [53]:
# Give the frame a fresh 0..n-1 index; drop=True discards the old row labels
# instead of keeping them as an extra 'index' column.
df = df.reset_index(drop=True)
df.head()

**Missing Values**

In [54]:
# Count the missing values in each column.
df.isnull().sum()

**Text Preprocessing**

* converting words into lowercase

In [55]:
# Lower-case the text column.
df = df.assign(text=df['text'].str.lower())

* removing leading and trailing whitespace

In [56]:
# Trim whitespace from both ends of every entry in the text column.
df = df.assign(text=lambda frame: frame['text'].str.strip())

* removing punctuation (every non-letter character is replaced with a space)

In [57]:
# Replace each character that is not an ASCII letter with a single space.
# regex=True must be passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, in which case nothing in the text would match
# and the column would be left uncleaned.
df['text'] = df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)

**Defining dependent and independent variables**

In [58]:
# X holds the text column (model input); y holds the class column (target).
X, y = df['text'], df['class']

**Splitting into training and test sets**

In [59]:
# Hold out 25% of the rows as the test set; random_state fixes the split.
# train_test_split is already imported in the notebook's import cell, so it
# is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

**Convert text to vectors** 
* CountVectorizer

In [60]:
# Bag-of-words counts with scikit-learn's built-in English stop-word list.
# The vocabulary is fitted on the training texts only; the test texts are
# transformed with that fitted vocabulary.
cv = CountVectorizer(stop_words='english')
count_train = cv.fit_transform(raw_documents=X_train)
count_test = cv.transform(raw_documents=X_test)

# Show the first ten feature names.
print(cv.get_feature_names_out()[:10])

**Convert text to vectors** 
* TfidfVectorizer

In [61]:
# TF-IDF features with English stop words removed; max_df=0.9 also ignores
# terms that occur in more than 90% of the training documents.
# As with the count features, fitting uses the training texts only.
tv = TfidfVectorizer(stop_words='english', max_df=0.9)
tfidf_train = tv.fit_transform(raw_documents=X_train)
tfidf_test = tv.transform(raw_documents=X_test)

# Show the first ten feature names.
print(tv.get_feature_names_out()[:10])

**Multinomial Naive Bayes classifier**

In [62]:
# Multinomial Naive Bayes on the count features: fit on the training split,
# predict the test split, then print the confusion matrix and accuracy.
nb_classifier = MultinomialNB().fit(count_train, y_train)
y_pred = nb_classifier.predict(count_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(accuracy_score(y_true=y_test, y_pred=y_pred))


In [63]:
# Multinomial Naive Bayes on the TF-IDF features: fit on the training split,
# predict the test split, then print the confusion matrix and accuracy.
# Note: this rebinds nb_classifier to a new model.
nb_classifier = MultinomialNB().fit(tfidf_train, y_train)
y_pred_td = nb_classifier.predict(tfidf_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred_td))
print(accuracy_score(y_true=y_test, y_pred=y_pred_td))

**Logistic Regression**

In [64]:
# Logistic regression (default settings) on the count features: fit on the
# training split, predict the test split, then print the confusion matrix
# and accuracy. Note: this overwrites y_pred.
lr_classifier = LogisticRegression().fit(count_train, y_train)
y_pred = lr_classifier.predict(count_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(accuracy_score(y_true=y_test, y_pred=y_pred))


In [65]:
# Logistic regression (default settings) on the TF-IDF features: fit on the
# training split, predict the test split, then print the confusion matrix
# and accuracy. Note: this rebinds lr_classifier and overwrites y_pred.
lr_classifier = LogisticRegression().fit(tfidf_train, y_train)
y_pred = lr_classifier.predict(tfidf_test)

print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(accuracy_score(y_true=y_test, y_pred=y_pred))
