In [8]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression



**Importing DataSets**

In [9]:
fake = pd.read_csv(r"Fake.csv")
true = pd.read_csv(r"True.csv")

In [10]:
fake['class'] = 0
true['class'] = 1


In [11]:
print(fake.shape,true.shape)

(23481, 5) (21417, 5)


In [12]:
df_merge = pd.concat([fake,true],axis=0)
df_merge.shape

(44898, 5)

In [13]:
print(df_merge.columns)
df = df_merge.drop(['title','subject','date'],axis=1)

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')


**Sampling** 

In [14]:
df = df.sample(frac=1)

In [15]:
df.reset_index(inplace = True)
df.drop(['index'],axis=1,inplace = True)
df.head()

Unnamed: 0,text,class
0,NEW YORK (Reuters) - New York City Mayor Bill ...,1
1,Dr. Gina Loudon went on CNN today to discuss t...,0
2,SKOPJE (Reuters) - A Macedonian court sentence...,1
3,"ASPEN, Colo. (Reuters) - National Security Age...",1
4,Donald Trump s executive order barring people ...,0


**Missing Values**

In [16]:
df.isna().sum()

text     0
class    0
dtype: int64

**Text Preprocessing**

* converting words into lowercase

In [17]:
df['text'] = df['text'].str.lower()

* removing leading and trailing whitespaces

In [18]:
df['text'] = df['text'].str.strip()

* removing puctuations

In [19]:
df['text'] = df['text'].str.replace('[^a-zA-Z]',' ')

  df['text'] = df['text'].str.replace('[^a-zA-Z]',' ')


**Defining dependent and independent variables**

In [20]:
X = df['text']
y = df['class']

**Splitting Training and Testing**

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25,random_state =42)

**Convert text to vectors** 
* CountVectorizer

In [22]:
cv = CountVectorizer(stop_words='english')
count_train= cv.fit_transform(X_train)
count_test = cv.transform(X_test)
print(cv.get_feature_names_out()[:10])

['aa' 'aaa' 'aaaaaaaand' 'aaaaapkfhk' 'aaaahhhh' 'aaaand' 'aaab' 'aab'
 'aabfsv' 'aabge']


**Convert text to vectors** 
* TfidfVectorizer

In [23]:
tv = TfidfVectorizer(stop_words='english',max_df = 0.9)
tfidf_train= tv.fit_transform(X_train)
tfidf_test = tv.transform(X_test)
print(tv.get_feature_names_out()[:10])

['aa' 'aaa' 'aaaaaaaand' 'aaaaapkfhk' 'aaaahhhh' 'aaaand' 'aaab' 'aab'
 'aabfsv' 'aabge']


**Multinomial Naive Bayes classifier**

In [24]:
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train,y_train)
y_pred = nb_classifier.predict(count_test)
print(confusion_matrix(y_test,y_pred))

print(accuracy_score(y_test,y_pred))


[[5530  301]
 [ 246 5148]]
0.9512694877505568


In [25]:
nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_train,y_train)
y_pred_td = nb_classifier.predict(tfidf_test)
print(confusion_matrix(y_test,y_pred_td))

print(accuracy_score(y_test,y_pred_td))

[[5479  352]
 [ 348 5046]]
0.9376391982182628


**LogisticRegression**

In [26]:
lr_classifier = LogisticRegression()
lr_classifier.fit(count_train,y_train)
y_pred = lr_classifier.predict(count_test)
print(confusion_matrix(y_test,y_pred))

print(accuracy_score(y_test,y_pred))


[[5802   29]
 [  23 5371]]
0.9953674832962138


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
lr_classifier = LogisticRegression()
lr_classifier.fit(tfidf_train,y_train)
y_pred = lr_classifier.predict(tfidf_test)
print(confusion_matrix(y_test,y_pred))

print(accuracy_score(y_test,y_pred))


[[5730  101]
 [  52 5342]]
0.986369710467706
