In [1]:
import pandas as pd

In [2]:
# PreProcessed dataset reading
ds = pd.read_csv('PreProcessedDS.csv')
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42384 entries, 0 to 42383
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        42331 non-null  object
 1   label       42384 non-null  int64 
 2   clean_news  42328 non-null  object
dtypes: int64(1), object(2)
memory usage: 993.5+ KB


In [3]:
# Removing NaN values based on clean_news cols b'cuz it's main col who has training data
ds.dropna(subset=['clean_news'], ignore_index=True, inplace=True)
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42328 entries, 0 to 42327
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        42328 non-null  object
 1   label       42328 non-null  int64 
 2   clean_news  42328 non-null  object
dtypes: int64(1), object(2)
memory usage: 992.2+ KB


In [4]:
# drop text col b'cuz we cleaned the data inside text col and save it in clean_news col
ds = ds.drop(labels='text', axis=1)
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42328 entries, 0 to 42327
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       42328 non-null  int64 
 1   clean_news  42328 non-null  object
dtypes: int64(1), object(1)
memory usage: 661.5+ KB


# Word Embedding

In [5]:
# use Term Frequency - Inverse Document Frequency Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
tv = TfidfVectorizer()

In [28]:
# Transform String data into Numeric form
dfv = tv.fit_transform(ds['clean_news'])

In [8]:
dfv[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 172 stored elements and shape (1, 215354)>

In [29]:
dfv.shape

(42328, 215354)

In [10]:
# total unique words
len(tv.vocabulary_)

215354

# input spliting

In [11]:
# Splitting dataset into Train and Test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dfv, ds['label'], random_state=42, test_size=0.2, stratify=ds['label'])

In [12]:
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(33862, 215354) (8466, 215354)
(33862,) (8466,)


In [13]:
type(x_train)

scipy.sparse._csr.csr_matrix

# MLP Model building

In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [15]:
# Create an MLPClassifier model
mlp = MLPClassifier(hidden_layer_sizes=(64,32),
                    max_iter=200, random_state=20, solver='adam', activation='relu', n_iter_no_change=100)

In [16]:
# Train the model on the training data
mlp.fit(x_train, y_train)



In [17]:
# Make predictions on the test data
y_pred = mlp.predict(x_test)
 
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.99


In [18]:
# Make predictions on the train data
y_train_pred = mlp.predict(x_train)
 
# Calculate the accuracy of the model
accuracy = accuracy_score(y_train, y_train_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [19]:
# Generate a classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)
# precision - measures the accuracy of the positive predictions.
# recall(sensetivity) - measures the ability of the model to find all the relevant cases within a dataset

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4227
           1       0.99      0.99      0.99      4239

    accuracy                           0.99      8466
   macro avg       0.99      0.99      0.99      8466
weighted avg       0.99      0.99      0.99      8466



In [30]:
ds.head()

Unnamed: 0,label,clean_news
0,0,donald trump wish american happy new year leav...
1,0,house intelligence committee chairman devin nu...
2,0,friday revealed former milwaukee sheriff david...
3,0,christmas day donald trump announced would bac...
4,0,pope francis used annual christmas day message...


In [31]:
fake_news = ds[ds['label']==0]
true_news = ds[ds['label']==1]
print(fake_news['label'].value_counts())
print(true_news['label'].value_counts())

label
0    21136
Name: count, dtype: int64
label
1    21192
Name: count, dtype: int64


In [32]:
# Transform String data into Numeric form
dfv_fake = tv.transform(fake_news['clean_news'])
dfv_true = tv.transform(true_news['clean_news'])

In [33]:
dfv_fake.shape, dfv_true.shape

((21136, 215354), (21192, 215354))

In [34]:
# Predicting Output
fake_news_predict = mlp.predict(dfv_fake)
true_news_predict = mlp.predict(dfv_true)

In [38]:
# Accuracy Score For Fake News
accuracy_score(fake_news['label'], fake_news_predict)

0.9983913701741105

In [37]:
# Accuracy Score For True News
accuracy_score(true_news['label'], true_news_predict)

0.9984899962249906

In [43]:
# Saving Model
import joblib
joblib.dump(mlp, 'Fake_News_Detection_MLPClassifier_Model.pkl')

['Fake_News_Detection_MLPClassifier_Model.pkl']

In [45]:
# To Load Model
import joblib
model = joblib.load('Fake_News_Detection_MLPClassifier_Model.pkl')