In [None]:
import numpy as p
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus that the stopwords.words('english') calls in this notebook read from.
nltk.download('stopwords')

In [None]:
# Show the English stop-word list that is filtered out during text cleaning.
print(stopwords.words('english'))

In [None]:
## DATA PREPROCESSING

# Location of the news CSV. NOTE(review): this is an absolute, machine-specific
# Windows path; anyone else running the notebook must change it.
DATASET_CSV_PATH = r'C:\ML fake-news-prediction\Fake-News-detection\dataset\fake_news_dataset.csv'

# Read the whole file into a DataFrame and show its (rows, columns) shape.
news_df = pd.read_csv(DATASET_CSV_PATH)
news_df.shape

In [None]:
# Preview the first three rows of the loaded data.
news_df.head(3)

In [None]:
### COUNT THE NUMBER OF MISSING VALUES IN EACH COLUMN

news_df.isnull().sum()

In [None]:
# Replace every missing value with an empty string so the text columns can be
# concatenated without producing NaN.
news_df = news_df.fillna('')

In [None]:
# Combine author and title into a single text feature. The fields are joined
# with a space so the last word of the author and the first word of the title
# remain separate tokens instead of being fused into one.
news_df['content'] = news_df['author'] + ' ' + news_df['title']
print(news_df['content'])

In [None]:
## Separate labels and features

# Y is the target column; X is every remaining column of the frame.
Y = news_df['label']
X = news_df.drop(columns=['label'])

In [None]:
# List the feature columns that remain after dropping 'label'.
print(X.columns)

In [None]:
# Show the combined author/title text column of the feature frame.
print(X['content'])

In [None]:
# Show the label column.
print(Y)

In [None]:
# Shared Porter stemmer instance used by stemming().
port_stem = PorterStemmer()

In [None]:
# Stop-word lookup built once, so the word list is not requested again for
# every token and membership tests are set lookups rather than list scans.
_STEMMING_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Clean a text string and reduce it to space-separated Porter stems.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stop words, stem each remaining word
    with the module-level ``port_stem``, and join the stems with single
    spaces.

    Parameters
    ----------
    content : str
        Raw text to normalise.

    Returns
    -------
    str
        The stems separated by single spaces; an empty string when no
        words survive the filtering.
    """
    # Substitute a space (not '') so that punctuation and digits act as word
    # boundaries instead of gluing neighbouring words together.
    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    tokens = letters_only.lower().split()
    stems = [port_stem.stem(word) for word in tokens if word not in _STEMMING_STOP_WORDS]
    # Join with a space so a downstream tokenizer can still see individual words.
    return ' '.join(stems)

In [None]:
# Run stemming() on every row of the combined text column, overwriting it with
# the cleaned version, then display the result.
news_df['content'] = news_df['content'].apply(stemming)
print(news_df['content'])

In [None]:
## Let's use label encoder to convert string categorical data to numerical data

from sklearn.preprocessing import LabelEncoder

# Replace each label value with an integer code.
# NOTE(review): this overwrites news_df['label'] in place, so every later cell
# sees the integer codes rather than the original label values.
le = LabelEncoder()
news_df['label'] = le.fit_transform(news_df['label'])
news_df.head(2)

In [None]:
# Rebind X and Y to the underlying values of the cleaned text column and the
# encoded label column (X previously held the full feature frame).
X = news_df['content'].values
Y = news_df['label'].values

In [None]:
# Inspect the cleaned text values.
print(X)

In [None]:
# Inspect the encoded label values.
print(Y)

In [None]:
# Shape of the label values.
Y.shape

In [None]:
## Converting text data to numerical features

# fit_transform learns the vocabulary and IDF weights and vectorises the
# documents in one pass, instead of tokenising the corpus twice with a
# separate fit() and transform().
# NOTE(review): the vectoriser is fitted on the full corpus before the
# train/test split that follows, so the IDF statistics include the test
# documents. Fit on the training portion only (or use a Pipeline) to avoid
# this leakage.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)
print(X)

In [None]:
## Splitting the data into training and testing sets

# Hold out 20% for testing; stratify on Y and fix the seed so the split is
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)


In [None]:
# Shape of the training feature matrix.
X_train.shape

In [None]:
# Shape of the training labels.
Y_train.shape

In [None]:
## Training the model

# Baseline classifier: logistic regression with default settings, fitted on
# the training split.
log_reg_model = LogisticRegression()
log_reg_model.fit(X=X_train, y=Y_train)

In [None]:
## Evaluation on training data

# accuracy_score expects (y_true, y_pred): ground-truth labels first, then the
# model's predictions.
X_train_prediction = log_reg_model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the training data :' , training_data_accuracy)

In [None]:
## Evaluation on test data

# accuracy_score expects (y_true, y_pred): ground-truth labels first, then the
# model's predictions.
X_test_prediction = log_reg_model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the accuracy measured on the held-out test split.
print('Accuracy of test data :' , test_data_accuracy)

In [None]:
### SEEMS LIKE OVERFITTING

In [None]:
# Built once so that neither the stop-word list nor the stemmer is recreated
# for every document or token.
_PREPROCESSING_STOP_WORDS = frozenset(stopwords.words('english'))
_PREPROCESSING_STEMMER = PorterStemmer()


def preprocessing(text):
    """Clean a text string and reduce it to space-separated Porter stems.

    Non-letter characters are replaced with spaces, the text is lower-cased
    and split on whitespace, English stop words are removed, and each
    remaining word is stemmed.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The stems separated by single spaces; an empty string when no
        words survive the filtering.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    stems = [
        _PREPROCESSING_STEMMER.stem(token)
        for token in tokens
        if token not in _PREPROCESSING_STOP_WORDS
    ]
    # Join with a space: joining with '' would fuse every stem into one long
    # token and leave the vectoriser with a single "word" per document.
    return ' '.join(stems)

# Rebuild the text feature from the author and title columns (space-separated),
# blank out any missing result, and clean each row with preprocessing().
author_and_title = news_df['author'] + ' ' + news_df['title']
news_df['content'] = author_and_title.fillna('').apply(preprocessing)


In [None]:
# Preview the first three cleaned text entries.
news_df['content'].head(3)

In [None]:
### USE TF-IDF WITH BETTER SETTINGS

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser step: unigrams and bigrams, ignoring terms that appear in more
# than 90% of documents or in fewer than 5 documents.
tfidf_step = TfidfVectorizer(max_df=0.9, min_df=5, ngram_range=(1, 2))

# Classifier step: L2-regularised logistic regression that selects its
# regularisation strength with 5-fold cross-validation.
clf_step = LogisticRegressionCV(cv=5, penalty='l2', solver='liblinear', max_iter=1000)

# Chaining both steps means the vectoriser is fitted only on whatever data is
# passed to pipeline.fit().
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('clf', clf_step),
])

In [None]:
### TRAIN - TEST - SPLIT and model fitting

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = news_df['content']

# Build a 0/1 target (fake = 0, real = 1). The LabelEncoder cell earlier in the
# notebook overwrites news_df['label'] with integer codes; mapping the strings
# 'real'/'fake' over integers would turn every label into NaN, so the mapping
# is applied only while the column still holds the original strings.
raw_labels = news_df['label']
if pd.api.types.is_numeric_dtype(raw_labels):
    # NOTE(review): assumes the existing integer codes already mean
    # fake = 0 / real = 1 (LabelEncoder numbers classes in sorted order) — confirm.
    Y = raw_labels
else:
    Y = raw_labels.map({'real': 1, 'fake': 0})

# Fail loudly instead of training on rows whose label could not be interpreted.
if Y.isna().any():
    raise ValueError(
        "news_df['label'] contains values other than 'real'/'fake'; cannot build the target."
    )

# Hold out 20% for testing; stratify on the target and fix the seed so the
# split is reproducible.
X_train , X_test , y_train , y_test = train_test_split( X , Y , test_size=0.2 , stratify=Y , random_state=42)
pipeline.fit(X_train , y_train)

train_accuracy = pipeline.score(X_train , y_train)
test_accuracy = pipeline.score(X_test , y_test)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [None]:
### EVALUATE our model

from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

y_pred_train = pipeline.predict(X_train)
# Predict from the held-out *features* (X_test) so the predictions line up
# one-to-one with y_test; the pipeline expects text input, not labels.
y_pred_test = pipeline.predict(X_test)

print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report:\n", classification_report(y_test, y_pred_test))

In [None]:
### (OPTIONAL) VIEW CONFUSION MATRIX

import seaborn as sns
import matplotlib.pyplot as plt

# Tick labels in class order.
# NOTE(review): assumes label 0 = fake and 1 = real, as in the target mapping — confirm.
class_names = ['Fake', 'Real']

# Counts of test-set predictions, compared against the actual labels.
cm = confusion_matrix(y_test, y_pred_test)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()