In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

fake_data=pd.read_csv(r"C:\Users\navee\OneDrive\Documents\FAKE NEWS DETECTION\Fake.csv")
true_data=pd.read_csv(r"C:\Users\navee\OneDrive\Documents\FAKE NEWS DETECTION\True.csv")

fake_data.drop_duplicates()
true_data.drop_duplicates()

#preprocessing for text

fake_data['no_sc_text']=fake_data['text'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]', '', x))
true_data['no_sc_text']=true_data['text'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]', '', x))

fake_data['no_html_text'] = fake_data['no_sc_text'].apply(lambda x: re.sub(r'<.*?>', '', x))
true_data['no_html_text'] = true_data['no_sc_text'].apply(lambda x: re.sub(r'<.*?>', '', x))

stop_words = set(stopwords.words('english'))
fake_data['sw_text']  = fake_data['no_html_text'] .apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
true_data['sw_text']  = true_data['no_html_text'] .apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

true_data['news'] = true_data['sw_text'].apply(lambda x: word_tokenize(x))
fake_data['news'] = fake_data['sw_text'].apply(lambda x: word_tokenize(x))

#preprocessing for title

fake_data['title1']=fake_data['title'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]','',x))
true_data['title1']=true_data['title'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]','',x))

stop_words = set(stopwords.words('english'))
fake_data['title_']  = fake_data['title1'] .apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
true_data['title_']  = true_data['title1'] .apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

true_data['title'] = true_data['title_'].apply(lambda x: word_tokenize(x))
fake_data['title'] = fake_data['title_'].apply(lambda x: word_tokenize(x))

#label encoding subject

label_encoder=LabelEncoder()
true_data['subject']=label_encoder.fit_transform(true_data['subject'])
fake_data['subject']=label_encoder.fit_transform(fake_data['subject'])

fake_data.drop(['no_html_text','sw_text','no_sc_text','text','title1','title_'],axis=1,inplace=True)
true_data.drop(['no_html_text','sw_text','no_sc_text','text','title1','title_'],axis=1,inplace=True)

true_data['news_authenticity']=1
fake_data['news_authenticity']=0

#combining data

fkn_dataset=pd.concat([fake_data, true_data], ignore_index=True)

fkn_dataset['date']=fkn_dataset['date'].apply(lambda x:re.sub(r'[^A-Za-z0-9\s]','',x))

#to remove other text that exceeds the lenght of date

fkn_dataset['date'] = fkn_dataset['date'].astype(str)
mask = fkn_dataset['date'].apply(len) <= 18

fkn_dataset = fkn_dataset[mask]

#tf-idf 

fkn_dataset['news1'] = fkn_dataset['news'].apply(lambda tokens: ' '.join(tokens))
tfidf_vectorizer = TfidfVectorizer(max_features=500) 
X_tfidf = tfidf_vectorizer.fit_transform(fkn_dataset['news1'])

#splitting

x=X_tfidf
y=fkn_dataset['news_authenticity']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#logistic regression

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

#random forest

random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)


In [15]:
#logistic regression

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

In [2]:
#testing LR
logistic_predictions = logistic_model.predict(X_test)

In [3]:
#report
print(classification_report(y_test, logistic_predictions))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4723
           1       0.98      0.98      0.98      4255

    accuracy                           0.98      8978
   macro avg       0.98      0.98      0.98      8978
weighted avg       0.98      0.98      0.98      8978



In [4]:
random_forest_predictions = random_forest_model.predict(X_test)

In [10]:
print(classification_report(y_test, random_forest_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4723
           1       1.00      1.00      1.00      4255

    accuracy                           1.00      8978
   macro avg       1.00      1.00      1.00      8978
weighted avg       1.00      1.00      1.00      8978



In [11]:
y_test[:10]

13735    0
34895    1
18405    0
18285    0
5199     0
11906    0
1784     0
4506     0
35626    1
18156    0
Name: news_authenticity, dtype: int64

In [None]:
logistic_predictions[:10]

In [None]:
random_forest_predictions[:10]

In [7]:
#to check presence od special characters

def has_special_characters(text):
    return bool(re.search(r'[^A-Za-z0-9\s]', text))
fake_data['no_sc_text'].apply(has_special_characters)

KeyError: 'no_sc_text'

In [77]:
##to remove other text that exceeds the lenght of date

fkn_dataset['date'] = fkn_dataset['date'].astype(str)
mask = fkn_dataset['date'].apply(len) <= 18

fkn_dataset = fkn_dataset[mask]