# 1: Importing Modules

In [137]:
import pandas as pd      # For data handling
import numpy as np       # For numerical operations
from sklearn.linear_model import LogisticRegression   # The classifier
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text to numbers
from sklearn.metrics import accuracy_score, classification_report  # For evaluating the model
import joblib
from flask import Flask, request, jsonify



# 2:  Data Collection

In [138]:
Fake_data = pd.read_csv("Fake.csv",on_bad_lines="skip", engine='python')
True_data = pd.read_csv("True.csv",on_bad_lines="skip", engine='python')
df = pd.read_csv("fake_or_real_news.csv")
Fake_data["label"] = 0
True_data["label"] = 1
df["label"] = df["label"].apply(lambda x: 1 if x == "REAL" else 0)

# 3: Droping Unnecessary Columns

In [139]:
# Drop unused columns if present
Fake_data.drop(["title", "subject", "date"], axis=1, inplace=True, errors='ignore')
True_data.drop(["title", "subject", "date"], axis=1, inplace=True, errors='ignore')
df.drop(["title", "id"], axis=1, inplace=True, errors='ignore')

# 4: Convert text to lowercase

In [140]:
Fake_data["text"] = Fake_data["text"].str.lower()
True_data["text"] = True_data["text"].str.lower()
df["text"] = df["text"].str.lower()


# 5: Combining All 3 datasets

In [141]:
Combined_dataset = pd.concat([Fake_data, True_data, df])
Combined_dataset = Combined_dataset.sample(frac=1).reset_index(drop=True)

In [150]:
Combined_dataset.head(50)

Unnamed: 0,text,label
0,a uniformed police officer in virginia said he...,0
1,,0
2,tallinn (reuters) - the european union should ...,1
3,21st century wire says russia hack? still no e...,0
4,bush spoke to a hispanic audience in central f...,0
5,rush limbaugh nails it in the transcript below...,0
6,youtube celebrity adam saleh has accused delta...,0
7,you re gonna love this patriot! he speaks for ...,0
8,san francisco (reuters) - a federal appeals co...,1
9,new york (reuters) - new york’s top democratic...,1


# 6: Checking Result

In [143]:
Combined_dataset.head()
Combined_dataset['label'].value_counts()

label
0    26645
1    24588
Name: count, dtype: int64

# 7: Training And Evaluting Model

In [144]:
X_train,X_test,Y_train,Y_test = train_test_split(Combined_dataset["text"],Combined_dataset["label"],test_size = 0.2,random_state=42)

In [145]:
# A different approach to TF-IDF vectorization
# We can try different parameters for TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english', max_df=0.7,ngram_range=(1,2))

tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)



In [146]:
model = LogisticRegression(class_weight="balanced")
model.fit(tfidf_train,Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,100


In [147]:
y_pred = model.predict(tfidf_test)
print("Accuracy : ", accuracy_score(Y_test,y_pred))
print(classification_report(Y_test, y_pred))

Accuracy :  0.9635015126378452
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      5354
           1       0.96      0.96      0.96      4893

    accuracy                           0.96     10247
   macro avg       0.96      0.96      0.96     10247
weighted avg       0.96      0.96      0.96     10247



In [148]:
sample = ["TOKYO (Reuters) - Japan's economy grew at an annualized rate of 0.5 percent in the third quarter, revised data showed on Monday, a sharp slowdown from the previous quarter's growth and weaker than a preliminary estimate."]
sample_tfidf = tfidf.transform(sample)
prediction = model.predict(sample_tfidf)
print("Fake" if prediction[0] == 0 else "Real")

Real


In [149]:

# Save the trained model and vectorizer
joblib.dump(model, 'model.pkl')
joblib.dump(tfidf, 'tfidf.pkl')


['tfidf.pkl']