<a href="https://colab.research.google.com/github/Lakshmi12344/Fake-News-Detection/blob/main/Fake_News_Detection(Project).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the labelled tweet CSV into a DataFrame; the bare `data` expression on
# the last line makes the notebook render the frame as the cell output.
dataset_path = '/content/english_test_with_labels.csv'
data = pd.read_csv(dataset_path)
data

Unnamed: 0,id,tweet,label
0,1,Our daily update is published. States reported...,real
1,2,Alfalfa is the only cure for COVID-19.,fake
2,3,President Trump Asked What He Would Do If He W...,fake
3,4,States reported 630 deaths. We are still seein...,real
4,5,This is the sixth time a global health emergen...,real
...,...,...,...
2135,2136,#CoronaVirusUpdates: State-wise details of Tot...,real
2136,2137,Tonight 12(midnight) onwards Disaster Manageme...,fake
2137,2138,296 new cases of #COVID19Nigeria; Plateau-85 E...,real
2138,2139,RT @CDCemergency: #DYK? @CDCgov’s One-Stop Sho...,real


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Fetch the NLTK resources used by the preprocessing step. 'punkt_tab' is
# included alongside 'punkt' because it was added to this cell as a resource
# the tokenizer needs.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Shared helpers for preprocess(): the lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    Steps: lower-case, strip ASCII punctuation, tokenise with
    ``word_tokenize``, drop English stop words, lemmatise what remains.

    Args:
        text: Raw text. ``None`` and NaN (how pandas represents a missing
            cell) yield an empty string; any other non-string value is
            converted with ``str()`` before processing.

    Returns:
        The cleaned text as one string ('' when nothing remains).
    """
    # Missing values arrive as None or float NaN when this is applied to a
    # DataFrame column; NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean every tweet element-wise and keep the result in a new column.
data['cleaned_text'] = data['tweet'].map(preprocess)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned tweets into a sparse TF-IDF matrix with a capped vocabulary.
# NOTE(review): the vectorizer is fitted on every row before any train/test
# split, so held-out rows influence the vocabulary and IDF weights — confirm
# this is acceptable for the scores reported further down.
MAX_TFIDF_FEATURES = 5000
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X = tfidf_vectorizer.fit_transform(data['cleaned_text'])
X

<2140x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 30089 stored elements in Compressed Sparse Row format>

In [None]:
!pip install textblob
import textblob
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob sentiment polarity score for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every cleaned tweet, store the scores on the frame, and expose them as
# a single-column 2-D array so they can be stacked beside other features.
sentiment_scores = data['cleaned_text'].apply(get_sentiment)
data['sentiment'] = sentiment_scores
X_sentiment = sentiment_scores.values.reshape(-1, 1)




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from scipy.sparse import hstack

# Combine TF-IDF and sentiment features into one sparse design matrix
# (the sentiment polarity becomes the last column).
X_combined = hstack([X, X_sentiment])

# Label encoding: real -> 0, fake -> 1.
# Guarded so the cell is idempotent: running an unguarded .map() a second time
# on the already-encoded column would turn every label into NaN.
label_encoding = {'real': 0, 'fake': 1}
if not data['label'].isin(list(label_encoding.values())).all():
    data['label'] = data['label'].map(label_encoding)

X_train, X_test, y_train, y_test = train_test_split(X_combined, data['label'], test_size=0.2, random_state=42)

# The forest samples rows/features randomly; seeding it keeps its report
# reproducible across Restart & Run All.
lr = LogisticRegression()
rf = RandomForestClassifier(random_state=42)
svm = SVC()

# Fit, predict and report each classifier in the same order as before.
classifiers = {
    "Logistic Regression": lr,
    "Random Forest": rf,
    "Support Vector Machine": svm,
}
test_predictions = {}
for model_name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    test_predictions[model_name] = classifier.predict(X_test)
    print(f"{model_name}:\n", classification_report(y_test, test_predictions[model_name]))

# Keep the per-model prediction names available for any later cell.
y_pred_lr = test_predictions["Logistic Regression"]
y_pred_rf = test_predictions["Random Forest"]
y_pred_svm = test_predictions["Support Vector Machine"]



Logistic Regression:
               precision    recall  f1-score   support

           0       0.90      0.87      0.89       235
           1       0.85      0.89      0.87       193

    accuracy                           0.88       428
   macro avg       0.88      0.88      0.88       428
weighted avg       0.88      0.88      0.88       428

Random Forest:
               precision    recall  f1-score   support

           0       0.93      0.84      0.88       235
           1       0.83      0.92      0.87       193

    accuracy                           0.88       428
   macro avg       0.88      0.88      0.88       428
weighted avg       0.88      0.88      0.88       428

Support Vector Machine:
               precision    recall  f1-score   support

           0       0.92      0.87      0.89       235
           1       0.85      0.91      0.88       193

    accuracy                           0.89       428
   macro avg       0.89      0.89      0.89       428
weighted av

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the logistic-regression estimator on the combined
# (TF-IDF + sentiment) feature matrix; one score per fold.
scores = cross_val_score(estimator=lr, X=X_combined, y=data['label'], cv=5)
print("Logistic Regression Cross-validation scores: ", scores)


Logistic Regression Cross-validation scores:  [0.88317757 0.88084112 0.91121495 0.89719626 0.87616822]


In [None]:
import joblib

# Persist the fitted logistic-regression model and the fitted TF-IDF vectorizer
# to the working directory so they can be reloaded with joblib.load.
# NOTE(review): `lr` was fitted on X_combined (the TF-IDF columns plus one
# sentiment column), so any code reloading these files must append the
# sentiment feature to the vectorizer output before calling predict.
joblib.dump(lr, 'logistic_regression_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [None]:
from flask import Flask, request, jsonify

# Flask application object; the route defined below is registered on it.
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the text in a JSON POST body.

    Expects ``{"text": "<string>"}``. Responds with ``{"prediction": <int>}``
    in the training encoding (0 = real, 1 = fake), or with HTTP 400 and an
    ``error`` message when the body is not JSON with a string ``text`` field.
    """
    import numpy as np  # local import: numpy is not imported before this cell

    payload = request.get_json(force=True, silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('text'), str):
        return jsonify({'error': "JSON body must contain a string 'text' field"}), 400

    clean_text = preprocess(payload['text'])

    # `tfidf_vectorizer` and `lr` are the objects fitted earlier in this
    # notebook; the names `vectorizer` / `model` are only bound in a later
    # cell, so using them here fails on a fresh kernel.
    tfidf_features = tfidf_vectorizer.transform([clean_text])

    # `lr` was fitted on the TF-IDF matrix with the sentiment polarity stacked
    # on as the final column, so the request features must be built the same
    # way; TF-IDF alone is one column short for lr.predict.
    sentiment_feature = np.array([[get_sentiment(clean_text)]])
    vect_text = hstack([tfidf_features, sentiment_feature])

    prediction = lr.predict(vect_text)
    return jsonify({'prediction': int(prediction[0])})

if __name__ == '__main__':
    # debug=True starts Werkzeug's reloader ("Restarting with stat"), which
    # re-launches the process and does not work from inside a notebook kernel;
    # it also enables the interactive debugger, which lets anyone who can reach
    # the server execute code. Run the development server with both disabled.
    app.run(debug=False, use_reloader=False)



 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [None]:
from flask import Flask, request, jsonify
import joblib

# Create a fresh Flask application object (rebinds `app` from the earlier cell).
app = Flask(__name__)

# Load model and vectorizer
# These are the files written by the joblib.dump calls earlier in the notebook.
# joblib.load unpickles its input, so only load files from a trusted source.
model = joblib.load('logistic_regression_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the text in a JSON POST body as 'real' or 'fake'.

    Expects ``{"text": "<string>"}``.

    Returns:
        ``{"prediction": "real" | "fake"}`` on success;
        HTTP 400 with ``{"error": ...}`` when the body is not JSON with a
        string ``text`` field;
        HTTP 500 with ``{"error": ...}`` when preprocessing or inference fails.
    """
    import numpy as np  # local import: this cell does not otherwise import numpy

    # silent=True yields None for a missing or invalid JSON body instead of
    # raising, so bad requests are reported as a client error below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('text'), str):
        return jsonify({'error': "JSON body must contain a string 'text' field"}), 400

    try:
        # Preprocess the input text
        preprocessed_text = preprocess(payload['text'])

        # Build the feature vector exactly as in training: the saved model was
        # fitted on the TF-IDF columns with the sentiment polarity appended as
        # the last column, so TF-IDF alone is one column short and predict
        # would reject it. hstack and get_sentiment come from earlier cells.
        tfidf_features = vectorizer.transform([preprocessed_text])
        sentiment_feature = np.array([[get_sentiment(preprocessed_text)]])
        features = hstack([tfidf_features, sentiment_feature])

        # Predict the class (0 = real, 1 = fake) using the trained model
        prediction = model.predict(features)

        # Return the prediction in JSON format
        return jsonify({'prediction': 'real' if prediction[0] == 0 else 'fake'})

    except Exception as e:
        # Still report the failure in the body, but with a server-error status
        # so clients do not mistake it for a successful prediction.
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    # debug=True starts Werkzeug's reloader, which re-launches the process and
    # does not work from inside a notebook kernel; it also enables the
    # interactive debugger, which lets anyone who can reach the server execute
    # code. Run the development server with both disabled.
    app.run(debug=False, use_reloader=False)


In [None]:
# @title Import libraries
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [None]:

# Load the dataset
# NOTE: this rebinds `data` to a fresh read of the CSV, so the 'cleaned_text'
# and 'sentiment' columns and the 0/1-encoded 'label' column added by earlier
# cells are not present on this frame.
file_path = '/content/english_test_with_labels.csv'
data = pd.read_csv(file_path)



In [None]:
# Step 1: Inspect the dataset
print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() would add a stray "None" line after the report.
data.info()
print("\nClass Distribution:")
print(data['label'].value_counts())



In [None]:
# Step 2: Use the raw tweet text as the feature column and `label` as the target
X, y = data['tweet'], data['label']



In [None]:
# Step 3: Hold out 20% of the tweets for evaluation.
# Steps 3 and 4 were missing from the notebook: X_train_tfidf / X_test_tfidf
# were never defined (NameError on a fresh kernel) and y_train / y_test were
# whatever an earlier cell had left behind. The split settings mirror the
# earlier split in this notebook — NOTE(review): confirm they match the
# original intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Fit TF-IDF on the training tweets only, then apply it to the test
# tweets, so the held-out rows do not influence the vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 5: Model training using Logistic Regression
print("\nTraining the Logistic Regression model...")
model = LogisticRegression(random_state=42)
model.fit(X_train_tfidf, y_train)
print("Model training complete.")



In [None]:
# Step 6: Predictions
# y_pred holds the model's predicted label for each row of X_test_tfidf; the
# evaluation cells below compare it with y_test.
print("\nEvaluating the model...")
y_pred = model.predict(X_test_tfidf)



In [None]:
# Step 7: Overall accuracy on the held-out rows, printed as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")

# Step 8: Per-class precision, recall and F1 as a text report
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:\n")
print(classification_rep)



In [None]:
# Step 9: Confusion Matrix
# Pin the row/column order explicitly. Without `labels`, confusion_matrix
# orders classes by sorted label value, and the hard-coded tick labels below
# would silently mislabel the axes if that order were ever different.
class_order = ["fake", "real"]
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_order)

# Visualizing Confusion Matrix (tick labels derived from the pinned order)
tick_labels = [label.title() for label in class_order]
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
plt.show()



In [None]:
# Step 10: Example Predictions
# NOTE(review): this relies on `tfidf_vectorizer` being the vectorizer whose
# features `model` was trained on — confirm both come from the same pipeline.
print("\nExample Predictions:")
sample_texts = [
    "COVID-19 vaccines are safe and effective.",
    "The government is hiding alien technology.",
    "CDC reports a decline in daily COVID cases.",
    "New study shows chocolate cures cancer."
]

# Vectorise the hand-written samples and classify them in one batch.
sample_tfidf = tfidf_vectorizer.transform(sample_texts)
sample_preds = model.predict(sample_tfidf)

# Print each sample next to its predicted label.
for position, text in enumerate(sample_texts):
    pred = sample_preds[position]
    print(f"Text: {text} -> Predicted Label: {pred}")
