In [None]:
!pip install scikit-learn pandas numpy nltk matplotlib seaborn tensorflow flask-ngrok


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok

# Step 1: Load Dataset
# Colab-only: prompts for files to upload into the working directory.
from google.colab import files
uploaded = files.upload()

# Assuming the dataset is in "Fake.csv" and "True.csv"
df_fake = pd.read_csv("Fake.csv")
df_real = pd.read_csv("True.csv")

# Add Labels to Datasets
df_fake['label'] = 0  # Fake news = 0
df_real['label'] = 1  # Real news = 1

# Combine and shuffle the two datasets.
# The shuffle is seeded (same seed as the train/test split and the model) so
# that re-running the notebook yields the same row order and the same results.
df = (
    pd.concat([df_fake, df_real], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Step 2: Preprocessing
# The stop-word list is needed by clean_text(); download it before first use.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalise one raw article string before vectorisation.

    Steps, in order: lowercase the text, drop every character that is neither
    alphanumeric nor whitespace (so punctuation is removed but digits are
    kept), then drop stop words and re-join the remaining tokens with single
    spaces.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as empty text instead of raising
            ``AttributeError``; any other non-string value is converted with
            ``str()`` first.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words`` set.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    lowered = str(text).lower()
    # Keep letters, digits and whitespace only.
    kept = ''.join(ch for ch in lowered if ch.isalnum() or ch.isspace())
    # split() with no argument also collapses runs of whitespace.
    return ' '.join(word for word in kept.split() if word not in stopword_set)

# Apply Cleaning
# Empty CSV cells are read as NaN (a float); replace them with empty strings
# and coerce everything to str so clean_text always receives text.
df['cleaned_text'] = df['text'].fillna('').astype(str).apply(clean_text)

# Step 3: Feature Extraction
vectorizer = TfidfVectorizer(max_features=5000)
# Keep the TF-IDF matrix sparse: converting it with .toarray() allocates a
# dense (n_documents x 5000) float array, which is mostly zeros and not needed
# by train_test_split or RandomForestClassifier.
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

# Step 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Model Building and Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6: Model Evaluation
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# Pin the label order so rows/columns always line up with the tick labels
# below (0 = Fake, 1 = Real), even if a class is absent from the test split.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Fake', 'Real'], yticklabels=['Fake', 'Real'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# ROC-AUC Score
# Column 1 of predict_proba is the probability of label 1 (real news).
roc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"ROC-AUC Score: {roc_score}")

# Step 7: Save the Model and Vectorizer
# Both artefacts are required at inference time: the vectorizer holds the
# fitted vocabulary/IDF weights that the model's features depend on.
joblib.dump(model, 'fake_news_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Step 8: Flask API
app = Flask(__name__)
run_with_ngrok(app)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the article text sent in a JSON POST body.

    Expects a JSON object with a string field ``text``. The text is cleaned
    with ``clean_text``, vectorised with the fitted ``vectorizer`` and scored
    by ``model``.

    Returns:
        On success, a JSON object with ``prediction`` (``'REAL'`` or
        ``'FAKE'``) and ``probability`` (``Fake`` / ``Real`` class
        probabilities). If the body is not a JSON object or ``text`` is
        missing or not a string, a JSON ``error`` message with HTTP status 400.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # raising, so malformed requests get a clear 400 rather than a server error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not isinstance(data.get('text'), str):
        return jsonify({'error': "Request body must be a JSON object with a string 'text' field."}), 400

    # Preprocess and vectorize
    cleaned_text = clean_text(data['text'])
    input_vector = vectorizer.transform([cleaned_text]).toarray()

    # Predict
    prediction = model.predict(input_vector)
    prob = model.predict_proba(input_vector)[0]

    return jsonify({
        'prediction': 'REAL' if prediction[0] == 1 else 'FAKE',
        'probability': {
            # Convert numpy scalars to plain Python floats before serialising.
            'Fake': float(prob[0]),
            'Real': float(prob[1])
        }
    })

# Start the Flask server only when this code is executed as the main program.
if __name__ == "__main__":
    app.run()


KeyboardInterrupt: 

In [None]:
from google.colab import files

# Prompt for the dataset files to upload.
uploaded = files.upload()

# The upload result is keyed by file name; echo each name as confirmation.
for filename in uploaded:
    print(f"Uploaded file: {filename}")


Saving Fake.csv to Fake (1).csv
Saving True - Copy.csv to True - Copy.csv
Uploaded file: Fake (1).csv
Uploaded file: True - Copy.csv


In [None]:
import os

# Report, for every uploaded file name, whether it is present on disk.
for filename in uploaded:
    is_present = os.path.exists(filename)
    message = (
        f"File {filename} exists in the current directory"
        if is_present
        else f"File {filename} not found"
    )
    print(message)


File Fake (1).csv exists in the current directory
File True - Copy.csv exists in the current directory


In [None]:
# Import necessary libraries
import pandas as pd
from google.colab import files
import os

# Upload the CSV files (Fake and True news datasets)
uploaded = files.upload()

# Verify that both files are uploaded
print(os.listdir())  # This will list all files in the current directory


def _resolve_upload(prefix, default):
    """Return the saved name of the upload whose name starts with `prefix`.

    Uploaded files are not always saved under the expected name (for example
    'True - Copy.csv' or 'Fake (2).csv'), so the name is looked up in the
    upload result instead of being hard-coded. The match is case-insensitive.
    Falls back to `default` when nothing in this upload matches.
    """
    for saved_name in uploaded:
        if saved_name.lower().startswith(prefix):
            return saved_name
    return default


# Now load the CSV files, using the names the uploads were actually saved under
fake_path = _resolve_upload('fake', 'Fake.csv')
real_path = _resolve_upload('true', 'True.csv')
df_fake = pd.read_csv(fake_path, low_memory=False)
df_real = pd.read_csv(real_path, low_memory=False)

# Print the first few rows of both datasets to inspect them
print("Fake news dataset:")
print(df_fake.head())
print("\nReal news dataset:")
print(df_real.head())

# Check the structure of the columns to ensure the data is loaded correctly
print("\nFake news columns:")
print(df_fake.columns)
print("\nReal news columns:")
print(df_real.columns)

# For simplicity, assuming the text column in both datasets is named 'text'
# You can modify it based on the actual column names

# Add a 'label' column to each dataset: 0 for fake news, 1 for real news
df_fake['label'] = 0  # Fake news gets a label of 0
df_real['label'] = 1  # Real news gets a label of 1

# Combine the two datasets into one
df = pd.concat([df_fake[['text', 'label']], df_real[['text', 'label']]])

# Shuffle the dataset to randomize the order (seeded so re-runs are reproducible)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the combined and shuffled dataset
print("\nCombined and shuffled dataset:")
print(df.head())


Saving Fake.csv to Fake (2).csv
Saving True - Copy.csv to True - Copy (1).csv
['.config', 'True - Copy (1).csv', 'Fake (1).csv', 'Fake.csv', 'True - Copy.csv', 'Fake (2).csv', 'sample_data']


  df_fake = pd.read_csv('Fake.csv')


FileNotFoundError: [Errno 2] No such file or directory: 'True.csv'

In [None]:
pip install pandas numpy




In [None]:
import pandas as pd
import os

# Step 1: Check the Current Working Directory
print("Current Working Directory:", os.getcwd())

# Step 2: If the files are not in the current directory, move them or use full paths
# Default locations (adjust the paths accordingly for local files)
fake_file_path = 'Fake.csv'
real_file_path = 'True.csv'

# For Google Colab
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab. Ensure files are in the specified paths.")
else:
    uploaded = files.upload()
    # Uploads may be saved under a different name than expected (for example
    # 'Fake (2).csv'), so take the paths from the upload result when a
    # matching file was provided; otherwise keep the defaults above.
    for saved_name in uploaded:
        lowered_name = saved_name.lower()
        if lowered_name.startswith('fake'):
            fake_file_path = saved_name
        elif lowered_name.startswith('true'):
            real_file_path = saved_name

# Step 3: Load Datasets
try:
    df_fake = pd.read_csv(fake_file_path, low_memory=False)
    df_real = pd.read_csv(real_file_path, low_memory=False)
    print("Files loaded successfully!")
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Ensure the files exist in the correct directory.")
    # Re-raise instead of calling exit(): this stops the cell with the original
    # traceback rather than asking the interpreter/kernel to terminate.
    raise

# Step 4: Keep Only Relevant Columns
# Rows with a missing title or text carry no usable content, so drop them
# here rather than writing NaN rows into the combined file.
df_fake = df_fake.loc[:, ['title', 'text']].dropna()  # Adjust these column names if needed
df_real = df_real.loc[:, ['title', 'text']].dropna()

# Step 5: Add Labels
df_fake['label'] = 0  # Fake news
df_real['label'] = 1  # Real news

# Step 6: Combine Datasets
df_combined = pd.concat([df_fake, df_real], ignore_index=True)

# Step 7: Shuffle the Combined Dataset
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Step 8: Save the Combined Dataset
combined_file_path = 'Combined_News.csv'
df_combined.to_csv(combined_file_path, index=False)
print(f"Combined dataset saved as '{combined_file_path}'.")

# Step 9: Confirm Data Saved
print("Combined Dataset Preview:")
print(df_combined.head())


Current Working Directory: /content


Saving Fake.csv to Fake.csv
Saving True.csv to True.csv
Files loaded successfully!
Combined dataset saved as 'Combined_News.csv'.
Combined Dataset Preview:
  title text  label
0   NaN  NaN      1
1   NaN  NaN      1
2   NaN  NaN      0
3   NaN  NaN      0
4   NaN  NaN      1


In [None]:
import pandas as pd

# Read both raw CSV files in full.
df_fake = pd.read_csv('Fake.csv', low_memory=False)
df_real = pd.read_csv('True.csv', low_memory=False)

# Show the column layout of each file before narrowing it down.
print("Fake News Dataset Columns:")
print(df_fake.columns)

print("\nReal News Dataset Columns:")
print(df_real.columns)

# Keep only title/text, discard rows missing either, and tag each source
# with its class label (0 = fake, 1 = real).
df_fake = df_fake[['title', 'text']].dropna().assign(label=0)
df_real = df_real[['title', 'text']].dropna().assign(label=1)

# Stack the two sources, then shuffle with a fixed seed and renumber the rows.
df_combined = (
    pd.concat([df_fake, df_real], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the result and show the first rows.
df_combined.to_csv('Combined_News.csv', index=False)
print("\nCombined dataset saved as 'Combined_News.csv'.")
print("\nCombined Dataset Preview:")
print(df_combined.head())


Fake News Dataset Columns:
Index(['title', 'text', 'subject', 'date', 'Unnamed: 4', 'Unnamed: 5',
       'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       ...
       'Unnamed: 162', 'Unnamed: 163', 'Unnamed: 164', 'Unnamed: 165',
       'Unnamed: 166', 'Unnamed: 167', 'Unnamed: 168', 'Unnamed: 169',
       'Unnamed: 170', 'Unnamed: 171'],
      dtype='object', length=172)

Real News Dataset Columns:
Index(['title', 'text', 'subject', 'date'], dtype='object')

Combined dataset saved as 'Combined_News.csv'.

Combined Dataset Preview:
                                               title  \
0  Corporate AMT likely will not be in final U.S....   
1  Pelosi says Democrats will not back short-term...   
2   Just Back From A Golfing Vacay, Trump Says He...   
3  Republican Senator Rubio will back tax bill: C...   
4   Trump: Everything You Heard Me Say On The Acc...   

                                                text  label  
0  WASHINGTON (Reuters) - The chairman of the U.S.

In [None]:
# Read the saved combined file back from disk and show its first five rows.
df_test = pd.read_csv("Combined_News.csv")
print(df_test.head(n=5))



                                               title  \
0  Corporate AMT likely will not be in final U.S....   
1  Pelosi says Democrats will not back short-term...   
2   Just Back From A Golfing Vacay, Trump Says He...   
3  Republican Senator Rubio will back tax bill: C...   
4   Trump: Everything You Heard Me Say On The Acc...   

                                                text  label  
0  WASHINGTON (Reuters) - The chairman of the U.S...      1  
1  WASHINGTON (Reuters) - U.S. House of Represent...      1  
2  Donald Trump just got back from binge-golfing ...      0  
3  WASHINGTON (Reuters) - U.S. Republican Senator...      1  
4  Former reality show star Donald Trump has repe...      0  


In [None]:
# Number of rows per class label in the combined dataset.
print(df_combined.loc[:, "label"].value_counts())


label
1    399
0    399
Name: count, dtype: int64


In [None]:
# Count of missing values in each column of the combined dataset.
print(df_combined.isna().sum())


title    0
text     0
label    0
dtype: int64


In [None]:
# Reload the raw fake-news file and show its first five rows.
df_fake = pd.read_csv("Fake.csv", low_memory=False)
print(df_fake.head(n=5))

# Same for the raw real-news file.
df_real = pd.read_csv("True.csv", low_memory=False)
print(df_real.head(n=5))


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject       date  \
0  Donald Trump just couldn t wish all Americans ...    News  31-Dec-17   
1  House Intelligence Committee Chairman Devin Nu...    News  31-Dec-17   
2  On Friday, it was revealed that former Milwauk...    News  30-Dec-17   
3  On Christmas day, Donald Trump announced that ...    News  29-Dec-17   
4  Pope Francis used his annual Christmas Day mes...    News  25-Dec-17   

  Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9  ...  \
0        NaN        NaN        NaN        NaN        NaN        NaN  ...   
1        NaN        NaN        NaN        NaN        NaN    

In [None]:
import pandas as pd

# Load the fake CSV and true CSV files.
# The files used elsewhere in this notebook are named 'Fake.csv' and
# 'True.csv'; the lowercase names do not exist and raised FileNotFoundError.
fake_df = pd.read_csv('Fake.csv', low_memory=False)
true_df = pd.read_csv('True.csv', low_memory=False)

# Align the fake CSV's columns with the true CSV's column names.
# Fake.csv can parse with extra trailing 'Unnamed: N' columns, in which case
# assigning true_df.columns directly fails with a length mismatch. Keep only
# the leading columns first, then rename them positionally.
fake_df = fake_df.iloc[:, :len(true_df.columns)].copy()
fake_df.columns = true_df.columns

# Concatenate the two DataFrames into one.
# NOTE(review): no label column is added here, so the combined file does not
# record which rows came from which source — confirm this is intended.
combined_df = pd.concat([true_df, fake_df], ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('combined.csv', index=False)

print("Files combined and saved as 'combined.csv'")


FileNotFoundError: [Errno 2] No such file or directory: 'fake.csv'