In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib

In [None]:
# Step 2: Load the datasets
# Fake.csv is parsed with the Python engine and rows that cannot be parsed are
# skipped instead of raising; True.csv is read with pandas' defaults.
data_fake = pd.read_csv(
    '/content/Fake.csv',
    engine='python',
    on_bad_lines='skip',
)
data_true = pd.read_csv('/content/True.csv')
print("Datasets loaded successfully.")

Datasets loaded successfully.


In [None]:
# Step 3: Add class labels
# Label convention used by the rest of the notebook: 0 = fake, 1 = true.
for frame, label in ((data_fake, 0), (data_true, 1)):
    frame["class"] = label
print("Class labels added.")

Class labels added.


In [None]:
# Step 4: Hold out the last 10 rows of each dataset for manual testing and
# remove them from the main dataframes.
# .copy() turns each hold-out set into an independent DataFrame, so the label
# assignments below no longer write into a slice of the parent frame (which is
# what raised pandas' SettingWithCopyWarning).
# NOTE: this cell is not idempotent - every re-run trims another 10 rows from
# data_fake / data_true, so re-run from Step 2 when repeating it.
data_fake_manual_testing = data_fake.tail(10).copy()
data_fake = data_fake.iloc[:-10]
data_true_manual_testing = data_true.tail(10).copy()
data_true = data_true.iloc[:-10]

# Keep the labels explicit on the hold-out sets (0 = fake, 1 = true).
data_fake_manual_testing['class'] = 0
data_true_manual_testing['class'] = 1
print("Manual testing data separated.")

Manual testing data separated.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_fake_manual_testing['class'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_true_manual_testing['class'] = 1


In [None]:
# Step 5: Merge the datasets
# Stack the fake and true frames on top of each other (row-wise); each frame
# keeps its own original row index at this point.
data_merge = pd.concat(objs=[data_fake, data_true], axis='index')
print("Datasets merged.")
display(data_merge.head())

Datasets merged.


Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [None]:
# Step 6: Drop unnecessary columns
# Only the article body ('text') and the label ('class') are kept.
data = data_merge.drop(columns=['title', 'subject', 'date'])
print("Unnecessary columns dropped.")
# Total count of missing cells across every remaining column.
missing_total = data.isna().sum().sum()
print("Missing values:", missing_total)

Unnecessary columns dropped.
Missing values: 0


In [None]:
# Step 7: Shuffle the data and reset index
# frac=1 samples every row once, i.e. a full shuffle. The fixed random_state
# makes the row order - and therefore the seeded train/test split that follows -
# reproducible across Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Data shuffled and index reset.")
display(data.head())

Data shuffled and index reset.


Unnamed: 0,text,class
0,ZURICH (Reuters) - Neutral Switzerland is in t...,1
1,They laughed at us when we said @realDonaldTru...,0
2,Democrats want to spend a whopping $2 billion ...,0
3,Dinesh D Souza s Hillary s America will debu...,0
4,Sometimes all you can do is laugh at the stupi...,0


In [None]:
# Step 8: Define text preprocessing function
def wordopt(text):
    """Clean a raw news article so it can be fed to the TF-IDF vectorizer.

    The steps run in this order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets;
      3. drop URLs (http://..., https://..., www....);
      4. drop HTML-style tags (<...>);
      5. replace every remaining non-word character with a space;
      6. drop leftover punctuation (in practice only underscores survive step 5);
      7. drop every word that contains a digit.

    URLs and tags must be removed *before* step 5: once ':', '/', '.', '<' and
    '>' have been turned into spaces those patterns can no longer match.

    Args:
        text: The article text as a str.

    Returns:
        The cleaned text as a str. Whitespace is not collapsed, so removed
        tokens may leave runs of spaces behind.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # The replacement has to be a str: a bytes replacement with a str pattern
    # raises TypeError as soon as a tag is actually matched.
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

print("Preprocessing function defined.")

Preprocessing function defined.


In [None]:
# Step 9: Apply preprocessing to the text column
# Run wordopt over every article, element by element, and store the cleaned
# text back in the same column.
data['text'] = data['text'].map(wordopt)
print("Text data preprocessed.")
display(data.head())

Text data preprocessed.


Unnamed: 0,text,class
0,zurich reuters neutral switzerland is in t...,1
1,they laughed at us when we said realdonaldtru...,0
2,democrats want to spend a whopping billion o...,0
3,dinesh d souza s hillary s america will debu...,0
4,sometimes all you can do is laugh at the stupi...,0


In [None]:
# Step 10: Split data into training and testing sets
# x holds the cleaned article text (features), y the 0/1 class labels.
x, y = data['text'], data['class']
# 75% train / 25% test; the fixed seed keeps the split itself repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.25,
)
print("Data split into training and testing sets.")
print("Training data shape:", x_train.shape)
print("Testing data shape:", x_test.shape)

Data split into training and testing sets.
Training data shape: (33658,)
Testing data shape: (11220,)


In [None]:
# Step 11: Vectorize the text data using TfidfVectorizer
vectorization = TfidfVectorizer()
# Fit on the training articles only, then reuse the fitted vectorizer for the
# test articles so both matrices share the same feature columns.
xv_train = vectorization.fit_transform(raw_documents=x_train)
xv_test = vectorization.transform(raw_documents=x_test)
print("Text data vectorized using TF-IDF.")

Text data vectorized using TF-IDF.


In [None]:
# Step 12: Train the models
print("Training models...")

# Instantiate all four classifiers up front; the three tree-based models are
# seeded so repeated runs build the same trees.
LR = LogisticRegression()
DT = DecisionTreeClassifier(random_state=42)
GB = GradientBoostingClassifier(random_state=42)
RF = RandomForestClassifier(random_state=42)

# Fit them one after another on the same TF-IDF training matrix, reporting
# each one as it finishes.
for estimator, done_message in (
    (LR, "Logistic Regression model trained."),
    (DT, "Decision Tree model trained."),
    (GB, "Gradient Boosting model trained."),
    (RF, "Random Forest model trained."),
):
    estimator.fit(xv_train, y_train)
    print(done_message)

Training models...
Logistic Regression model trained.
Decision Tree model trained.
Gradient Boosting model trained.
Random Forest model trained.


In [None]:
# Step 13: Evaluate the models (Optional, already done in original notebook)
# Score each fitted classifier on the held-out test matrix and print the
# result under its own label.
print("\nModel Evaluation:")
for score_label, fitted_model in (
    ("Logistic Regression Score:", LR),
    ("Decision Tree Score:", DT),
    ("Gradient Boosting Score:", GB),
    ("Random Forest Score:", RF),
):
    print(score_label, fitted_model.score(xv_test, y_test))


Model Evaluation:
Logistic Regression Score: 0.9854723707664884
Decision Tree Score: 0.9953654188948307
Gradient Boosting Score: 0.9957219251336898
Random Forest Score: 0.9890374331550802


In [None]:
# Step 14: Save the models and vectorizer
# Each fitted object is written to its own .pkl file in the working directory.
# The vectorizer is saved alongside the models because new text has to be
# transformed with it before any of the models can predict.
for artifact, filename in (
    (LR, 'logistic_regression_model.pkl'),
    (DT, 'decision_tree_model.pkl'),
    (GB, 'gradient_boosting_model.pkl'),
    (RF, 'random_forest_model.pkl'),
    (vectorization, 'tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, filename)
print("\nModels and vectorizer saved successfully as .pkl files.")


Models and vectorizer saved successfully as .pkl files.


In [None]:
# Step 15: Load the models and vectorizer in a new environment
# This part simulates loading in a new script or notebook
# NOTE(review): joblib files are pickles - only load files you produced
# yourself or otherwise trust.
print("\nLoading models and vectorizer in a simulated new environment...")
try:
    loaded_lr_model = joblib.load('logistic_regression_model.pkl')
    loaded_dt_model = joblib.load('decision_tree_model.pkl')
    loaded_gb_model = joblib.load('gradient_boosting_model.pkl')
    loaded_rf_model = joblib.load('random_forest_model.pkl')
    loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')
except FileNotFoundError:
    # Best effort: report the problem and carry on. The vectorizer is loaded
    # last, so on a fresh kernel it stays undefined if any file is missing.
    print("Make sure the .pkl model and vectorizer files are in the correct directory.")
else:
    print("Models and vectorizer loaded successfully.")


Loading models and vectorizer in a simulated new environment...
Models and vectorizer loaded successfully.


In [None]:
# Step 16: Define functions for prediction with loaded models

def output_lable(n):
    """Translate a numeric class label into its display text.

    Args:
        n: Predicted class label; 0 stands for fake news, 1 for genuine news.

    Returns:
        "Fake News" when n equals 0, "Not A Fake News" when n equals 1, and
        None for any other value.
    """
    if n == 0:
        return "Fake News"
    if n == 1:
        return "Not A Fake News"
    return None

def manual_testing_loaded(news, vectorizer, lr_model, dt_model, gb_model, rf_model):
    """Classify one news article with each of the four models and print the results.

    Args:
        news: Raw article text (str).
        vectorizer: Fitted vectorizer exposing ``transform``.
        lr_model: Fitted logistic regression model exposing ``predict``.
        dt_model: Fitted decision tree model exposing ``predict``.
        gb_model: Fitted gradient boosting model exposing ``predict``.
        rf_model: Fitted random forest model exposing ``predict``.

    Returns:
        A 4-tuple with the predicted label from the LR, DT, GB and RF model,
        in that order.
    """
    # Clean the article with the same wordopt function used on the training
    # data, then vectorise it as a one-document batch.
    cleaned_news = wordopt(news)
    features = vectorizer.transform([cleaned_news])

    # One prediction per model, in LR, DT, GB, RF order.
    classifiers = (lr_model, dt_model, gb_model, rf_model)
    votes = tuple(clf.predict(features)[0] for clf in classifiers)

    report = "\nLR Predicition: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction:{}"
    print(report.format(*(output_lable(vote) for vote in votes)))
    return votes

def final_verdict_loaded(pred_lr, pred_dt, pred_gb, pred_rf):
    """Combine the four model predictions into a single verdict by vote count.

    Args:
        pred_lr: Label (0 or 1) predicted by the logistic regression model.
        pred_dt: Label (0 or 1) predicted by the decision tree model.
        pred_gb: Label (0 or 1) predicted by the gradient boosting model.
        pred_rf: Label (0 or 1) predicted by the random forest model.

    Returns:
        "Final Verdict: Fake News" when strictly more models voted 0 than 1;
        otherwise "Final Verdict: Not A Fake News" (this includes a 2-2 tie).

    Raises:
        KeyError: If a prediction is neither 0 nor 1.
    """
    tally = {0: 0, 1: 0}
    for vote in (pred_lr, pred_dt, pred_gb, pred_rf):
        tally[vote] += 1

    fake_wins = tally[0] > tally[1]
    return "Final Verdict: Fake News" if fake_wins else "Final Verdict: Not A Fake News"

print("Prediction functions defined.")

Prediction functions defined.


In [None]:
# Step 17: Test with new news article using loaded models
# Only proceed when the loading cell actually defined the vectorizer and the
# logistic regression model in this namespace.
models_ready = 'loaded_vectorizer' in locals() and 'loaded_lr_model' in locals()
if not models_ready:
    print("Models and vectorizer were not loaded. Please run the loading cell.")
else:
    news_article = str(input("Enter the news article you want to test with loaded models: "))
    lr_pred, dt_pred, gb_pred, rf_pred = manual_testing_loaded(
        news_article,
        loaded_vectorizer,
        loaded_lr_model,
        loaded_dt_model,
        loaded_gb_model,
        loaded_rf_model,
    )
    final_result = final_verdict_loaded(lr_pred, dt_pred, gb_pred, rf_pred)
    print(final_result)

Enter the news article you want to test with loaded models: Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and the very dishonest fake news media. The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year, President Angry Pants tweeted. 2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America! Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infanti

## Code Explanation

This notebook implements a fake news detection system using various machine learning models. Here's a breakdown of each step and the code blocks involved:

**Step 1: Import necessary libraries**

This cell imports all the required libraries for data manipulation, visualization, text preprocessing, model training, and saving/loading models.

- `pandas` for data handling and manipulation.
- `matplotlib.pyplot`, `numpy`, and `seaborn` for potential data visualization and numerical operations (though visualization is not explicitly used in the final prediction part).
- `string` and `re` for text preprocessing (removing punctuation, special characters, etc.).
- `sklearn` modules for:
    - `train_test_split`: Splitting data into training and testing sets.
    - `accuracy_score` and `classification_report`: Evaluating model performance.
    - `TfidfVectorizer`: Converting text data into numerical features using TF-IDF.
    - `LogisticRegression`, `DecisionTreeClassifier`, `GradientBoostingClassifier`, `RandomForestClassifier`: The machine learning models used for classification.
- `joblib`: For saving and loading trained machine learning models and the vectorizer.

**Step 2: Load the datasets**

This cell loads the "Fake.csv" and "True.csv" datasets into pandas DataFrames.

- `pd.read_csv()` is used to read the CSV files.
- `on_bad_lines='skip'` and `engine='python'` are used for the "Fake.csv" to handle potential issues with the file format.

**Step 3: Add class labels**

This cell adds a new column named 'class' to each DataFrame to represent the authenticity of the news.

- `data_fake["class"] = 0`: Assigns the label 0 to all rows in the `data_fake` DataFrame, indicating fake news.
- `data_true['class'] = 1`: Assigns the label 1 to all rows in the `data_true` DataFrame, indicating true news.

**Step 4: Manually test the last 10 rows and remove them from the main dataframes**

This cell separates the last 10 rows of each dataset for manual testing. This is a common practice to have a small, unseen dataset to verify the model's performance on new data after training.

- `data_fake.tail(10)` and `data_true.tail(10)` select the last 10 rows.
- `data_fake[:-10]` and `data_true[:-10]` select all rows except the last 10, effectively removing them from the main DataFrames.
- Class labels are also added to the manual testing DataFrames.

**Step 5: Merge the datasets**

This cell combines the fake and true news DataFrames into a single DataFrame.

- `pd.concat([data_fake, data_true], axis=0)` concatenates the two DataFrames row-wise (`axis=0`).

**Step 6: Drop unnecessary columns**

This cell removes columns that are not needed for the classification task.

- `data_merge.drop(['title', 'subject', 'date'], axis=1)` drops the 'title', 'subject', and 'date' columns. `axis=1` specifies that columns are being dropped.
- It also checks for any missing values in the resulting DataFrame.

**Step 7: Shuffle the data and reset index**

This cell shuffles the combined data and resets the index. Shuffling is important to ensure that the training and testing sets are representative of the overall data and not biased by the original order.

- `data.sample(frac=1)` randomly samples the entire DataFrame (`frac=1` means 100% of the data), effectively shuffling it.
- `.reset_index(drop=True)` resets the index to a default integer index and drops the old index.

**Step 8: Define text preprocessing function**

This cell defines a function `wordopt` to clean and preprocess the text data.

- Converts text to lowercase.
- Removes any text enclosed in square brackets (regex `\[.*?\]`).
- Replaces non-word characters with spaces.
- Removes URLs.
- Removes HTML tags.
- Removes punctuation.
- Removes words containing digits.

**Step 9: Apply preprocessing to the text column**

This cell applies the `wordopt` function to the 'text' column of the DataFrame to preprocess the news articles.

- `data['text'].apply(wordopt)` applies the function to each element in the 'text' column.

**Step 10: Split data into training and testing sets**

This cell splits the preprocessed data into training and testing sets for model training and evaluation.

- `x` is assigned the 'text' column (features).
- `y` is assigned the 'class' column (labels).
- `train_test_split(x, y, test_size=0.25, random_state=42)` splits the data. `test_size=0.25` means 25% of the data will be used for testing, and `random_state=42` ensures reproducibility of the split.

**Step 11: Vectorize the text data using TfidfVectorizer**

This cell converts the text data into numerical features using the TF-IDF (Term Frequency-Inverse Document Frequency) vectorization method.

- `TfidfVectorizer()` initializes the vectorizer.
- `vectorization.fit_transform(x_train)` fits the vectorizer on the training data and transforms it into a TF-IDF matrix.
- `vectorization.transform(x_test)` transforms the testing data using the same vectorizer fitted on the training data; the result is stored in `xv_test`.

**Step 12: Train the models**

This cell trains four different machine learning models on the training data.

- `LogisticRegression()`, `DecisionTreeClassifier()`, `GradientBoostingClassifier()`, and `RandomForestClassifier()` initialize the models.
- `.fit(xv_train, y_train)` trains each model using the TF-IDF features (`xv_train`) and the corresponding labels (`y_train`).
- `random_state=42` is added to the tree-based models for reproducibility.

**Step 13: Evaluate the models (Optional, already done in original notebook)**

This cell evaluates the trained models on the testing data.

- `.score(xv_test, y_test)` calculates the accuracy of each model on the testing set.

**Step 14: Save the models and vectorizer**

This cell saves the trained models and the TF-IDF vectorizer to disk using `joblib`. This allows you to load and use the trained models later without retraining them.

- `joblib.dump(model, 'filename.pkl')` saves the specified model or vectorizer to a pickle file.

**Step 15: Load the models and vectorizer in a new environment**

This cell simulates loading the saved models and vectorizer in a new environment (e.g., a separate script for prediction).

- `joblib.load('filename.pkl')` loads the saved model or vectorizer from the pickle file.
- A `try-except` block is used to handle potential `FileNotFoundError` if the files are not found.

**Step 16: Define functions for prediction with loaded models**

This cell defines two functions for making predictions on new news articles using the loaded models.

- `output_lable(n)`: A helper function that returns "Fake News" if the prediction is 0 and "Not A Fake News" if the prediction is 1.
- `manual_testing_loaded(news, vectorizer, lr_model, dt_model, gb_model, rf_model)`:
    - Takes a news article string and the loaded vectorizer and models as input.
    - Creates a pandas DataFrame from the input news article.
    - Applies the `wordopt` preprocessing function to the news article.
    - Transforms the preprocessed news article using the loaded vectorizer.
    - Makes predictions using each of the loaded models.
    - Prints the predictions from each model.
    - Returns the individual model predictions.
- `final_verdict_loaded(pred_lr, pred_dt, pred_gb, pred_rf)`:
    - Takes the individual model predictions as input.
    - Counts the number of "Fake News" (0) and "Not A Fake News" (1) predictions.
    - Returns a "Final Verdict" based on the majority vote of the models; a 2–2 tie is reported as "Not A Fake News".

**Step 17: Test with new news article using loaded models**

This cell demonstrates how to use the loaded models to predict whether a new news article is fake or true.

- It checks if the models and vectorizer were successfully loaded.
- It prompts the user to enter a news article.
- It calls the `manual_testing_loaded` function to get predictions from each model.
- It calls the `final_verdict_loaded` function to get the final verdict based on the majority vote.
- It prints the final verdict.