In [None]:
import pandas as pd

# Read the two source files and tag every row with its class as it is loaded:
# 0 marks fake news, 1 marks real news.
fake_df = pd.read_csv('/content/drive/MyDrive/BDA_DATASET_NEW/fake.csv').assign(label=0)
true_df = pd.read_csv('/content/drive/MyDrive/BDA_DATASET_NEW/true.csv').assign(label=1)

# Stack both frames into a single table (ignore_index renumbers the rows
# 0..n-1), then shuffle every row (frac=1) so fake and real articles are
# interleaved. The fixed random_state keeps the shuffled order reproducible,
# and the final reset_index renumbers the rows again after the shuffle.
df = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the first rows to confirm the columns and labels look as expected
print(df.head())


                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      0  
1       April 5, 2017       1  
2  September 27, 2017       1  
3         May 22, 2017      0  
4       June 24, 2016       1  


In [None]:
from sklearn.model_selection import train_test_split

# Build one text field per article by joining the headline and the body.
# Missing values are replaced with an empty string first: a bare .astype(str)
# would turn NaN into the literal word "nan", which would then be treated as
# if it were part of the article text.
df['content'] = df['title'].fillna('').astype(str) + ' ' + df['text'].fillna('').astype(str)
X = df['content']  # model input: one raw-text string per article
y = df['label']    # target variable: 0 = fake news, 1 = real news

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle behind the split reproducible, i.e. the same rows land in the
# train and test sets every time the code is run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
#This will convert the text data into numerical form using the TF-IDF method (Term Frequency-Inverse Document Frequency).

from sklearn.linear_model import LogisticRegression
#This will be the model used to classify the news articles as fake or real.

from sklearn.pipeline import make_pipeline
#A utility function to create a pipeline that chains together multiple steps, making the workflow cleaner.

from sklearn.metrics import accuracy_score, classification_report
#These functions are used to evaluate the model’s performance.

# Assemble the classifier: a TF-IDF vectorizer (max_features=5000) feeding a
# logistic regression (max_iter=1000), chained into a single pipeline object.
model_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000),
)

# Fit on the training split, then predict labels for the held-out test split.
# fit() returns the fitted pipeline itself, so the two calls can be chained.
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Accuracy compares the predicted labels (y_pred) with the true labels (y_test).
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9863028953229399
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      4710
           1       0.98      0.99      0.99      4270

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [None]:
import pickle

# Persist the fitted pipeline as a pickle file; the path is relative, so the
# file is written to the current working directory.
# NOTE: only unpickle this file from a trusted source, since loading a pickle
# can execute arbitrary code.
model_path = 'model_pipeline.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(model_pipeline, model_file)
print(f"Model saved to {model_path}")


Model saved to model_pipeline.pkl
