Importing Libraries and Mounting Google Drive

In [None]:
# Importing required libraries
import os
import pandas as pd
from google.colab import drive

# Mounting Google Drive at /content/drive so files stored there can be read by path.
# The `google.colab` module is Colab-specific, so this cell only runs inside Google Colab.
drive.mount('/content/drive')

Loading the Dataset

In [None]:
# Folder on the mounted Drive that holds the project files.
# Adjust this to wherever the CSV lives in your own Drive.
data_root = '/content/drive/MyDrive/Colab Notebooks/Project 4'
dataset_path = os.path.join(data_root, 'Data_fake_or_real_news.csv')

# Load the CSV into a pandas DataFrame and show it as the cell output
df = pd.read_csv(filepath_or_buffer=dataset_path)
df

In [None]:
# Counting how many rows carry each label value (real vs. fake)
df['label'].value_counts()

In [None]:
# Dropping columns that are not used as features for classification.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
df = df.drop(columns=['id', 'title'], errors='ignore')

In [None]:
# Displaying the DataFrame again to confirm which columns remain after the drop
df

<h3>Train/test split</h3>

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=25,
)

In [None]:
# Previewing the test-split texts (the held-out 20% of the rows)
X_test

In [None]:
# Previewing the training-split texts
X_train

In [None]:
# Previewing the training-split labels
y_train

In [None]:
# Checking data structure: `.values` exposes the training texts as an array
X_train.values

In [None]:
# Shape of the training-text array
X_train.values.shape

In [None]:
# Type of the object returned by `.values`
type(X_train.values)

# Bag of Words

In [None]:
# Initializing CountVectorizer to convert text to bag-of-words representation
from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): `countV` is not referenced by any later cell visible here; the
# vectorizer that actually gets fitted is `v`, created in the next cell.
countV = CountVectorizer()

In [None]:
# Build the vectorizer, learn the vocabulary from the training texts and
# encode those texts as a term-count matrix in a single step
v = CountVectorizer()
X_train_cv = v.fit_transform(raw_documents=X_train)

# Show the resulting matrix as the cell output
X_train_cv

In [None]:
# Converting sparse matrix to array for inspection.
# Only the first few rows are densified: calling .toarray() on the whole
# document-term matrix allocates a full (documents x vocabulary) dense array
# just to print a truncated preview of it.
X_train_cv[:5].toarray()

In [None]:
# Checking the shape of the vectorized training data (rows = training texts, columns = vocabulary terms)
X_train_cv.shape

In [None]:
# Viewing the vector representation of the first training sample
X_train_cv[0].toarray()   # Number of columns = number of unique words

In [None]:
# Getting the vocabulary list (the feature names learned by the fitted vectorizer `v`)
v.get_feature_names_out()

In [None]:
# Wrap the dense count matrix in a DataFrame whose column labels are the vocabulary terms
Train_df = pd.DataFrame(
    data=X_train_cv.toarray(),
    columns=v.get_feature_names_out(),
)
Train_df

In [None]:
# Remove the cap on how many columns pandas renders when displaying a DataFrame.
# NOTE(review): this is a global setting, so every wide frame displayed afterwards
# is rendered with all of its columns.
pd.options.display.max_columns = None

In [None]:
# Displaying the bag-of-words DataFrame again, now with the column display limit removed
Train_df

In [None]:
# Viewing vocabulary mapping learned by the fitted vectorizer (word to index)
v.vocabulary_

<h3>Train the Naive Bayes model</h3>

In [None]:
# Re-displaying the training labels that the classifier is fitted against
y_train

In [None]:
# Initializing and training the Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
# Fit directly on the sparse count matrix rather than on the dense DataFrame:
# - it is the same representation later passed to score()/predict(), so the model
#   is not fitted with feature names and then queried without them (which makes
#   scikit-learn emit a feature-name mismatch warning);
# - it avoids pushing a fully densified (documents x vocabulary) table through fit().
model.fit(X_train_cv, y_train)

In [None]:
X_test    # Still raw text at this point; the test data needs to be converted to bag of words as well

<h3>Evaluate Performance</h3>

In [None]:
# Vectorizing the test data using the same vocabulary: `transform` (not
# `fit_transform`) is called so the already-fitted vectorizer `v` is reused as is
X_test_cv = v.transform(X_test)

In [None]:
# Previewing the vectorized test data as a dense array.
# Only the first few rows are densified; converting the whole sparse matrix
# would allocate a full (documents x vocabulary) array just for display.
X_test_cv[:5].toarray()

In [None]:
# Shape of the vectorized test data. The sparse matrix reports the same shape
# as its dense form, so there is no need to densify it first.
X_test_cv.shape

In [None]:
# Accuracy of the fitted model on the vectorized test texts and their labels
model.score(X=X_test_cv, y=y_test)

In [None]:
# Predict labels for the test texts, then print the classification report
# comparing them with the true labels
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)
print(classification_report(y_true=y_test, y_pred=y_pred))