In [None]:
#importing the libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings that can signal
# real problems (e.g. deprecations) — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Open Colab's file-upload widget. `uploaded` is later indexed by file name and
# its value wrapped in io.BytesIO when the CSV is read.
from google.colab import files
uploaded = files.upload()




Saving IFND.csv to IFND (1).csv


In [None]:
# Read the uploaded CSV into a DataFrame.
# Colab saves a re-uploaded file under a new name (the upload log above shows
# "IFND.csv" being saved as "IFND (1).csv"), so a hard-coded 'IFND.csv' lookup
# raises KeyError on such runs. Prefer the expected name when it is present,
# otherwise fall back to the first file that was actually uploaded.
import io
uploaded_name = 'IFND.csv' if 'IFND.csv' in uploaded else next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]), encoding='latin1')

In [None]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Exploratory Data Analysis

In [None]:
# Count the null entries in each column and print the per-column totals.
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# Keep only the rows that have a 'Date' value, then discard the columns that
# are not used further on ('Date' itself included).
df = (
    df.dropna(subset=['Date'])
      .drop(columns=['id', 'Image', 'Web', 'Date'])
)

In [None]:
# Count the rows per value of 'Label' to see whether the classes are imbalanced.
label_counts = df['Label'].value_counts()
label_counts

PRE-PROCESSING

In [None]:
#Importing the libraries needed for pre-processing
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Lower-case every entry of the 'Statement' column; the "fact check" pattern
# applied in the next step is written in lower case.
df['Statement'] = df['Statement'].str.lower()

In [None]:
# Strip the phrase "fact check" (matched on word boundaries) from every
# statement. Per the original note, it appears in fake-news statements and
# could otherwise bias the model.
fact_check_pattern = re.compile(r'\bfact check\b')
df['Statement'] = df['Statement'].apply(lambda text: fact_check_pattern.sub('', text))


In [None]:
# Delete every character listed in string.punctuation from the statements.
# The translation table is the same for every row, so it is built once.
punctuation_table = str.maketrans('', '', string.punctuation)
df['Statement'] = df['Statement'].apply(lambda text: text.translate(punctuation_table))

In [None]:
# Downloading the NLTK resources required by the pre-processing cells
import nltk
from nltk.corpus import stopwords

# 'stopwords', 'wordnet' and 'punkt' are the resources the notebook originally
# fetched. 'omw-1.4' and 'punkt_tab' are added for robustness across versions:
# NOTE(review): newer NLTK releases load word_tokenize's data from 'punkt_tab',
# and some releases need 'omw-1.4' for WordNetLemmatizer — confirm against the
# installed version. On a release whose index lacks one of these ids,
# nltk.download() is expected to report the failure and return False rather
# than raise.
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop-word set (kept under its original name).
STOPWORDS = set(stopwords.words('english'))

In [None]:
# Drop English stop words from each statement: split on whitespace, filter,
# and re-join the surviving tokens with single spaces.
stop_words = set(stopwords.words('english'))


def _without_stopwords(text):
    """Return `text` with every token that is in `stop_words` removed."""
    return ' '.join(token for token in text.split() if token not in stop_words)


df['Statement'] = df['Statement'].apply(_without_stopwords)

In [None]:
# Lemmatize every whitespace-separated token of each statement with WordNet
# and re-join the results with single spaces.
lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Return `text` with each token replaced by `lemmatizer.lemmatize(token)`."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))


df['Statement'] = df['Statement'].apply(_lemmatize_text)

In [None]:
# Tokenize each cleaned statement with NLTK's word_tokenize and store the
# result in a new 'Tokenized' column ('Statement' itself is left unchanged).
df['Tokenized'] = df['Statement'].apply(word_tokenize)

In [None]:
#Performing a Wordcloud on statement column to see most repeated words as a visual
from wordcloud import WordCloud

In [None]:
# Concatenate all statements into one space-separated string for the word cloud.
text = ' '.join(df['Statement'])

In [None]:
# Build an 800x400 word cloud on a white background from the combined text.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

In [None]:
# Render the word cloud as an image without axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Bar chart of how many statements fall into each value of 'Category'.
category_counts = df['Category'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=category_counts.index, y=category_counts.values, ax=ax)
ax.set_title('Distribution of Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
# Tilt the category names so long labels do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# One-hot encode 'Category': one indicator column per distinct value, named
# 'Category_<value>', appended to df column-wise (aligned on the index).
category_dummies = pd.get_dummies(df['Category'], prefix='Category')
df = pd.concat([df, category_dummies], axis=1)

In [None]:
# One-hot encode 'Label' the same way; the resulting 'Label_Fake' column is
# used as the prediction target further down.
label_dummies = pd.get_dummies(df['Label'], prefix='Label')
df = pd.concat([df, label_dummies], axis=1)

In [None]:
# The indicator columns replace the original categorical columns; show the
# last three rows to confirm the new layout.
df = df.drop(columns=['Category', 'Label'])
df.tail(3)

FEATURE EXTRACTION using TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# Target: the 'Label_Fake' indicator column produced by the one-hot encoding.
y = df['Label_Fake']

# Choose the train/test rows BEFORE fitting the vectorizer. Fitting TF-IDF on
# the full corpus lets the held-out statements influence the vocabulary and
# the IDF weights (data leakage), which inflates the test scores.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Learn the vocabulary / IDF weights from the training statements only ...
vectorizer = TfidfVectorizer(max_features=1000)
vectorizer.fit(df['Statement'].iloc[train_rows])

# ... then project every statement into that feature space.
tfidf_features = vectorizer.transform(df['Statement'])

# Converting to sparse representation (CSR supports the row indexing below)
tfidf_features_sparse = csr_matrix(tfidf_features)

# Slice the train and test sets out of the full matrix. With shuffling, the
# split depends only on the number of rows and random_state, so a later
# train_test_split(X, y, test_size=0.2, random_state=42) selects the same rows.
X = tfidf_features_sparse
X_train, X_test = X[train_rows], X[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

Feature selection


In [None]:
from sklearn.feature_selection import SelectKBest, chi2
# Cap k at the number of columns actually present: the TF-IDF vocabulary can
# be smaller than its max_features limit, and a k larger than the feature
# count is rejected (or only warned about, depending on the scikit-learn
# version).
# NOTE(review): with k equal to the TF-IDF max_features (1000) this step keeps
# every column, and the *_selected matrices are not used by the model cells
# below (they fit on X_train) — confirm whether a smaller k was intended.
k = min(1000, X_train.shape[1])  # Select top k features
selector = SelectKBest(chi2, k=k)
# Fit the chi-squared ranking on the training split only, then apply the same
# column selection to the test split.
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

In [None]:
# Sentiment analysis with VADER: download the lexicon the analyzer needs.
nltk.download('vader_lexicon')

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer



In [None]:
# Single analyzer instance reused for every statement below.
sentimentanalyser = SentimentIntensityAnalyzer()


In [None]:
# Score every statement with VADER and keep only the 'compound' entry of the
# returned scores as the 'SentimentScore' column. A single column-wise apply
# replaces the row-by-row iterrows()/df.at loop: same values, without building
# a Series per row or writing one cell at a time.
df['SentimentScore'] = df['Statement'].apply(
    lambda statement: sentimentanalyser.polarity_scores(statement)['compound']
)
df.tail()

In [None]:
# Check whether statement length is related to the sentiment score:
# first record the character count of each cleaned statement.
df['Statement_Length'] = df['Statement'].apply(len)

In [None]:
# Correlation between statement length and sentiment score
# (Series.corr with its default method).
correlation = df['Statement_Length'].corr(df['SentimentScore'])

In [None]:
# Report the coefficient computed above.
print("Correlation:", correlation)

In [None]:
# Derive ratio features from the sentiment score.
# The extremes are computed once instead of once per row inside a lambda.
sentiment = df['SentimentScore']
max_sentiment = sentiment.max()
min_sentiment = sentiment.min()

# Calculating Emotional Ratio: the positive part of the score (negative
# scores become 0).
# NOTE(review): this equals the un-normalised numerator of Positive_Ratio —
# confirm that is the intended definition.
df['Emotional_Ratio'] = sentiment.clip(lower=0)

# Calculating Positive Ratio: positive part scaled by the largest score.
# Falls back to 0 when no score is positive (nothing to scale by).
df['Positive_Ratio'] = (sentiment.clip(lower=0) / max_sentiment) if max_sentiment > 0 else 0.0

# Calculating Negative Ratio: magnitude of the negative part scaled by the
# magnitude of the most negative score. The previous version divided by the
# MAXIMUM score while guarding on the minimum, which mis-scaled the ratio and
# divided by zero whenever no score was positive.
df['Negative_Ratio'] = (sentiment.clip(upper=0).abs() / abs(min_sentiment)) if min_sentiment < 0 else 0.0


In [None]:
# Inspect the first six rows, including the newly added feature columns.
df.head(6)

DATA MODELLING

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score

In [None]:
# Training the logistic regression model (default hyperparameters) on the
# TF-IDF training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

In [None]:
# Score the logistic-regression predictions against the held-out labels.
accuracy_reg, precision_reg, recall_reg, f1_score_reg = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Print the confusion matrix followed by each metric rounded to four decimals.
print("Logistic Regression Results: ")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
for metric_label, metric_value in (
    ('Accuracy', accuracy_reg),
    ('Precision', precision_reg),
    ('Recall', recall_reg),
    ('F1_score', f1_score_reg),
):
    print(f'{metric_label}: {metric_value:.4f}')

In [None]:
#Naive Bayes
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rebuild the train/test split from the TF-IDF matrix and the 'Label_Fake' target.
# NOTE(review): same inputs, test_size and random_state as the split made right
# after feature extraction, so this looks redundant — confirm before removing.
X = tfidf_features_sparse
y = df['Label_Fake']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a multinomial Naive Bayes classifier (default hyperparameters) on the training split.
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test split.
y_pred_nb = naive_bayes_model.predict(X_test)

In [None]:
# Score the Naive Bayes predictions against the held-out labels.
accuracy_nb, precision_nb, recall_nb, f1_score_nb = (
    metric(y_test, y_pred_nb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Print the confusion matrix followed by each metric at full precision.
print("Naive Bayes Results:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))
for metric_label, metric_value in (
    ("Accuracy", accuracy_nb),
    ("Precision", precision_nb),
    ("Recall", recall_nb),
    ("F1 Score", f1_score_nb),
):
    print(f"{metric_label}:", metric_value)

In [None]:
#SVM
from sklearn.svm import SVC

# Fit a support vector classifier with default hyperparameters on the training split.
svm_model = SVC()
svm_model.fit(X_train, y_train)

# Label the held-out split with the fitted classifier.
y_pred_svm = svm_model.predict(X_test)

# Score the predictions against the held-out labels.
accuracy_svm, precision_svm, recall_svm, f1_score_svm = (
    metric(y_test, y_pred_svm)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the confusion matrix followed by each metric at full precision.
print("Support Vector Machine Results:")
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred_svm))
for metric_label, metric_value in (
    ("Accuracy", accuracy_svm),
    ("Precision", precision_svm),
    ("Recall", recall_svm),
    ("F1 Score", f1_score_svm),
):
    print(f"{metric_label}:", metric_value)

HYPER-PARAMETER TUNING OF SVM

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for SVM.
# gamma has no effect on the linear kernel, so crossing it with
# kernel='linear' only repeats identical fits. Listing one sub-grid per kernel
# cuts the search from 18 to 12 candidates (90 -> 60 fits with cv=5); the
# recorded run of the full cross-product grid ended in a KeyboardInterrupt.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.1, 1, 'scale']},
]

# Create the SVM model
svm_model = SVC()

# Perform grid search cross-validation; n_jobs=-1 runs the independent fits in
# parallel on all available cores.
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best SVM model (refitted on the whole training split by GridSearchCV)
best_svm_model = grid_search.best_estimator_

# Predict on test set using the best model
y_pred_svm = best_svm_model.predict(X_test)

# Evaluate the best SVM model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_score_svm = f1_score(y_test, y_pred_svm)

print("Support Vector Machine Results (with Hyperparameter Tuning):")
# Confusion matrix included for consistency with the other model reports.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1 Score:", f1_score_svm)

# Get the best hyperparameters ('gamma' is reported only for the rbf kernel)
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


KeyboardInterrupt: ignored