In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training CSV from the notebook-relative Data folder.
df = pd.read_csv(filepath_or_buffer='./Data/train.csv')
# Uncomment to preview the first rows:
# df.head(5)

In [None]:
# Join title and body into one text field, separated by a single space.
# A NaN in either column propagates, leaving 'content' NaN for that row.
df['content'] = df['title'].add(' ').add(df['text'])

In [None]:
# Only 'content' and the remaining columns are needed from here on;
# discard the raw fields that were merged into 'content' plus the identifiers.
unused_columns = ['author', 'title', 'text', 'id']
df = df.drop(columns=unused_columns)

In [None]:
# Per-column count of missing values.
df.isnull().sum()

In [None]:
# Work on the first 1000 rows only (all columns).
sample_df = df.head(1000)

In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from wordcloud import WordCloud
from spacy import displacy

# Load the small English spaCy pipeline; its token lemmas are used below.
nlp = spacy.load("en_core_web_sm")

# Drop rows with NaN values in 'content' column.
# The explicit .copy() detaches the result from the slice of `df` that
# `sample_df` was taken from, so the column assignment below writes to an
# independent frame instead of raising pandas' SettingWithCopyWarning.
sample_df = sample_df.dropna(subset=['content']).copy()

# Tokenization and Lemmatization using spaCy.
# nlp.pipe processes the texts as a batched stream, which spaCy documents as
# more efficient than calling nlp(text) once per row; each document is reduced
# to its lemmas joined by single spaces.
sample_df['tokenized_content'] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(sample_df['content'])
]

# Split the DataFrame into training and testing sets (80/20, fixed seed so the
# split is reproducible across runs).
X_train, X_test, y_train, y_test = train_test_split(
    sample_df['tokenized_content'],
    sample_df['label'],
    test_size=0.20,
    random_state=42,
)

# TF-IDF Vectorization: the vocabulary and IDF weights are learned from the
# training split only, then applied unchanged to the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# print(f"Accuracy: {accuracy}")
# print(f"Precision: {precision}")
# print(f"Recall: {recall}")
# print(f"F1 Score: {f1}")
# print(f"Confusion Matrix:\n{conf_matrix}")

# # Confusion Matrix Visualization
# plt.figure(figsize=(8, 6))
# sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False)
# plt.title("Confusion Matrix")
# plt.xlabel("Predicted")
# plt.ylabel("True")
# plt.show()

# # Word Cloud for Fake News
# fake_news_words = " ".join(sample_df[sample_df['label'] == 1]['tokenized_content'])
# wordcloud = WordCloud(width=800, height=400, max_words=150, background_color='white').generate(fake_news_words)

# plt.figure(figsize=(10, 8))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.title('Word Cloud for Fake News')
# plt.show()

In [None]:
from sklearn.metrics import accuracy_score,precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Registry of the candidate classifiers, keyed by the short label used in the
# results table. Ensemble models share n_estimators=50 and random_state=2.
clfs = {
    'SVC': SVC(kernel='sigmoid', gamma=1.0),
    'KN': KNeighborsClassifier(),
    'NB': MultinomialNB(),
    'DT': DecisionTreeClassifier(max_depth=5),
    'LR': LogisticRegression(solver='liblinear', penalty='l1'),
    'RF': RandomForestClassifier(n_estimators=50, random_state=2),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=2),
    'Bgc': BaggingClassifier(n_estimators=50, random_state=2),
    'ETC': ExtraTreesClassifier(n_estimators=50, random_state=2),
    'GBDT': GradientBoostingClassifier(n_estimators=50, random_state=2),
    'xgb': XGBClassifier(n_estimators=50, random_state=2),
}

# Expose each estimator under its own name as well (same objects as in `clfs`,
# unpacked in the dict's insertion order).
svc, knc, mnb, dtc, lrc, rfc, abc, bc, etc, gdbt, xgb = clfs.values()

def train_classifier(clf,x_train,y_train,x_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train : training features and labels passed to ``fit``.
    x_test, y_test : test features passed to ``predict`` and the true labels
        the predictions are scored against.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, confusion_matrix)``, in that order.
    """
    clf.fit(x_train, y_train)
    predictions = clf.predict(x_test)

    # Every scorer is called as scorer(true_labels, predicted_labels); the
    # tuple order below fixes the order of the returned values.
    scorers = (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

# Train and score every registered classifier on the same TF-IDF split,
# collecting one list per metric (in the order train_classifier returns them).
metric_columns = ([], [], [], [], [])
for model in clfs.values():
    scores = train_classifier(model, X_train_tfidf, y_train, X_test_tfidf, y_test)
    for column, score in zip(metric_columns, scores):
        column.append(score)
accuracy_scores, precision_scores, recall_scores, f1_scores, conf_matrix_scores = metric_columns

# One row per classifier, best precision first.
performance_table = {
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
    'F1': f1_scores,
    'conf_matrix': conf_matrix_scores,
}
performance_df = pd.DataFrame(performance_table).sort_values('Precision', ascending=False)
performance_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Per-classifier test metrics, listed in descending order of precision.
# NOTE(review): these numbers look like a hand-copied snapshot of an earlier
# `performance_df` -- confirm, or build the figures from `performance_df`
# directly so they track re-runs.
metrics_snapshot = {
    'Algorithm': ['NB', 'ETC', 'AdaBoost', 'GBDT', 'Bgc', 'xgb', 'DT', 'RF', 'SVC', 'LR', 'KN'],
    'Accuracy': [0.692308, 0.861538, 0.943590, 0.943590, 0.923077, 0.933333, 0.887179, 0.841026, 0.851282, 0.830769, 0.646154],
    'Precision': [1.000000, 0.946667, 0.936842, 0.936842, 0.934066, 0.926316, 0.860000, 0.846154, 0.835052, 0.790476, 0.588652],
    'Recall': [0.361702, 0.755319, 0.946809, 0.946809, 0.904255, 0.936170, 0.914894, 0.819149, 0.861702, 0.882979, 0.882979],
    'F1': [0.531250, 0.840237, 0.941799, 0.941799, 0.918919, 0.931217, 0.886598, 0.832432, 0.848168, 0.834171, 0.706383]
}

# Use a dedicated name rather than `df`: rebinding `df` here would replace the
# news dataset loaded at the top of the notebook and break any re-run of the
# cells that read it. The algorithm label becomes the index so it can serve as
# the category axis of every chart.
metrics_df = pd.DataFrame(metrics_snapshot).set_index('Algorithm')

# (column in metrics_df, panel title) for each horizontal bar chart.
metric_panels = [
    ('Accuracy', 'Accuracy'),
    ('Precision', 'Precision'),
    ('Recall', 'Recall'),
    ('F1', 'F1 Score'),
]

# Plotting: one panel per metric on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, (column, title) in zip(axes.ravel(), metric_panels):
    sns.barplot(x=metrics_df[column], y=metrics_df.index, palette='viridis', ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

# Heatmap for Confusion Matrices.
# Entries are positional: the i-th matrix belongs to the i-th row of
# `metrics_df`.
conf_matrices = [
    [[101, 0], [60, 34]],
    [[97, 4], [23, 71]],
    [[95, 6], [5, 89]],
    [[95, 6], [5, 89]],
    [[95, 6], [9, 85]],
    [[94, 7], [6, 88]],
    [[87, 14], [8, 86]],
    [[87, 14], [17, 77]],
    [[85, 16], [13, 81]],
    [[79, 22], [11, 83]],
    [[43, 58], [11, 83]]
]

# The two hand-maintained tables must stay aligned, otherwise a heatmap would
# be silently dropped or titled with the wrong algorithm.
assert len(conf_matrices) == len(metrics_df), "one confusion matrix per algorithm expected"

fig, axes = plt.subplots(3, 4, figsize=(15, 10))
for ax, algorithm, matrix in zip(axes.ravel(), metrics_df.index, conf_matrices):
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", cbar=False, linewidths=.5, ax=ax)
    ax.set_title(f'Confusion Matrix - {algorithm}')
    # Rows are true labels and columns are predictions: row sums reproduce the
    # recall values in the table above.
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')

# The 3x4 grid has more cells than there are classifiers; hide the leftovers.
for ax in axes.ravel()[len(conf_matrices):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()
