In [89]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


In [90]:
# Load the datasets
train_data = pd.read_csv("/content/train.csv")


In [91]:
print(train_data.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [92]:
# Check for missing values
print(train_data.isnull().sum())

# Remove rows with missing values
shuffled_data = train_data.dropna().reset_index(drop=True)

# Replace NaN values in the 'text' column with an empty string
shuffled_data['text'].fillna('', inplace=True)


id           0
title      558
author    1957
text        39
label        0
dtype: int64


In [93]:
# Assuming 'text' is the column containing the text data
X = shuffled_data['text']
y = shuffled_data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)

# SVM

In [94]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)
y_train_pred = svm_model.predict(X_train_tfidf)

train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

# Make predictions on the test set
y_pred = svm_model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.2f}")

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

import pickle
filename = 'SVM_model.sav'
pickle.dump(svm_model, open('/content/'+filename, 'wb'))


Training Accuracy: 0.98
Test Accuracy: 0.95


# NAIVE BAYES

In [95]:
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_train_pred_nb = nb.predict(X_train_tfidf)

train_accuracy_nb = accuracy_score(y_train, y_train_pred_nb)
print(f"Training Accuracy (Naive Bayes): {train_accuracy_nb:.2f}")

y_pred_nb = nb.predict(X_test_tfidf)

test_accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Test Accuracy (Naive Bayes): {test_accuracy_nb:.2f}")

precision_nb = precision_score(y_test, y_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')

print(f"Precision (Test): {precision_nb:.2f}")
print(f"Recall (Test): {recall_nb:.2f}")
print(f"F1 Score (Test): {f1_nb:.2f}")

filename = 'NaiveBayeModel.sav'
pickle.dump(nb, open('/content/'+filename, 'wb'))

Training Accuracy (Naive Bayes): 0.90
Test Accuracy (Naive Bayes): 0.89
Precision (Test): 0.89
Recall (Test): 0.89
F1 Score (Test): 0.89
