# Model used for Comparison

### Data Preparation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load Dataset
true_data = pd.read_csv('gossipcop_real.csv')
fake_data = pd.read_csv('gossipcop_fake.csv')

# Generate labels True/Fake under new Target Column in 'true_data' and 'fake_data'
true_data['Target'] = ['True']*len(true_data)
fake_data['Target'] = ['Fake']*len(fake_data)

# Merge 'true_data' and 'fake_data', by random mixing into a single df called 'data'
fake_news_data = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)

# Preprocess the text data using TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X = vectorizer.fit_transform(fake_news_data['title']).toarray()
y = fake_news_data['Target']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### View Data

In [2]:
print(fake_news_data.shape)
fake_news_data.head()

(22140, 5)


Unnamed: 0,id,news_url,title,tweet_ids,Target
0,gossipcop-896820,https://www.businessinsider.com/richest-victor...,These are the 9 highest-paid Victoria's Secret...,935653397839130624\t935658070172151808\t935658...,True
1,gossipcop-914130,https://www.usmagazine.com/celebrity-news/news...,Inside Amy Schumer’s ‘Very Laid-Back’ Wedding ...,964237928636403712\t964238809016696838\t964238...,True
2,gossipcop-939986,https://www.townandcountrymag.com/society/trad...,Queen Elizabeth Keeps a Never-Before-Seen Phot...,1001863593636302850\t1001863633805238272\t1001...,True
3,gossipcop-908586,https://www.thisisinsider.com/rihanna-fashion-...,40 photos that show how Rihanna's style has ev...,956173097723449344\t956173808863469568\t956174...,True
4,gossipcop-851934,https://www.denofgeek.com/us/tv/law-order-true...,Law & Order True Crime: The Menendez Murders T...,863962967729164289\t863963158548971520\t863963...,True


## Decision Tree

In [3]:
# Train the Decision Tree Classifier
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train, y_train)

# Make predictions
y_pred = decision_tree_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.7940379403794038
Precision: 0.7886498800623719
Recall: 0.7940379403794038
F1 Score: 0.7910336231757119


## Naive Bayes

In [4]:
# Train the Naive Bayes Classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)

# Make predictions
y_pred = naive_bayes_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.8407859078590786
Precision: 0.8366952976160756
Recall: 0.8407859078590786
F1 Score: 0.8210536921244451


## Logistic Regression

In [5]:
# Train the Logistic Regression Classifier
logistic_regression_classifier = LogisticRegression(max_iter=1000)
logistic_regression_classifier.fit(X_train, y_train)

# Make predictions
y_pred = logistic_regression_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.8443992773261066
Precision: 0.8367266206580636
Recall: 0.8443992773261066
F1 Score: 0.8304945792841278


## LightGBM

In [6]:
# Train the LightGBM Classifier
lgbm_classifier = LGBMClassifier()
lgbm_classifier.fit(X_train, y_train)

# Make predictions
y_pred = lgbm_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

[LightGBM] [Info] Number of positive: 13427, number of negative: 4285
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38117
[LightGBM] [Info] Number of data points in the train set: 17712, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.758074 -> initscore=1.142147
[LightGBM] [Info] Start training from score 1.142147
Accuracy: 0.8446251129177959
Precision: 0.8376406156735272
Recall: 0.8446251129177959
F1 Score: 0.8297441206854065


## K-Nearest Neighbour 

In [7]:
# Train the K-Nearest Neighbors Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors
knn_classifier.fit(X_train, y_train)

# Make predictions
y_pred = knn_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.8139114724480578
Precision: 0.7984096414864092
Recall: 0.8139114724480578
F1 Score: 0.7960888887278706


## Support Vector Machines

In [8]:
# Train the Support Vector Machines Classifier
svm_classifier = SVC(kernel='linear')  # You can try different kernels like 'rbf', 'poly', etc.
svm_classifier.fit(X_train, y_train)

# Make predictions
y_pred = svm_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.8410117434507678
Precision: 0.8326968391539261
Recall: 0.8410117434507678
F1 Score: 0.8263713016842891


## Random Forest 

In [None]:
# Train the Random Forest Classifier
random_forest_classifier = RandomForestClassifier(n_estimators=100)  # You can adjust the number of estimators
random_forest_classifier.fit(X_train, y_train)

# Make predictions
y_pred = random_forest_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)