# Team Members

# Class Imbalance 2: Fake News Classification

# Load Required Libraries

In [4]:
import numpy as np
import pandas as pd
import re
from imblearn.over_sampling import RandomOverSampler
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Data Cleaning

### Read data and drop columns

In [5]:
df = pd.read_csv("fake.news.train.subset.csv")

In [6]:
df.dropna(inplace=True)
df.drop(columns=['id', 'tid1', 'tid2', 'title1_zh', 'title2_zh'], inplace=True) 
# remove ids and chinese (zhuang) titles from  data

In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    return text

In [8]:
df['title1_en'] = df['title1_en'].apply(clean_text)
df['title2_en'] = df['title2_en'].apply(clean_text)

In [9]:
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])

In [10]:
df['label'].value_counts()

label
2    13149
0     6326
1      525
Name: count, dtype: int64

### Create sparse matrix from title data

In [11]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['title1_en'] + " " + df['title2_en'])  # Combine both titles

In [12]:
encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])

### Random Over Sampler

In [13]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [14]:
unique_values, counts = np.unique(y_resampled, return_counts=True)
print(unique_values)
print(counts)

[0 1 2]
[13149 13149 13149]


### Class Weighting

In [17]:
class_distribution = df['label'].value_counts()

# Calculate inverse frequency weights
total_samples = class_distribution.sum()
class_weights = total_samples / (len(class_distribution) * class_distribution)

class_weights = class_weights.to_dict()

print("Class Weights:")
print(class_weights)

Class Weights:
{2: 0.5070094050244632, 0: 1.0538518284329224, 1: 12.698412698412698}


### One-Class learning

In [22]:
class_label = 0
class_df = df[df['label'] == class_label]

X_class = class_df.drop('label', axis=1)
y_class = class_df['label']

X_train_1c, X_test_1c, y_train_1c, y_test_1c = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

unique_values_set = set(y_train_1c)
for value in unique_values_set:
    print(value)

0


### Feature selection

In [25]:
selector = SelectKBest(score_func=f_classif, k=5)
X_resampled_selected = selector.fit_transform(X_resampled, y_resampled)
selected_feature_indices = selector.get_support(indices=True)
X_selected = X[:, selected_feature_indices]

### Create Train Test Split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

In [27]:
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_resampled_selected, y_resampled, test_size=0.3, random_state=42)

### Store cleaned dataset

In [13]:
dense_matrix = X_selected.toarray()
data = pd.DataFrame(dense_matrix, columns=[f'feature_{i}' for i in range(dense_matrix.shape[1])])
data['label'] = y
data.to_csv('fake.news.cleaned.csv', index=False)

# Model Training

## K-Nearest Neigbours

### with class imbalance

In [62]:
knn_classifier = KNeighborsClassifier()

In [63]:
knn_classifier.fit(X_train, y_train)

In [64]:
y_pred1 = knn_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred1))
print(classification_report(y_test, y_pred1))

Accuracy: 0.6533333333333333
              precision    recall  f1-score   support

           0       0.47      0.02      0.04      1926
           1       0.43      0.09      0.15       160
           2       0.66      0.99      0.79      3914

    accuracy                           0.65      6000
   macro avg       0.52      0.37      0.33      6000
weighted avg       0.59      0.65      0.53      6000



In [65]:
cv_scores = cross_val_score(knn_classifier, X_train, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.66071429 0.45214286 0.66214286 0.43928571 0.44714286]
Mean CV Score: 0.5322857142857143


### without class imbalance

In [66]:
knn_classifier.fit(X_train_rs, y_train_rs)

In [67]:
y_pred2_rs = knn_classifier.predict(X_test_rs)
print("Accuracy:", accuracy_score(y_test_rs, y_pred2_rs))
print(classification_report(y_test_rs, y_pred2_rs))

Accuracy: 0.5471060414026193
              precision    recall  f1-score   support

           0       0.65      0.02      0.05      3875
           1       0.91      0.67      0.77      3961
           2       0.42      0.93      0.58      3999

    accuracy                           0.55     11835
   macro avg       0.66      0.54      0.47     11835
weighted avg       0.66      0.55      0.47     11835



In [68]:
cv_scores = cross_val_score(knn_classifier, X_train_rs, y_train_rs, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.38312511 0.38565997 0.38409996 0.59706628 0.59815284]
Mean CV Score: 0.4696208335718735


The precesion for minority class has improved but as the performance overall is very poor we will move to another algorithm

## Logistic Regression

### with class imbalance

In [28]:
lg_classifier = LogisticRegression()
lg_classifier.fit(X_train, y_train)

In [29]:
y_pred5 = lg_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred5))
print(classification_report(y_test, y_pred5))

Accuracy: 0.6528333333333334
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1926
           1       0.80      0.03      0.05       160
           2       0.65      1.00      0.79      3914

    accuracy                           0.65      6000
   macro avg       0.48      0.34      0.28      6000
weighted avg       0.45      0.65      0.52      6000



In [30]:
cv_scores = cross_val_score(lg_classifier, X_train, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.65964286 0.65964286 0.66035714 0.65964286 0.65928571]
Mean CV Score: 0.6597142857142857


Although this is a high accuracy if we glance at the precesion for first class we see that it is at 0%. This means this model is not fit for classifying the given dataset.

### without class imbalance

In [75]:
lg_classifier.fit(X_train_rs, y_train_rs)

In [76]:
y_pred6_rs = lg_classifier.predict(X_test_rs)
print("Accuracy:", accuracy_score(y_test_rs, y_pred6_rs))
print(classification_report(y_test_rs, y_pred6_rs))

Accuracy: 0.5310519645120405
              precision    recall  f1-score   support

           0       0.45      0.96      0.62      3875
           1       0.73      0.63      0.68      3961
           2       0.32      0.02      0.04      3999

    accuracy                           0.53     11835
   macro avg       0.50      0.54      0.44     11835
weighted avg       0.50      0.53      0.44     11835



In [79]:
cv_scores = cross_val_score(lg_classifier, X_train_rs, y_train_rs, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.53575955 0.54463154 0.53748642 0.53603767 0.53477001]
Mean CV Score: 0.5377370376279682


This model is worse than Random Forest

### without class imbalance Class Weighting

In [None]:
lg_classifier2 = LogisticRegression(class_weight=class_weights)
lg_classifier2.fit(X_train, y_train)

In [None]:
y_pred4 = lg_classifier2.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred4))
print(classification_report(y_test, y_pred4))

In [None]:
cv_scores = cross_val_score(lg_classifier2, X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

## Naive Bayes

### with class imbalance

In [81]:
nb_classifier = GaussianNB()
X_train_nb = X_train.toarray()
nb_classifier.fit(X_train_nb, y_train)

In [82]:
y_pred7 = nb_classifier.predict(X_test.toarray())
print("Accuracy:", accuracy_score(y_test, y_pred7))
print(classification_report(y_test, y_pred7))

Accuracy: 0.4145
              precision    recall  f1-score   support

           0       0.37      0.99      0.53      1926
           1       0.22      0.28      0.24       160
           2       0.90      0.14      0.24      3914

    accuracy                           0.41      6000
   macro avg       0.49      0.47      0.34      6000
weighted avg       0.71      0.41      0.34      6000



In [83]:
cv_scores = cross_val_score(nb_classifier, X_train_nb, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.40642857 0.41464286 0.40464286 0.40571429 0.41214286]
Mean CV Score: 0.4087142857142857


### without class imbalance

In [84]:
X_train_rs_nb = X_train_rs.toarray()
nb_classifier.fit(X_train_rs_nb, y_train_rs)

In [85]:
y_pred8_rs = nb_classifier.predict(X_test_rs.toarray())
print("Accuracy:", accuracy_score(y_test_rs, y_pred8_rs))
print(classification_report(y_test_rs, y_pred8_rs))

Accuracy: 0.4768905787917195
              precision    recall  f1-score   support

           0       0.44      0.97      0.61      3875
           1       0.78      0.35      0.49      3961
           2       0.31      0.12      0.17      3999

    accuracy                           0.48     11835
   macro avg       0.51      0.48      0.42     11835
weighted avg       0.51      0.48      0.42     11835



In [86]:
cv_scores = cross_val_score(nb_classifier, X_train_rs_nb, y_train_rs, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.48415716 0.48759732 0.47519015 0.48460703 0.48116624]
Mean CV Score: 0.4825435800622507


CV did improve the models accuracy a bit but this is still not a usable accuracy (between 80-90%)

## Random Forest

### with class imbalance

In [87]:
rf_classifier = RandomForestClassifier()

In [88]:
rf_classifier.fit(X_train, y_train)

In [89]:
y_pred3 = rf_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred3))
print(classification_report(y_test, y_pred3))

Accuracy: 0.6461666666666667
              precision    recall  f1-score   support

           0       0.41      0.02      0.03      1926
           1       0.23      0.11      0.15       160
           2       0.65      0.98      0.78      3914

    accuracy                           0.65      6000
   macro avg       0.43      0.37      0.32      6000
weighted avg       0.56      0.65      0.53      6000



In [90]:
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.64857143 0.65178571 0.65607143 0.65035714 0.64892857]
Mean CV Score: 0.6511428571428571


### without class imbalance

In [91]:
rf_classifier.fit(X_train_rs, y_train_rs)

In [92]:
y_pred4_rs = rf_classifier.predict(X_test_rs)
print("Accuracy:", accuracy_score(y_test_rs, y_pred4_rs))
print(classification_report(y_test_rs, y_pred4_rs))

Accuracy: 0.6108153781157584
              precision    recall  f1-score   support

           0       0.46      0.99      0.63      3875
           1       0.96      0.67      0.79      3961
           2       0.96      0.18      0.30      3999

    accuracy                           0.61     11835
   macro avg       0.80      0.61      0.58     11835
weighted avg       0.80      0.61      0.57     11835



In [93]:
cv_scores = cross_val_score(rf_classifier, X_train_rs, y_train_rs, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.61977186 0.61796125 0.61861644 0.61336472 0.61680551]
Mean CV Score: 0.6173039575111894


The accuracy has improved a bit but we can do better

## Xgboost

### with class imbalance

In [94]:
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)

In [95]:
y_pred9 = xgb_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred9))
print(classification_report(y_test, y_pred9))

Accuracy: 0.6516666666666666
              precision    recall  f1-score   support

           0       0.43      0.01      0.03      1926
           1       0.36      0.05      0.09       160
           2       0.66      0.99      0.79      3914

    accuracy                           0.65      6000
   macro avg       0.48      0.35      0.30      6000
weighted avg       0.58      0.65      0.53      6000



In [96]:
cv_scores = cross_val_score(xgb_classifier, X_train, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.66035714 0.66142857 0.65821429 0.65714286 0.65642857]
Mean CV Score: 0.6587142857142857


### without class imbalance

In [97]:
xgb_classifier.fit(X_train_rs, y_train_rs)

In [98]:
y_pred10_rs = xgb_classifier.predict(X_test_rs)
print("Accuracy:", accuracy_score(y_test_rs, y_pred10_rs))
print(classification_report(y_test_rs, y_pred10_rs))

Accuracy: 0.5826784959864808
              precision    recall  f1-score   support

           0       0.46      0.98      0.62      3875
           1       0.86      0.67      0.76      3961
           2       0.90      0.11      0.20      3999

    accuracy                           0.58     11835
   macro avg       0.74      0.59      0.53     11835
weighted avg       0.74      0.58      0.52     11835



In [99]:
cv_scores = cross_val_score(xgb_classifier, X_train_rs, y_train_rs, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())

Cross-Validation Scores: [0.59206953 0.59550969 0.59561753 0.59471206 0.59380659]
Mean CV Score: 0.5943430793475482


Here we notice that with over samlping the accuracy decreaases, but since it is an ensamble method it handles class imbalance very well on its own so its better to avoid handling class imbalance separatly.

Based on accuracy scores alone, the best accuracy was provided by XGBoost. But for saving time and computation costs the original dataset was trimed to a subset. Further analysis with a larger dataset can result in differences but for now XGBoost prevails.