# Cleaning

In [26]:
import os
import pandas as pd
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources requested by this notebook (no-op when already cached locally).
for _nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_nltk_resource)
from langdetect import detect

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\draxe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\draxe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\draxe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
# Set run_mode to 'kaggle' when running on Kaggle; any other value uses the local dataset copy.
run_mode = ''

# Resolve both CSV locations first, then read them in one place.
if run_mode == 'kaggle':
    train_csv = '/kaggle/input/fake-news/train.csv'
    test_csv = '/kaggle/input/fake-news/test.csv'
else:  # local
    train_csv = '../../Dataset_Original/UTK/train.csv'
    test_csv = '../../Dataset_Original/UTK/test.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1


Remove the author column

In [28]:
# The author column is not used as a feature, so continue with a frame that lacks it.
train = train.drop(columns=['author'])
train.head()

Unnamed: 0,id,title,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Print \r\nAn Iranian woman has been sentenced ...,1


In [29]:
# Preview the first five remaining rows (id, title, text, label).
train.head(n=5)

Unnamed: 0,id,title,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Print \r\nAn Iranian woman has been sentenced ...,1


Null values

In [30]:
# Class balance check: number of rows per label value.
label_counts = train['label'].value_counts()
print(label_counts)

label
1    10413
0    10387
Name: count, dtype: int64


In [31]:
# Count missing values per column.
train.isna().sum()

id         0
title    558
text      39
label      0
dtype: int64

In [32]:
# Keep only the rows that actually have an article body.
train = train[train['text'].notna()]

Outliers: near-empty texts and boilerplate-only articles

In [33]:
# Inspect rows whose body is at most two characters long.
is_near_empty = train['text'].str.len().le(2)
train[is_near_empty].head()

Unnamed: 0,id,title,text,label
82,82,Huma’s Weiner Dogs Hillary,,1
169,169,Mohamad Khweis: Another “Virginia Man” (Palest...,,1
295,295,A Connecticut Reader Reports Record Voter Regi...,,1
470,470,BULLETIN: There ARE Righteous Jews For Trump!;...,,1
592,592,Is your promising internet career over now Vin...,,1


In [34]:
# Discard the near-empty bodies shown above: keep texts longer than two characters.
has_real_text = train['text'].str.len().gt(2)
train = train[has_real_text]

In [35]:
# Rows whose entire body is the Facebook-comments boilerplate string.
train.loc[train['text'].eq("source Add To The Conversation Using Facebook Comments"), 'text']

519      source Add To The Conversation Using Facebook ...
3206     source Add To The Conversation Using Facebook ...
4726     source Add To The Conversation Using Facebook ...
4781     source Add To The Conversation Using Facebook ...
5052     source Add To The Conversation Using Facebook ...
5539     source Add To The Conversation Using Facebook ...
6891     source Add To The Conversation Using Facebook ...
7055     source Add To The Conversation Using Facebook ...
7628     source Add To The Conversation Using Facebook ...
7946     source Add To The Conversation Using Facebook ...
8386     source Add To The Conversation Using Facebook ...
8594     source Add To The Conversation Using Facebook ...
10095    source Add To The Conversation Using Facebook ...
10343    source Add To The Conversation Using Facebook ...
11975    source Add To The Conversation Using Facebook ...
12212    source Add To The Conversation Using Facebook ...
12615    source Add To The Conversation Using Facebook .

In [36]:
# Remove the boilerplate-only rows; they carry no article content.
is_not_boilerplate = train['text'].ne("source Add To The Conversation Using Facebook Comments")
train = train[is_not_boilerplate]

# Preprocessing

In [37]:
# Model input = title + body. Several hundred titles are missing (see the null
# counts above) and NaN + str evaluates to NaN, which would throw away the
# article body of those rows as well; a missing title is therefore treated as
# an empty string. `assign` returns a new frame, so no column is written into
# a filtered slice of the original data.
train = train.assign(combined=train['title'].fillna('') + ' ' + train['text'])

In [38]:
# English stop words, loaded once: rebuilding this set for every document is
# loop-invariant work that dominates the cost of the filtering itself.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` without English stop words.

    The string is split on whitespace, every token contained in the
    module-level ``stop_words`` set is dropped, and the remaining tokens are
    joined with single spaces. Matching is an exact, case-sensitive string
    comparison.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The surviving tokens separated by single spaces ('' if none survive).
    """
    return ' '.join(word for word in text.split() if word not in stop_words)
def tokenize_text(text):
    """Tokenise ``text`` with NLTK's ``word_tokenize`` and return the tokens."""
    tokens = word_tokenize(text)
    return tokens

In [39]:
def _clean_text(text):
    """Normalise one document string and return its list of tokens."""
    # Lowercase first, then expand contractions.
    text = contractions.fix(text.lower())
    # URLs: 'http\S+' already covers every 'https...' match, so no separate
    # alternative is needed; there are no anchors, so no MULTILINE flag either.
    text = re.sub(r'http\S+|www\S+', '', text)
    # HTML tags (non-greedy, so each tag is removed on its own).
    text = re.sub(r'<.*?>', '', text)
    # Digits.
    text = re.sub(r'\d+', '', text)
    # Leading/trailing whitespace.
    text = text.strip()
    # Punctuation: everything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    # Stop-word removal, then tokenisation.
    return tokenize_text(remove_stopwords(text))


def preprocess(df):
    """Clean and tokenise the ``combined`` column of ``df``.

    Each value is lowercased, has its contractions expanded, and is stripped
    of URLs, HTML tags, digits, surrounding whitespace, punctuation and
    English stop words before being tokenised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``combined`` text column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object; its ``combined`` column is replaced in place
        by one list of tokens per row.
    """
    # Missing values must become '' before the string cast: astype(str) alone
    # would turn NaN into the literal text 'nan', which then survives as a token.
    df['combined'] = df['combined'].fillna('').astype(str).apply(_clean_text)

    return df

In [40]:
# Run the text-cleaning pipeline over the training frame.
train = train.pipe(preprocess)

In [41]:
# Dropping in place on a column selection only touches that selected Series and
# never removes rows from `train`; drop on the frame so the rows really go.
# NOTE(review): after preprocess() the column holds token lists, so this is
# expected to be a safety net rather than an active filter.
train = train.dropna(subset=['combined'])

# Model

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Import model
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from joblib import load
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold

In [43]:
# Features are the token lists, the target is the label column.
X_train, y_train = train['combined'], train['label']

In [44]:
# Sanity check: the first five token lists.
X_train.head(n=5)

0    [house, dem, aide, even, see, comeys, letter, ...
1    [flynn, hillary, clinton, big, woman, campus, ...
2    [truth, might, get, fired, truth, might, get, ...
3    [civilians, killed, single, us, airstrike, ide...
4    [iranian, woman, jailed, fictional, unpublishe...
Name: combined, dtype: object

In [45]:
# Re-join every token list into a single space-separated string, then learn the
# TF-IDF vocabulary and vectorise the documents in one step.
# NOTE(review): the vectoriser is fitted on all training rows before the
# train/validation split below, so the IDF weights have seen the validation
# documents.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train.apply(' '.join))

In [46]:
# Hold out 20% of the rows for validation, stratified on the label, with a fixed seed.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

## Ensemble (hard voting over the saved best models)

In [47]:
# Reload the best individual models that were saved to disk.
# NOTE(review): joblib.load unpickles its input and can execute arbitrary code;
# only load model files that come from a trusted source.
model1 = load('xg_model.joblib')
model2 = load('lg_model.joblib')
model3 = load('svm_model.joblib')
# Not part of the ensemble at the moment: nb_model.joblib, rf_model.joblib

# Create the ensemble model: a hard (majority) vote over the three estimators.
ensemble_members = [('model1', model1), ('model2', model2), ('model3', model3)]
clf = VotingClassifier(estimators=ensemble_members, voting='hard')
clf.fit(X_train_split, y_train_split)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [48]:
# Performance on the rows the ensemble was fitted on.
y_train_pred = clf.predict(X_train_split)
train_report = classification_report(y_train_split, y_train_pred)
print(train_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8309
           1       1.00      1.00      1.00      8215

    accuracy                           1.00     16524
   macro avg       1.00      1.00      1.00     16524
weighted avg       1.00      1.00      1.00     16524



In [49]:
# Performance on the held-out validation rows.
y_val_pred = clf.predict(X_val_split)
val_report = classification_report(y_val_split, y_val_pred)
print(val_report)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2078
           1       0.98      0.99      0.98      2054

    accuracy                           0.98      4132
   macro avg       0.98      0.98      0.98      4132
weighted avg       0.98      0.98      0.98      4132



## Train on full dataset

In [50]:
# Refit the ensemble on every training row (training and validation parts together).
clf.fit(X=X_train, y=y_train)

# Submission

In [None]:
# No test row is dropped here (the submission below has one row per test id),
# so any missing title or text is replaced by an empty string: NaN + str would
# otherwise make the whole combined value NaN and erase the other field too.
test['combined'] = test['title'].fillna('') + ' ' + test['text'].fillna('')
test = preprocess(test)

# Re-join the token lists and project them with the vocabulary/IDF weights
# learned on the training data (transform only, no refit).
X_test = vectorizer.transform(test['combined'].apply(' '.join))

id_test = test['id']
y_test_pred = clf.predict(X_test)

In [None]:
# Pair every test id with its predicted label ...
submission_df = id_test.to_frame(name='id').assign(label=y_test_pred)
# ... and write the two columns to disk without the index.
submission_df.to_csv('submission.csv', index=False)

In [None]:
from joblib import dump

# Persist the fitted ensemble so that it can be reloaded without retraining.
dump(value=clf, filename='ens_model.joblib')