In [1]:
# pip install contractions
# pip install langdetect

In [28]:
import os
import pandas as pd
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Downloading the required wordnet data
nltk.download('punkt')
from langdetect import detect

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [128]:
# Load data
train = pd.read_csv('../Dataset_Original/UTK/train.csv')
test = pd.read_csv('../Dataset_Original/UTK/test.csv')

In [30]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1


In [31]:
# Drop author column
train.drop(columns=['author'], inplace=True)
train.head()

Unnamed: 0,id,title,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Print \r\nAn Iranian woman has been sentenced ...,1


# Cleaning

## Missing & duplicate

In [32]:
train.dropna(inplace=True)

In [33]:
train.drop_duplicates(subset='text', inplace=True, keep='first')
train.drop_duplicates(subset='title', inplace=True, keep='first')

In [34]:
num_unique_values = train['text'].nunique()
print(f'{num_unique_values}/{train.shape[0]}')

19509/19509


## Remove outliers

### Examples with only spaces (text length <=2)

In [35]:
train[train['text'].str.len() <= 2].head()

Unnamed: 0,id,title,text,label
82,82,Huma’s Weiner Dogs Hillary,,1
901,901,Internet Flasher,,1
4902,4902,Why Hillary Clinton's Campaign Is Collapsing |...,\r\n,1


In [36]:
train = train[train['text'].str.len() > 2]

### Examples with text content "source Add To The Conversation Using Facebook Comments" -> Unrelated

In [37]:
train[train['text'] == "source Add To The Conversation Using Facebook Comments"]['text']

519    source Add To The Conversation Using Facebook ...
Name: text, dtype: object

In [38]:
train = train[train['text'] != "source Add To The Conversation Using Facebook Comments"]

### Examples with non-English text

In [39]:
# Combine `text` and `title` columns into one column
train['combined'] = train['title'] + ' ' + train['text']

In [40]:
def not_english(text):
    try:
        return detect(text) != 'en'
    except:
        return False
train = train[~train['combined'].apply(not_english)]

## Preprocessing
- Lowercase
- Expanding contractions
- Removing the URLs
- Removing the HTML tags
- Removing the numbers
- Removing the extra whitespaces
- Removing the punctuation
- Removing the stopwords
- Lemmatization
- Tokenization

In [41]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])
def tokenize_text(text):
    return word_tokenize(text)

In [75]:
def preprocess(df):
    # Lowercase
    df['combined'] = df['combined'].str.lower()

    # Expanding Contractions
    df['combined'] = df['combined'].apply(lambda x: contractions.fix(x))

    # Removing the URLs
    df['combined'] = df['combined'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE))

    # Removing the HTML tags
    df['combined'] = df['combined'].apply(lambda x: re.sub(r'<.*?>', '', x))

    # Removing the numbers
    df['combined'] = df['combined'].apply(lambda x: re.sub(r'\d+', '', x))

    # Removing the extra whitespaces
    df['combined'] = df['combined'].apply(lambda x: x.strip())

    # Removing the punctuation
    df['combined'] = df['combined'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

    # Removing the stopwords
    df['combined'] = df['combined'].apply(remove_stopwords)

    # Tokenization
    df['combined'] = df['combined'].apply(tokenize_text)

    return df

In [43]:
train = preprocess(train)

In [44]:
# Remove rows with empty text again after processing
train['combined'].dropna(inplace=True)

# RandomForestClassifier

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [46]:
X_train = train['combined']
y_train = train['label']

In [54]:
X_train = X_train.apply(' '.join)

In [134]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Define a pipeline combining a text feature extractor with a simple classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier()),
])

# Define the grid parameters
params = [
    {
        'clf': [RandomForestClassifier()],
        'clf__n_estimators': [50, 100],  # number of trees
        'clf__max_depth': [10, 20],  # maximum depth of the trees
        'clf__max_features': ['sqrt', 'log2'],  # number of features for best split
        'clf__min_samples_split': [2, 5, 10]  # minimum samples for a split
    }
]

# Create a StratifiedKFold object
stratified_kfold = StratifiedKFold(n_splits=10)

# Create the grid search object
grid_search = GridSearchCV(pipeline, params, cv=stratified_kfold, n_jobs=-1, refit=True, verbose=2)

# Fit the grid search object to the data to compute the optimal model
grid_search.fit(X_train, y_train)

# Print the best score and the best parameters
print("Best Score: ", grid_search.best_score_)
print("Best Parameters: ", grid_search.best_params_)

Fitting 10 folds for each of 24 candidates, totalling 240 fits
Best Score:  0.9092006881492753
Best Parameters:  {'clf': RandomForestClassifier(), 'clf__max_depth': 20, 'clf__max_features': 'sqrt', 'clf__min_samples_split': 5, 'clf__n_estimators': 100}


In [135]:
clf = grid_search.best_estimator_

In [136]:
# Training performance
y_train_pred = clf.predict(X_train)
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98     10380
           1       0.97      0.98      0.97      8662

    accuracy                           0.97     19042
   macro avg       0.97      0.97      0.97     19042
weighted avg       0.97      0.97      0.97     19042



# Submit test prediction

In [137]:
test.head()

Unnamed: 0,id,title,author,text,combined
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...","[specter, trump, loosens, tongues, purse, stri..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,"[russian, warships, ready, strike, terrorists,..."
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,"[nodapl, native, american, leaders, vow, stay,..."
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...","[tim, tebow, attempt, another, comeback, time,..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,"[keiser, report, meme, wars, e, mins, ago, vie..."


In [138]:
# Combine `text` and `title` columns into one column
test['combined'] = test['title'] + ' ' + test['text']

In [139]:
test.fillna('', inplace=True)
test = preprocess(test)
X_test = test['combined']
X_test = X_test.apply(' '.join)

In [140]:
id_test = test['id']
y_test_pred = grid_search.predict(X_test)

In [142]:
# Create a DataFrame
submission_df = pd.DataFrame({'id': id_test, 'label': y_test_pred})
# Save the DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

In [78]:
from joblib import dump
dump(clf, 'rf_model.joblib')

['rf_model.joblib']