# Clickbait Classifier

In [1]:
import numpy as np
from pandas import DataFrame, Series, read_csv

## 1. Get the data

In [2]:
titles = read_csv("clickBait_Data.csv")
titles_len = len(titles)
clckbt_ratio = len(titles[titles["clickbait"]==0])/titles_len
print("Database lenght : {} \nClickbait ratio: {}".format(titles_len, clckbt_ratio))
titles.head()

Database lenght : 59172 
Clickbait ratio: 0.5118637193267086


Unnamed: 0,index,id,titles,clickbait
0,6574,6575,25 Things We Learned From Julia Louis-Dreyfus ...,1
1,39655,39656,John Brennan: Trump's 'Nazi Germany' tweet to ...,0
2,44513,44514,"TruthRevolt.org: ISIS Stands For ""Israeli Secr...",0
3,44205,44206,Peak Millennial? Cities Cant Assume a Continue...,0
4,11106,11107,This Entire City Is Made Out Of Ice And It Wil...,1


Some clickbait headlines ...

In [3]:
for title in titles['titles'][titles['clickbait']==1][:10]:
    print(title)

25 Things We Learned From Julia Louis-Dreyfus In 2014
This Entire City Is Made Out Of Ice And It Will Blow Your Mind
35 Gifts For the Wanderlust-Obsessed Person In Your Life
This Dudes Theory On Life After Death Has Gone Viral After Blowing Everyones Mind
Cat owners are more likely to be into BDSM than everyone else The list
Howard Schultz Stepping Down as Starbucks CEO to Focus on Higher-End Shops
This Brother Sent Relatives Christmas Cards Saying His Sister Was Dating Chief Keef
ESPN Layoffs at Leading Edge of the Coming ‘Sports Bubble’
You probably know to ask yourself, What do I want? Heres a way better question
Jerry Brown: ‘We’re Not Going to Bring Stupid Lawsuits’ Against Trump Wall


## 2. Preprocessing and Analysis

Split into train, validation and test sets

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(titles['titles'], titles["clickbait"],
                                                    test_size=.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.1, random_state=42)

### Preprocessing

Parse each headline string:

- converts to lower-case,
- expand contractions,
- remove punctuation,
- lemmatize words



In [5]:
from nltk.stem.wordnet import WordNetLemmatizer
from contractions import contractions_dict
from nltk import word_tokenize
from nltk.tag import pos_tag
import contractions
import string

_lem = WordNetLemmatizer()
contractions_set = set(contr.lower() for contr in contractions_dict)


def remove_contractions(string):
    string = string.lower()
    contr_num = sum(1 for contr in contractions_set if contr in string)
    parsed_string = ' '.join(contractions.fix(word) for word in string.split())
    return parsed_string, contr_num


def lemmatise_sentence(sentence):
    
    # remove contarctions and convert to lower case
    sentence, contr_num = remove_contractions(sentence.lower())
    
    # remove punctuation
    sentence = sentence.translate(str.maketrans('', '', string.punctuation+'’‘'))  
    
    # lemmatize words
    lemm_str = ""
    for word, tag in pos_tag(word_tokenize(sentence.lower())):
        if tag.startswith('NN'):
            word_1 = word 
            pos = 'n'
        elif tag.startswith('VB'):
            word_1 = word
            pos = 'v'
        elif tag.startswith('CD'):
            word_1 = 'NUM'
            pos = 'a'
        else:
            word_1 = word
            pos = 'a'
        lemm_str += ' '+_lem.lemmatize(word_1, pos)
    
    return lemm_str, contr_num

Build a class that parse the text

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class ParseString(BaseEstimator, TransformerMixin):
    def __init__(self):
        self = True
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_prep, contr_list = [], []
        for string in X:
            lemm_str, contr_num = lemmatise_sentence(string)
            X_prep.append(lemm_str)
            contr_list.append(contr_num)
        return DataFrame({"headline": X_prep, "contr num":contr_list})

In [7]:
X_train_prep = ParseString().fit_transform(X=X_train)

__Generate the vocabulary__

In [8]:
import warnings
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')    # shut up bs4 URL warning


def make_vocabulary(X, length, rm_words=False, to_del_words_list=None):
    vectorizer = CountVectorizer(max_features=length)
    vectorizer.fit(X)
    vocab = vectorizer.get_feature_names()
    if rm_words:
        for word in to_del_words_list:
            if word in vocab:
                vocab.remove(word)
    return vocab


def save_vocabulary(words_list, txt_file):
    file = open(txt_file, 'w+')
    for word in words_list:
            file.write(str(key)+'\n')
    file.close()
    
    print('Vocabulary stored in "{}"'.format(txt_file))
    

def load_vocabulary(length, txt_file, to_del=None):
    file = open(txt_file, 'r')
    vocab = np.array([file.readline().rstrip().lower() for line in range(length)])
    file.close()
    print('Dictionary loaded.')
    return vocab
    

to_del = ['trump','donald','christmas','obama','president','america','harry','russian','russia','china',
          'american']

Generate __3 different vocabnularies__: 
- common words in all titles
- common words in clckbt titles
- common words in non-clckbait titles

In [9]:
X_train_clckb = X_train_prep.headline.values[y_train==1]
X_train_noclckbt = X_train_prep.headline.values[y_train==0]

full_vocab = make_vocabulary(X_train_prep.headline, length=20)
clckbt_vocab = make_vocabulary(X_train_clckb, length=21, rm_words=True, to_del_words_list=to_del)
no_clckbt_vocab = make_vocabulary(X_train_noclckbt, length=20)

20 most common clckbt and no_clckbt words (NB: in alphabetical order!):

In [10]:
common_words = DataFrame({'No Clickbait': no_clckbt_vocab[:20], 
                          'Clickbait': clckbt_vocab[:20], 
                          'Full': full_vocab[:20]})
common_words.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
No Clickbait,after,and,as,at,be,by,for,from,have,in,new,not,num,of,on,say,the,to,trump,with
Clickbait,and,be,do,for,have,in,it,not,num,of,on,that,the,this,to,what,will,with,you,your
Full,and,at,be,do,for,have,in,it,not,num,of,on,that,the,this,to,trump,will,with,you


## 3. Create the Pipeline

Generate features array. Features: 
- 200 most common clckbt words
- title length (in words)
- stopwords ratio
- contractions ratio
- title starts with cardinal

In [11]:
from scipy.sparse import coo_matrix, hstack
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from collections import Counter

# make bag of words
vocab = load_vocabulary(200, 'vocabularies/lem_clckbt_words.txt')



class PreProcess(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary):
        self.vocabulary = vocabulary
        self.vectorizer = CountVectorizer(vocabulary=self.vocabulary)
        self.stopwords_set = set(stopwords.words('english'))
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # bag of words
        X_bag = self.vectorizer.transform(X.headline)
        # meta data
        meta_arr = []
        for i in range(len(X)):
            d = Counter(X.headline.iloc[i].split())
            num_flag = 1 if list(d)[0]=='NUM' else 0
            n_of_words = sum(d.values())
            contr_r = X['contr num'].iloc[i]/n_of_words
            stop_r = sum(d[key] for key in set(d.keys())&self.stopwords_set) / n_of_words
            meta_arr.append([num_flag, contr_r, stop_r, n_of_words])
        meta_arr = coo_matrix(meta_arr)
        return hstack([X_bag, meta_arr])

    
    
full_pipeline = Pipeline([
    ("parse text", ParseString()),
    ("gen features", PreProcess(vocabulary=vocab))
])


Dictionary loaded.


In [12]:
X_train_prep = full_pipeline.fit_transform(X_train)
X_train_prep.shape

(47928, 204)

In [13]:
X_train_mini = X_train_prep.toarray()[:1000]
y_train_mini = y_train[:1000]

## 4. Train some classifier

In [14]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}
    

### 4.1 Train Naive Bayes on only features words

In [15]:
from sklearn.naive_bayes import MultinomialNB
from utilities import print_scores

mnb_clf = MultinomialNB()

mnb_clf_cv = cross_validate(mnb_clf, X_train_mini[:,:200], y_train_mini, cv=5, scoring=scorers, n_jobs=5)
print_scores(mnb_clf_cv)

accuracy: 0.688
precision: 0.735
recall: 0.592


In [16]:
mnb_clf_cv = cross_validate(mnb_clf, X_train_prep.toarray()[:,:200], y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(mnb_clf_cv)

accuracy: 0.721
precision: 0.766
recall: 0.617


In [17]:
mnb_clf.fit(X_train_prep.toarray()[:,:200], y_train)
probabilities = mnb_clf.predict_proba(X_train_prep.toarray()[:,:200])[:, 1]
probabilities = probabilities.reshape((len(probabilities), 1))

### 4.2 Train Random Forest on Naive Bayes probabilities and non-word features

In [18]:
X_train_forest = np.concatenate([probabilities, X_train_prep.toarray()[:, 200:]], axis=1)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from utilities import print_scores

forest_clf = RandomForestClassifier(random_state=42)

forest_cv = cross_validate(forest_clf, X_train_forest[:1000], y_train_mini, cv=5, scoring=scorers, n_jobs=-1)
print_scores(forest_cv)

accuracy: 0.697
precision: 0.717
recall: 0.652


In [20]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distrib = {"n_estimators": list(range(1,500)), "max_depth": reciprocal(2,100)}

rnd_srch_forest = RandomizedSearchCV(forest_clf, param_distributions=param_distrib,
                                     cv=5, scoring='accuracy', random_state=42,
                                     n_iter=100, verbose=5, n_jobs=-1)

rnd_srch_forest.fit(X_train_forest[:1000], y_train_mini)

print('Best score: %.3f' % rnd_srch_forest.best_score_)
print('Best params:', rnd_srch_forest.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 241 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 367 tasks      | elapsed:   21.0s


Best score: 0.732
Best params: {'max_depth': 4.367870584636595, 'n_estimators': 55}


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   28.7s finished


In [21]:
forest_clf = rnd_srch_forest.best_estimator_

forest_cv = cross_validate(forest_clf, X_train_forest, y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(forest_cv)

accuracy: 0.736
precision: 0.813
recall: 0.596


### 4.4 Train SVM on Naive Bayes probabilities and non-word features

In [22]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

svc_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

svc_cv = cross_validate(svc_clf, X_train_forest[:1000], y_train_mini, cv=5, scoring=scorers, n_jobs=-1)
print_scores(svc_cv)

accuracy: 0.731
precision: 0.801
recall: 0.616


In [23]:
param_distrib = {'svm__kernel': ['rbf', 'poly'],
                 'svm__C': uniform(1,20),
                 'svm__gamma': reciprocal(.0001, .1),
                }
                 

rnd_srch_svc = RandomizedSearchCV(svc_clf, param_distributions=param_distrib,
                                  cv=5, scoring='accuracy', n_iter=1000, verbose=5, n_jobs=-1)

rnd_srch_svc.fit(X_train_forest[:1000], y_train_mini)

print('Best score: %.3f' % rnd_srch_svc.best_score_)
print('Best params:', rnd_srch_svc.best_params_)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 1316 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 4196 tasks      | elapsed:   14.9s


Best score: 0.734
Best params: {'svm__C': 9.46086466149087, 'svm__gamma': 0.008532209584895145, 'svm__kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   17.6s finished


In [24]:
svc_clf = rnd_srch_svc.best_estimator_

svc_clf_cv = cross_validate(svc_clf, X_train_forest, y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(svc_clf_cv)

accuracy: 0.733
precision: 0.826
recall: 0.573


### 4.5 Train naive Random Forest on all the features

In [25]:
forest_clf_v1 = RandomForestClassifier(random_state=42)

forest_cv_v1 = cross_validate(forest_clf_v1, X_train_mini, y_train_mini, cv=5, scoring=scorers, n_jobs=-1)
print_scores(forest_cv_v1)

accuracy: 0.684
precision: 0.715
recall: 0.618


In [26]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distrib = {"n_estimators": list(range(1,500)), "max_depth": reciprocal(2,100)}

rnd_srch_forest_v1 = RandomizedSearchCV(forest_clf_v1, param_distributions=param_distrib,
                                     cv=5, scoring='accuracy', random_state=42,
                                     n_iter=100, verbose=5, n_jobs=-1)

rnd_srch_forest_v1.fit(X_train_mini, y_train_mini)

print('Best score: %.3f' % rnd_srch_forest_v1.best_score_)
print('Best params:', rnd_srch_forest_v1.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 178 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 304 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 466 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   30.5s finished


Best score: 0.728
Best params: {'max_depth': 51.918851006225275, 'n_estimators': 294}


In [27]:
forest_clf_v1 = rnd_srch_forest_v1.best_estimator_

forest_cv_v1 = cross_validate(forest_clf_v1, X_train_prep, y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(forest_cv_v1)

accuracy: 0.805
precision: 0.871
recall: 0.705


### 4.5 Train naive SVM on all the features

In [None]:
svc_clf_v1 = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

svc_cv_v1 = cross_validate(svc_clf_v1, X_train_mini, y_train_mini, cv=5, scoring=scorers, n_jobs=-1)
print_scores(svc_cv_v1)

In [None]:
param_distrib = {'svm__kernel': ['rbf', 'poly'],
                 'svm__C': uniform(1,20),
                 'svm__gamma': reciprocal(.0001, .1),
                }

rnd_srch_svc_v1 = RandomizedSearchCV(svc_clf_v1, param_distributions=param_distrib,
                                  cv=5, scoring='accuracy', n_iter=100, verbose=5, n_jobs=-1)

rnd_srch_svc_v1.fit(X_train_mini, y_train_mini)

print('Best score: %.3f' % rnd_srch_svc_v1.best_score_)
print('Best params:', rnd_srch_svc_v1.best_params_)

In [None]:
svc_clf_v1 = rnd_srch_svc_v1.best_estimator_

svc_clf_cv_v1 = cross_validate(svc_clf_v1, X_train_prep.toarray(), y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(svc_clf_cv_v1)

## 5 Test on Validation set

Evaluate the three classifiers we've built on the Validation set to see which performs better

### Prepare titles

In [28]:
X_val_prep = full_pipeline.transform(X_val)

#### Naive Bayes

In [None]:
y_val_pred = mnb_clf.predict(X_val_prep.toarray()[:,:200])

print('Score on the test set: %.3f' % accuracy_score(y_val, y_val_pred))
print('Precision: %.3f' % precision_score(y_val, y_val_pred))
print('Recall: %.3f' % recall_score(y_val, y_val_pred))

In [None]:
y_val_proba = mnb_clf.predict_proba(X_val_prep.toarray()[:,:200])[:,1]
y_val_proba = y_val_proba.reshape((len(y_val), 1))
X_val_forest = np.concatenate([y_val_proba, X_val_prep.toarray()[:, 200:]], axis=1)

#### Random Forest

In [None]:
forest_clf.fit(X_train_forest, y_train)
y_val_pred = forest_clf.predict(X_val_forest)

print('Score on the test set: %.3f' % accuracy_score(y_val, y_val_pred))
print('Precision: %.3f' % precision_score(y_val, y_val_pred))
print('Recall: %.3f' % recall_score(y_val, y_val_pred))

#### SVC

In [None]:
svc_clf.fit(X_train_forest, y_train)
y_val_pred = svc_clf.predict(X_val_forest)

print('Score on the test set: %.3f' % accuracy_score(y_val, y_val_pred))
print('Precision: %.3f' % precision_score(y_val, y_val_pred))
print('Recall: %.3f' % recall_score(y_val, y_val_pred))

#### Naive Random Forest

In [29]:
forest_clf_v1.fit(X_train_prep, y_train)
y_val_pred = forest_clf_v1.predict(X_val_prep)

print('Score on the test set: %.3f' % accuracy_score(y_val, y_val_pred))
print('Precision: %.3f' % precision_score(y_val, y_val_pred))
print('Recall: %.3f' % recall_score(y_val, y_val_pred))

Score on the test set: 0.818
Precision: 0.882
Recall: 0.720


### 6. Final evaluation on the Test Set

Choose the Random Forest

In [30]:
X_test_prep = full_pipeline.transform(X_test)
y_test_pred = forest_clf_v1.predict(X_test_prep)

print('Score on the test set: %.3f' % accuracy_score(y_test, y_test_pred))
print('Precision: %.3f' % precision_score(y_test, y_test_pred))
print('Recall: %.3f' % recall_score(y_test, y_test_pred))

Score on the test set: 0.824
Precision: 0.894
Recall: 0.731


### 7. Export the model for later use

In [None]:
from joblib import dump

filename = 'clickbait_classifier_v2.joblib'
#dump(forest_clf_v1, filename)