# Clickbait Classifier

In [1]:
import numpy as np
from pandas import DataFrame, Series, read_csv

## 1. Get the data

In [2]:
titles = read_csv("clickBait_Data.csv")
titles_len = len(titles)
clckbt_ratio = len(titles[titles["clickbait"]==0])/titles_len
print("Database lenght : {} \nClickbait ratio: {}".format(titles_len, clckbt_ratio))
titles.head()

Database lenght : 59172 
Clickbait ratio: 0.5118637193267086


Unnamed: 0,index,id,titles,clickbait
0,6574,6575,25 Things We Learned From Julia Louis-Dreyfus ...,1
1,39655,39656,John Brennan: Trump's 'Nazi Germany' tweet to ...,0
2,44513,44514,"TruthRevolt.org: ISIS Stands For ""Israeli Secr...",0
3,44205,44206,Peak Millennial? Cities Cant Assume a Continue...,0
4,11106,11107,This Entire City Is Made Out Of Ice And It Wil...,1


## 2. Preprocessing and Analysis

### 2.1 Train test split

Split into train and test sets

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(titles['titles'], titles["clickbait"],
                                                    test_size=.1, random_state=42)
print(len(X_train))

53254


Split again the train set into Train and Validation sets

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.1, random_state=42)
print(len(X_train))

47928


### 2.2 Words counting

We implement the following parsing operations:

- converts words to lower-case,
- expand contractions,
- remove punctuation,
- lemmatize words

It returns the list of all the words of the titles.

In [5]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tag import pos_tag
import contractions
import string

def remove_contractions(string):
    str_words = [contractions.fix(word) for word in string.lower().split()]
    return ' '.join(str_words)

def lemmatise_sentence(sentence):
    sentence = remove_contractions(sentence.lower())
    sentence = sentence.translate(str.maketrans('', '', string.punctuation+'’‘'))
    lemmatiser = WordNetLemmatizer()
    lemmatised_sentence = []
    for word, tag in pos_tag(word_tokenize(sentence.lower())):
        if tag.startswith('NN'):
            word_1 = word 
            pos = 'n'
        elif tag.startswith('VB'):
            word_1 = word
            pos = 'v'
        elif tag.startswith('CD'):
            word_1 = 'NUM'
            pos = 'a'
        else:
            word_1 = word
            pos = 'a'
        lemmatised_sentence.append(lemmatiser.lemmatize(word_1, pos))
    return lemmatised_sentence
    

def lemmatise_verbs(sentence):
    sentence = sentence.lower()
    sentence = sentence.translate(str.maketrans('', '', string.punctuation+'’‘'))
    lemmatiser = WordNetLemmatizer()
    lemmatised_verbs = []
    for word, tag in pos_tag(word_tokenize(sentence.lower())):
        if tag.startswith('VB'):
            word_1 = word
            pos = 'v'
            lemmatised_verbs.append(lemmatiser.lemmatize(word_1, pos))
    return lemmatised_verbs

 __Make a vocabulary with the word frequencies__.

In [6]:
from collections import Counter
from nltk.corpus import stopwords

def make_dict(titles_set, collect_verbs=False, rm_stopwords=False, rm_words=False, to_del=None): 
    dictionary = Counter()
    
    tot_str = ' '
    for title in titles_set:
        tot_str+=title+' '
    
    if collect_verbs:
        dictionary = Counter(lemmatise_verbs(tot_str))
    else:
        dictionary = Counter(lemmatise_sentence(tot_str)) 
    
    # Remove english stopwords
    if rm_stopwords:
        stop_words = stopwords.words('english')
        for word in stop_words:
            del dictionary[word]
            
    # Remove some words
    if rm_words:
        for word in to_del:
            del dictionary[word]
     
    return dictionary

Generate __5 different vocabnularies__: 

- common words in all titles
- common words in clckbt titles
- common words in non-clckbait titles
- common verbs in clckbt titles
- common versb in non-clckbt titles 

The following function generates the vocabulary via the `make_dict` function, it saves it in a `.txt` file.
Finally, it loads the list of the `n_words` most common words.  

In [7]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')    # shut up bs4 URL warning


def most_common_words(n_words, txt_file, collect_verbs=False, rm_stopwords=False, rm_words=False, 
                      to_del=None, words_set=None):
    
    # try to load the dictionary from the .txt file.
    try:
        words_count = []
        file = open(txt_file, 'r')
        
        for line in range(n_words):
            words_count.append(file.readline().rstrip())
        
        file.close()
        words_count = np.array(words_count)
        print('Dictionary loaded.')
    
    # make_dict if the .txt doesn't exists
    except FileNotFoundError:
        print("There's no dictionary. Creating a new one.")
        words_count = make_dict(words_set, collect_verbs, rm_stopwords=rm_stopwords, rm_words=rm_words, to_del=to_del)
        words_count = words_count.most_common(n_words)
        file = open(txt_file, 'w+')
        
        for key, freq in words_count:
            file.write(str(key)+'\n')
        
        file.close()
        words_count = most_common_words(n_words, txt_file, collect_verbs, rm_stopwords, rm_words, to_del, words_set)
    
    return words_count

to_del = ['trump','donald','christmas','obama','president','america','harry','russian','russia','china',
          'american']

In [8]:
total_words_count = most_common_words(n_words=1000, txt_file='vocabularies/lem_total_words.txt',
                                      rm_stopwords=True, words_set=X_train)

clckbt_words_count = most_common_words(n_words=1000, txt_file='vocabularies/lem_clckbt_words.txt', 
                                       rm_stopwords=False, rm_words=True, to_del=to_del, 
                                       words_set=X_train[y_train==1])

no_clckbt_words_count = most_common_words(n_words=1000, txt_file='vocabularies/lem_no_clckbt_words.txt',
                                          rm_stopwords=False, words_set=X_train[y_train==0])

clckbt_verbs_count = most_common_words(n_words=1000, txt_file='vocabularies/lem_clckbt_verbs.txt', 
                                       collect_verbs=True, rm_stopwords=False, words_set=X_train[y_train==1])


no_clckbt_verbs_count = most_common_words(n_words=1000, txt_file='vocabularies/lem_no_clckbt_verbs.txt', 
                                          collect_verbs=True, rm_stopwords=False, words_set=X_train[y_train==0])

Dictionary loaded.
Dictionary loaded.
Dictionary loaded.
Dictionary loaded.
Dictionary loaded.


Here's the 20 most common clckbt and no_clckbt words:

In [9]:
common_words = DataFrame({'No Clickbait': no_clckbt_words_count, 'Clickbait': clckbt_words_count, 
                          'Mixed': total_words_count, 'No Clckbt vbs': no_clckbt_verbs_count,
                          'Clckbt vbs': clckbt_verbs_count})
common_words[:20].transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
No Clickbait,to,the,NUM,in,of,be,for,and,on,trump,with,as,at,say,not,have,new,from,after,by
Clickbait,NUM,be,the,to,you,of,in,this,and,for,will,that,not,have,on,do,what,your,with,it
Mixed,NUM,trump,say,new,make,people,get,year,woman,us,take,time,world,donald,look,see,thing,go,find,man
No Clckbt vbs,be,say,have,get,take,make,kill,do,find,go,trump,give,want,call,leave,help,hold,show,happen,win
Clckbt vbs,be,have,make,do,get,say,see,know,look,take,go,happen,want,find,need,believe,think,heres,use,wont


## 3. Generating the Features

Now convert features into vectors.
Define new classe, `DigitalizeTitle`. 

Features: 
- 200 most common clckbt words
- title length (in words)
- stopwords ratio
- contractions ratio
- title starts with cardinal

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
from contractions import contractions_dict

def contractions_count(string):
    words = string.split()
    counter = 0
    for word in words:
        if word in list(contractions_dict.keys()):
            counter += 1
    return counter/len(words)

# Generate the feature vectors
def vectorize(title, dictionary, n_words, options=[True]*4 ):
    freq_vec = np.zeros(n_words+sum(options))
    title_words = make_dict([title], rm_stopwords=False)
    
    # options consist in num_first, n_contractions, stopword_ratio, #_tot_words
    options_arr = np.zeros(len(options))
    stop_words = np.array(stopwords.words('english'))

    options_arr[0] = 1 if 'NUM' == list(title_words.keys())[0] else 0
    options_arr[1] = contractions_count(title)
    
    for index, key in list(enumerate(dictionary[:n_words])):
        freq_vec[index] = title_words[key]
    
    for key in title_words.keys():
        options_arr[3] += title_words[key]
        if key in stop_words:
            options_arr[2] += title_words[key]
        
    options_arr[2] = options_arr[2]/options_arr[3]
    
    if sum(options)>1:
        freq_vec[n_words:] = options_arr[options]
    
    return freq_vec



# CLASS DigitalizeTitle
class DigitalizeTitle(BaseEstimator, TransformerMixin):
    
    def __init__(self, clckbt_dict, options=[True]*4): 
        self.clckbt_dict = clckbt_dict
        self.options = options
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        clckbt_words_vec = np.zeros((len(X), 200+sum(self.options)))
        
        for index, title in enumerate(X):
            clckbt_words_vec[index] = vectorize(title, self.clckbt_dict, 200, self.options)
            #clckbt_words_vec[index, 200:] = count_words(title)
            
        return clckbt_words_vec

Define feature matrix

In [11]:
X_train_prepared = DigitalizeTitle(clckbt_dict=clckbt_words_count, options=[True]*4).fit_transform(X_train)
X_train_mini, y_train_mini = X_train_prepared[:1000], y_train[:1000]

## 4. Train some classifier

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}
    

### 4.1 Train Naive Bayes on only features words

In [13]:
from sklearn.naive_bayes import MultinomialNB
from utilitiesuseful_functions import print_scores

mnb_clf = MultinomialNB()

mnb_clf_cv = cross_validate(mnb_clf, X_train_mini[:,:200], y_train_mini, cv=5, scoring=scorers, n_jobs=5)
print_scores(mnb_clf_cv)

accuracy: 0.688
precision: 0.735
recall: 0.592


In [14]:
mnb_clf_cv = cross_validate(mnb_clf, X_train_prepared[:,:200], y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(mnb_clf_cv)

accuracy: 0.721
precision: 0.766
recall: 0.617


In [15]:
mnb_clf.fit(X_train_prepared[:,:200], y_train)
probabilities = mnb_clf.predict_proba(X_train_prepared[:,:200])[:, 1]
probabilities = probabilities.reshape((len(probabilities), 1))

### 4.2 Train Random Forest on Naive Bayes probabilities and non-word features

In [16]:
X_train_forest = np.concatenate([probabilities, X_train_prepared[:, 200:]], axis=1)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from utilities import print_scores

forest_clf = RandomForestClassifier(random_state=42)

forest_cv = cross_validate(forest_clf, X_train_forest[:1000], y_train_mini, cv=5, scoring=scorers, n_jobs=-1)
print_scores(forest_cv)

accuracy: 0.684
precision: 0.697
recall: 0.650


In [18]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distrib = {"n_estimators": list(range(1,500)), "max_depth": reciprocal(2,100)}

rnd_srch_forest = RandomizedSearchCV(forest_clf, param_distributions=param_distrib,
                                     cv=5, scoring='accuracy', random_state=42,
                                     n_iter=100, verbose=5, n_jobs=-1)

rnd_srch_forest.fit(X_train_forest[:1000], y_train_mini)

print('Best score: %.3f' % rnd_srch_forest.best_score_)
print('Best params:', rnd_srch_forest.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   25.3s


Best score: 0.728
Best params: {'max_depth': 2.4005654856361716, 'n_estimators': 188}


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   28.3s finished


In [19]:
forest_clf = rnd_srch_forest.best_estimator_

forest_cv = cross_validate(forest_clf, X_train_forest, y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(forest_cv)

accuracy: 0.732
precision: 0.805
recall: 0.594


### 4.4 Train SVM on Naive Bayes probabilities and non-word features

In [20]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

svc_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

svc_cv = cross_validate(svc_clf, X_train_forest[:1000], y_train_mini, cv=5, scoring=scorers, n_jobs=-1)
print_scores(svc_cv)

accuracy: 0.731
precision: 0.809
recall: 0.606


In [21]:
param_distrib = {'svm__kernel': ['rbf', 'poly'],
                 'svm__C': uniform(1,20),
                 'svm__gamma': reciprocal(.0001, .1),
                }
                 

rnd_srch_svc = RandomizedSearchCV(svc_clf, param_distributions=param_distrib,
                                  cv=5, scoring='accuracy', n_iter=1000, verbose=5, n_jobs=-1)

rnd_srch_svc.fit(X_train_forest[:1000], y_train_mini)

print('Best score: %.3f' % rnd_srch_svc.best_score_)
print('Best params:', rnd_srch_svc.best_params_)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 3088 tasks      | elapsed:   14.9s


Best score: 0.736
Best params: {'svm__C': 11.987095327524536, 'svm__gamma': 0.00701925543856288, 'svm__kernel': 'rbf'}


[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:   23.7s finished


In [22]:
svc_clf = rnd_srch_svc.best_estimator_

svc_clf_cv = cross_validate(svc_clf, X_train_forest, y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(svc_clf_cv)

accuracy: 0.733
precision: 0.829
recall: 0.570


### 4.5 Train naive Random Forest on all the features

In [23]:
forest_clf_v1 = RandomForestClassifier(random_state=42)

forest_cv_v1 = cross_validate(forest_clf_v1, X_train_mini, y_train_mini, cv=5, scoring=scorers, n_jobs=-1)
print_scores(forest_cv_v1)

accuracy: 0.685
precision: 0.714
recall: 0.620


In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

param_distrib = {"n_estimators": list(range(1,500)), "max_depth": reciprocal(2,100)}

rnd_srch_forest_v1 = RandomizedSearchCV(forest_clf_v1, param_distributions=param_distrib,
                                     cv=5, scoring='accuracy', random_state=42,
                                     n_iter=100, verbose=5, n_jobs=-1)

rnd_srch_forest_v1.fit(X_train_mini, y_train_mini)

print('Best score: %.3f' % rnd_srch_forest_v1.best_score_)
print('Best params:', rnd_srch_forest_v1.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   32.4s finished


Best score: 0.723
Best params: {'max_depth': 44.00626778554037, 'n_estimators': 379}


In [25]:
forest_clf_v1 = rnd_srch_forest_v1.best_estimator_

forest_cv_v1 = cross_validate(forest_clf_v1, X_train_prepared, y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(forest_cv_v1)

accuracy: 0.802
precision: 0.875
recall: 0.692


### 4.5 Train naive SVM on all the features

In [26]:
svc_clf_v1 = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

svc_cv_v1 = cross_validate(svc_clf_v1, X_train_mini, y_train_mini, cv=5, scoring=scorers, n_jobs=-1)
print_scores(svc_cv_v1)

accuracy: 0.695
precision: 0.702
recall: 0.680


In [29]:
param_distrib = {'svm__kernel': ['rbf', 'poly'],
                 'svm__C': uniform(1,20),
                 'svm__gamma': reciprocal(.0001, .1),
                }

rnd_srch_svc_v1 = RandomizedSearchCV(svc_clf_v1, param_distributions=param_distrib,
                                  cv=5, scoring='accuracy', n_iter=100, verbose=5, n_jobs=-1)

rnd_srch_svc_v1.fit(X_train_mini, y_train_mini)

print('Best score: %.3f' % rnd_srch_svc_v1.best_score_)
print('Best params:', rnd_srch_svc_v1.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   17.7s finished


Best score: 0.709
Best params: {'svm__C': 3.6108754588885907, 'svm__gamma': 0.0005731628906640736, 'svm__kernel': 'rbf'}


In [None]:
svc_clf_v1 = rnd_srch_svc_v1.best_estimator_

svc_clf_cv_v1 = cross_validate(svc_clf_v1, X_train_prepared, y_train, cv=5, scoring=scorers, n_jobs=5)
print_scores(svc_clf_cv_v1)

## 5 Test on Validation set

Evaluate the three classifiers we've built on the Validation set to see which performs better

### Prepare titles

In [40]:
X_val_prepared = DigitalizeTitle(clckbt_dict=clckbt_words_count, options=[True]*4).transform(X_val)

#### Naive Bayes

In [42]:
y_val_pred = mnb_clf.predict(X_val_prepared[:,:200])

print('Score on the test set: %.3f' % accuracy_score(y_val, y_val_pred))
print('Precision: %.3f' % precision_score(y_val, y_val_pred))
print('Recall: %.3f' % recall_score(y_val, y_val_pred))

Score on the test set: 0.723
Precision: 0.759
Recall: 0.625


In [46]:
y_val_proba = mnb_clf.predict_proba(X_val_prepared[:,:200])[:,1]
y_val_proba = y_val_proba.reshape((len(y_val), 1))
X_val_forest = np.concatenate([y_val_proba, X_val_prepared[:, 200:]], axis=1)

#### Random Forest

In [48]:
forest_clf.fit(X_train_forest, y_train)
y_val_pred = forest_clf.predict(X_val_forest)

print('Score on the test set: %.3f' % accuracy_score(y_val, y_val_pred))
print('Precision: %.3f' % precision_score(y_val, y_val_pred))
print('Recall: %.3f' % recall_score(y_val, y_val_pred))

Score on the test set: 0.726
Precision: 0.791
Recall: 0.588


#### SVC

In [49]:
svc_clf.fit(X_train_forest, y_train)
y_val_pred = svc_clf.predict(X_val_forest)

print('Score on the test set: %.3f' % accuracy_score(y_val, y_val_pred))
print('Precision: %.3f' % precision_score(y_val, y_val_pred))
print('Recall: %.3f' % recall_score(y_val, y_val_pred))

Score on the test set: 0.731
Precision: 0.817
Recall: 0.570


#### Naive Random Forest

In [51]:
forest_clf_v1.fit(X_train_prepared, y_train)
y_val_pred = forest_clf_v1.predict(X_val_prepared)

print('Score on the test set: %.3f' % accuracy_score(y_val, y_val_pred))
print('Precision: %.3f' % precision_score(y_val, y_val_pred))
print('Recall: %.3f' % recall_score(y_val, y_val_pred))

Score on the test set: 0.817
Precision: 0.895
Recall: 0.705


### 6. Final evaluation on the Test Set

Choose the first classifier that performs better than the others

In [52]:
X_test_prep = DigitalizeTitle(clckbt_dict=clckbt_words_count, options=[True]*4).transform(X_test)
y_test_pred = forest_clf_v1.predict(X_test_prep)

print('Score on the test set: %.3f' % accuracy_score(y_test, y_test_pred))
print('Precision: %.3f' % precision_score(y_test, y_test_pred))
print('Recall: %.3f' % recall_score(y_test, y_test_pred))

Score on the test set: 0.820
Precision: 0.897
Recall: 0.719


### 7. Export the model for later use

In [53]:
from joblib import dump

filename = 'clickbait_classifier_v2.joblib'
dump(forest_clf_v1, filename)

['clickbait_classifier_v2.joblib']