In [1]:
data_folder = '../data/'

import os
import string
import random
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Load and look into the dataset

In [2]:
train_data = pd.read_csv(os.path.join(data_folder, 'train_data.csv'))

In [3]:
test_data = pd.read_csv(os.path.join(data_folder, 'test_data_labeled.csv'))

In [4]:
train_data.head()

Unnamed: 0,title,text,label
0,PGA Spokesman and Tiger Woods' Agent Deny Drug...,Nicaraguan officials have appealed for witness...,irrelevant
1,Newly-Found Document Holds Eyewitness Account ...,"Dylan Thomas, who has been given the nickname ...",irrelevant
2,Dog found abandoned outside railway station wi...,It's being reported that one of the Batmobiles...,irrelevant
3,‘Evocative shape': Is Vladimir Putin trolling ...,"Boston - So lately, tongues have been wagging ...",irrelevant
4,‘Jihadi John’: The Islamic State killer behind...,LONDON — The identity of the masked executione...,agree


In [5]:
train_data.label.value_counts()

irrelevant    31045
debate         8909
agree          3678
clickbait       840
Name: label, dtype: int64

## Preprocess text

We do it outside of the pipeline because we didn't iterate too much into this

In [8]:
filter_stopwords = True
remove_punkt = True

en_stopwords = stopwords.words('english')
tokenizer =  WordPunctTokenizer()

def preprocess(text, stopwords=False, remove_punkt=False, truncate=None):
    tokens = tokenizer.tokenize(text)
    tokens = [tok.lower() for tok in tokens]
    if stopwords:
        tokens = [tok for tok in tokens if tok not in en_stopwords]
    if remove_punkt:
        tokens = [tok for tok in tokens if tok not in string.punctuation]
    if truncate:
        tokens = tokens[:truncate]
    #tokens = [PorterStemmer().stem(tok) for tok in tokens]
    text_preprocessed = ' '.join(tokens)
    return text_preprocessed

In [9]:
train_data['text_preprocessed'] = train_data['text'].apply(preprocess, args=(filter_stopwords, remove_punkt, None))
train_data['title_preprocessed'] = train_data['title'].apply(preprocess, args=(filter_stopwords, remove_punkt, None))

test_data['text_preprocessed'] = test_data['text'].apply(preprocess, args=(filter_stopwords, remove_punkt, None))
test_data['title_preprocessed'] = test_data['title'].apply(preprocess, args=(filter_stopwords, remove_punkt, None))


## Addition of extra features

To get a decent baseline to this problem it helps to generate some engineered features

### Feature 1 - overlap of headline with title 

Very naïve approach to the problem, mostly to the `irrelevant` category

In [10]:
def overlap_feature(df):
    headline = df['title_preprocessed']
    headline_toks = headline.split()
    headline_num_toks = len(headline_toks)
    count_toks = 0
    for tok in headline_toks:
        if tok in df['text_preprocessed']:
            count_toks += 1
    return 1.0 * count_toks / headline_num_toks


In [11]:
train_data['overlap'] = train_data.apply(overlap_feature, axis=1)
test_data['overlap'] = test_data.apply(overlap_feature, axis=1)
train_data.head()

Unnamed: 0,title,text,label,text_preprocessed,title_preprocessed,overlap
0,PGA Spokesman and Tiger Woods' Agent Deny Drug...,Nicaraguan officials have appealed for witness...,irrelevant,nicaraguan officials appealed witnesses meteor...,pga spokesman tiger woods agent deny drug susp...,0.0
1,Newly-Found Document Holds Eyewitness Account ...,"Dylan Thomas, who has been given the nickname ...",irrelevant,dylan thomas given nickname spider man friends...,newly found document holds eyewitness account ...,0.111111
2,Dog found abandoned outside railway station wi...,It's being reported that one of the Batmobiles...,irrelevant,reported one batmobiles currently detroit batm...,dog found abandoned outside railway station su...,0.0
3,‘Evocative shape': Is Vladimir Putin trolling ...,"Boston - So lately, tongues have been wagging ...",irrelevant,boston lately tongues wagging due recent story...,‘ evocative shape ': vladimir putin trolling w...,0.1
4,‘Jihadi John’: The Islamic State killer behind...,LONDON — The identity of the masked executione...,agree,london — identity masked executioner clutching...,‘ jihadi john ’: islamic state killer behind m...,0.363636


### Feature 2 - clickbait related words 

Once again, a very simple lookup into the "clickbait" texts leads to a lot of rumors/frauds/debunked myths, etc, so we add these counts to see if they help

In [12]:
def clickbait_manual_features(df, where='both'):
    clickbait_words_text_not_headline = [
        'fake',
        'fraud',
        'hoax',
        'false',
        'urban myth',
        'unsupported',
        'debunk',
        'viral',
        'doubt',
        'rumor'
    ]
    
    headline = df['title_preprocessed']
    text = df['text_preprocessed']
    
    counts = {
        'both': 0,
        'head': 0,
        'body': 0,
        'none': 0,
    }
    for w in clickbait_words_text_not_headline:
        if w in text and w in headline:
            counts['both'] += 1
        elif w in headline:
            counts['head'] += 1
        elif w in text:
            counts['body'] += 1
        else:
            counts['none'] += 1

    return counts[where]

In [13]:
train_data['clickbait_word_count_both'] = train_data.apply(clickbait_manual_features, axis=1, args=('both', ))
train_data['clickbait_word_count_head'] = train_data.apply(clickbait_manual_features, axis=1, args=('head', ))
train_data['clickbait_word_count_body'] = train_data.apply(clickbait_manual_features, axis=1, args=('body', ))

test_data['clickbait_word_count_both'] = test_data.apply(clickbait_manual_features, axis=1, args=('both', ))
test_data['clickbait_word_count_head'] = test_data.apply(clickbait_manual_features, axis=1, args=('head', ))
test_data['clickbait_word_count_body'] = test_data.apply(clickbait_manual_features, axis=1, args=('body', ))

train_data.head()

Unnamed: 0,title,text,label,text_preprocessed,title_preprocessed,overlap,clickbait_word_count_both,clickbait_word_count_head,clickbait_word_count_body
0,PGA Spokesman and Tiger Woods' Agent Deny Drug...,Nicaraguan officials have appealed for witness...,irrelevant,nicaraguan officials appealed witnesses meteor...,pga spokesman tiger woods agent deny drug susp...,0.0,0,0,0
1,Newly-Found Document Holds Eyewitness Account ...,"Dylan Thomas, who has been given the nickname ...",irrelevant,dylan thomas given nickname spider man friends...,newly found document holds eyewitness account ...,0.111111,0,0,0
2,Dog found abandoned outside railway station wi...,It's being reported that one of the Batmobiles...,irrelevant,reported one batmobiles currently detroit batm...,dog found abandoned outside railway station su...,0.0,0,0,0
3,‘Evocative shape': Is Vladimir Putin trolling ...,"Boston - So lately, tongues have been wagging ...",irrelevant,boston lately tongues wagging due recent story...,‘ evocative shape ': vladimir putin trolling w...,0.1,0,0,3
4,‘Jihadi John’: The Islamic State killer behind...,LONDON — The identity of the masked executione...,agree,london — identity masked executioner clutching...,‘ jihadi john ’: islamic state killer behind m...,0.363636,0,0,0


### Feature 3 - negatives  

Negatives carry some meaning so we add a bunch of counts related to it.

In [14]:
def negative_manual_features(df, where='body'):
    negation_particles = [
        'not',
        'n\'t'
    ]
    
    headline = df['title']
    text = df['text']
    
    counts = {
        'head': 0,
        'body': 0
    }
    for w in negation_particles:
        if w in headline:
            counts['head'] += 1
        if w in text:
            counts['body'] += 1

    return counts[where]

In [15]:
train_data['negation_body'] = train_data.apply(negative_manual_features, axis=1, args=('body', ))
train_data['negation_head'] = train_data.apply(negative_manual_features, axis=1, args=('head', ))

test_data['negation_body'] = test_data.apply(negative_manual_features, axis=1, args=('body', ))
test_data['negation_head'] = test_data.apply(negative_manual_features, axis=1, args=('head', ))

train_data.head()

Unnamed: 0,title,text,label,text_preprocessed,title_preprocessed,overlap,clickbait_word_count_both,clickbait_word_count_head,clickbait_word_count_body,negation_body,negation_head
0,PGA Spokesman and Tiger Woods' Agent Deny Drug...,Nicaraguan officials have appealed for witness...,irrelevant,nicaraguan officials appealed witnesses meteor...,pga spokesman tiger woods agent deny drug susp...,0.0,0,0,0,1,0
1,Newly-Found Document Holds Eyewitness Account ...,"Dylan Thomas, who has been given the nickname ...",irrelevant,dylan thomas given nickname spider man friends...,newly found document holds eyewitness account ...,0.111111,0,0,0,0,0
2,Dog found abandoned outside railway station wi...,It's being reported that one of the Batmobiles...,irrelevant,reported one batmobiles currently detroit batm...,dog found abandoned outside railway station su...,0.0,0,0,0,2,0
3,‘Evocative shape': Is Vladimir Putin trolling ...,"Boston - So lately, tongues have been wagging ...",irrelevant,boston lately tongues wagging due recent story...,‘ evocative shape ': vladimir putin trolling w...,0.1,0,0,3,2,0
4,‘Jihadi John’: The Islamic State killer behind...,LONDON — The identity of the masked executione...,agree,london — identity masked executioner clutching...,‘ jihadi john ’: islamic state killer behind m...,0.363636,0,0,0,1,0


### Feature 4 - tfidf cosine similarity  

Although it strikes a bit on top of the overlap, we add it to check the difference

In [16]:
def cosine_similarity_feature(df, vectorizer):
    headline = df['title_preprocessed']
    text = df['text_preprocessed']

    head_tfidf = vectorizer.transform([headline])
    text_tfidf = vectorizer.transform([text])

    cosineSimilarities = cosine_similarity(head_tfidf, text_tfidf).flatten()
    return cosineSimilarities[0]


In [17]:
vocabulary = set(' '.join(train_data.text_preprocessed.to_list()).split())
vec = TfidfVectorizer(vocabulary=vocabulary)
vec.fit(train_data.text_preprocessed)

train_data['cos_similariry'] = train_data.apply(cosine_similarity_feature, axis=1, args=(vec, ))
test_data['cos_similariry'] = test_data.apply(cosine_similarity_feature, axis=1, args=(vec, ))

train_data.head()

Unnamed: 0,title,text,label,text_preprocessed,title_preprocessed,overlap,clickbait_word_count_both,clickbait_word_count_head,clickbait_word_count_body,negation_body,negation_head,cos_similariry
0,PGA Spokesman and Tiger Woods' Agent Deny Drug...,Nicaraguan officials have appealed for witness...,irrelevant,nicaraguan officials appealed witnesses meteor...,pga spokesman tiger woods agent deny drug susp...,0.0,0,0,0,1,0,0.0
1,Newly-Found Document Holds Eyewitness Account ...,"Dylan Thomas, who has been given the nickname ...",irrelevant,dylan thomas given nickname spider man friends...,newly found document holds eyewitness account ...,0.111111,0,0,0,0,0,0.00563
2,Dog found abandoned outside railway station wi...,It's being reported that one of the Batmobiles...,irrelevant,reported one batmobiles currently detroit batm...,dog found abandoned outside railway station su...,0.0,0,0,0,2,0,0.0
3,‘Evocative shape': Is Vladimir Putin trolling ...,"Boston - So lately, tongues have been wagging ...",irrelevant,boston lately tongues wagging due recent story...,‘ evocative shape ': vladimir putin trolling w...,0.1,0,0,3,2,0,0.010286
4,‘Jihadi John’: The Islamic State killer behind...,LONDON — The identity of the masked executione...,agree,london — identity masked executioner clutching...,‘ jihadi john ’: islamic state killer behind m...,0.363636,0,0,0,1,0,0.157111


## Oversample the data to get a more balanced dataset

Oversampling helps a bit. If you change the amount of times you oversample each label, you get quite an impact in the F1-score to be optimized. 

In [18]:
debate = train_data[train_data.label=='debate']
agree = train_data[train_data.label=='agree']
clickbait = train_data[train_data.label=='clickbait']
irrelevant = train_data[train_data.label=='irrelevant']

train_data_oversample = pd.concat((
    debate, debate, debate,
    agree, agree, agree, agree, agree, agree, agree, agree,
    clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, clickbait, 
    irrelevant
)).sample(frac=1, random_state=42)

train_data_oversample.head()


Unnamed: 0,title,text,label,text_preprocessed,title_preprocessed,overlap,clickbait_word_count_both,clickbait_word_count_head,clickbait_word_count_body,negation_body,negation_head,cos_similariry
14851,Former British Rapper Reportedly Under Investi...,British intelligence agencies MI5 and MI6 have...,debate,british intelligence agencies mi5 mi6 identifi...,former british rapper reportedly investigation...,0.75,0,0,0,0,0,0.249391
9161,"Catholic Priest Dead For 48 Minutes, Is Miracu...","Boston - So lately, tongues have been wagging ...",clickbait,boston lately tongues wagging due recent story...,catholic priest dead 48 minutes miraculously r...,0.75,0,0,3,2,0,0.250905
35822,"BATMOBILE NOT STOLEN, MTV CONTEST PRIZE REMAIN...","Rajasthan, Oct 14: A man learnt a lesson, the ...",irrelevant,rajasthan oct 14 man learnt lesson hard way at...,batmobile stolen mtv contest prize remains unc...,0.0,0,0,0,1,0,0.0
11599,"Boko Haram Could Release Kidnapped Girls, Nige...",Boko Haram has reportedly agreed to a cease-fi...,debate,boko haram reportedly agreed cease fire nigeri...,boko haram could release kidnapped girls niger...,0.777778,0,0,0,1,0,0.357671
43693,Lawmaker Says ‘At Least 10′ Islamic State Figh...,Homeland Security Secretary Jeh Johnson stated...,debate,homeland security secretary jeh johnson stated...,lawmaker says ‘ least 10 ′ islamic state fight...,0.230769,0,0,1,0,0,0.068731


In [19]:
train_data_oversample.label.value_counts()

irrelevant    31045
agree         29424
debate        26727
clickbait     13440
Name: label, dtype: int64

## Trying out a couple of pipelines

In [20]:
# A few attempts led us to the following combination:

feature_dict = {
    'overlap': 'number',
    'cos_similariry': 'number',
    # 'clickbait_word_count_both': 'number', -> made it worse
    'clickbait_word_count_head': 'number',
    'clickbait_word_count_body': 'number',
    # 'negation_body': 'number', -> made it worse
    # 'negation_head': 'number', -> made it worse
    'title_preprocessed': 'text',
    'text_preprocessed': 'text',
}


In [21]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a column from the dataframe to perform additional transformations on
    """ 
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    
class TextSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def transform(self, X):
        return X[self.key]
    
class NumberSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        return X[[self.key]]
    
    

In [22]:
def build_pipe_and_fit(features, train_df):

    feats = []
    for feature, ftype in features.items():
        if ftype == 'text':
            feature_pipe = Pipeline([
                ('selector', TextSelector(feature)),
                ('tfidf', TfidfVectorizer(max_features=500))
            ])
        elif ftype == 'number':
            feature_pipe = Pipeline([
                ('selector', NumberSelector(feature)),
                ('standard', StandardScaler())
            ])
        feats.append((feature, feature_pipe))
    
    feats_pipe = FeatureUnion(feats)

    pipe = Pipeline([
        ("features", feats_pipe),
        ("clf", RandomForestClassifier(n_estimators=10, random_state=42, n_jobs=-1))
    ])
    
    pipe.fit(train_df, train_df['label'])

    return pipe


def predict(pipe, test_df):

    y_pred = pipe.predict(test_df)
    
    accuracy = accuracy_score(test_df['label'], y_pred)
    f1 = f1_score(test_df['label'], y_pred, average="macro")
    print('Accuracy: {}'.format(accuracy))
    print('F1-score: {}'.format(f1))
    print(['agree', 'clickbait', 'debate', 'irrelevant'])
    print(f1_score(test_df['label'], y_pred, labels=['agree', 'clickbait', 'debate', 'irrelevant'], average=None))
    return


In [23]:
pipe =  build_pipe_and_fit(feature_dict, train_data_oversample)


In [24]:
predict(pipe, test_data)

Accuracy: 0.8630228623145634
F1-score: 0.5446949891830708
['agree', 'clickbait', 'debate', 'irrelevant']
[0.46090346 0.05986395 0.68741067 0.97060188]


## Final considerations

This is just a baseline to serve as an example of a possible pipeline, and there would still be space to play with hyperparameters or add more complex features. 

In particular, it might be usefull to mix in, such as:
- other preprocessing techniques
- certain types of entities
- word embeddings


