In [2]:
import json
import pandas as pd
import numpy as np
from hashlib import sha256
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
import spacy
from spacy.matcher import Matcher
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier

import nltk
nltk.download('stopwords')

from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
import string
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/diogoxavier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# load the medium-sized SpaCy model
nlp = spacy.load('en_core_web_md')

In [4]:
stopWords = set(stopwords.words('english'))

In [35]:
df_union_all = pd.read_csv('data/train_data.csv')

test = pd.read_csv('data/test_data.csv')

In [62]:
sample_df.label.value_counts()

irrelevant    1552
debate         445
agree          184
clickbait       42
Name: label, dtype: int64

In [9]:
df_union_all.label.value_counts()

irrelevant    3511
debate        1012
agree          380
clickbait       97
Name: label, dtype: int64

In [18]:
irrelevant = df_union_all['label'] == 'irrelevant'
debate = df_union_all['label'] == 'debate'
agree = df_union_all['label'] == 'agree'
clickbait = df_union_all['label'] == 'clickbait'

df_irrelevant = df_union_all[irrelevant]
df_debate = df_union_all[debate]
df_agree = df_union_all[agree]
df_clickbait = df_union_all[clickbait]

df_irrelevant = df_irrelevant[:840]
df_debate = df_debate[:840]
df_agree = df_agree[:840]
df_clickbait = df_clickbait[:840]

df_union_all= pd.concat([df_irrelevant, df_debate, df_agree, df_clickbait])


In [6]:
question_words = ['who', 'whos', 'whose', 'what', 'whats', 'whatre', 'when', 'whenre', 'whens', 'couldnt',
        'where', 'wheres', 'whered', 'why', 'whys', 'can', 'cant', 'could', 'will', 'would', 'is',
        'isnt', 'should', 'shouldnt', 'you', 'your', 'youre', 'youll', 'youd', 'here', 'heres',
        'how', 'hows', 'howd', 'this', 'are', 'arent', 'which', 'does', 'doesnt']

contractions = ['tis', 'aint', 'amnt', 'arent', 'cant', 'couldve', 'couldnt', 'couldntve',
                'didnt', 'doesnt', 'dont', 'gonna', 'gotta', 'hadnt', 'hadntve', 'hasnt',
                'havent', 'hed', 'hednt', 'hedve', 'hell', 'hes', 'hesnt', 'howd', 'howll',
                'hows', 'id', 'idnt', 'idntve', 'idve', 'ill', 'im', 'ive', 'ivent', 'isnt',
                'itd', 'itdnt', 'itdntve', 'itdve', 'itll', 'its', 'itsnt', 'mightnt',
                'mightve', 'mustnt', 'mustntve', 'mustve', 'neednt', 'oclock', 'ol', 'oughtnt',
                'shant', 'shed', 'shednt', 'shedntve', 'shedve', 'shell', 'shes', 'shouldve',
                'shouldnt', 'shouldntve', 'somebodydve', 'somebodydntve', 'somebodys',
                'someoned', 'someonednt', 'someonedntve', 'someonedve', 'someonell', 'someones',
                'somethingd', 'somethingdnt', 'somethingdntve', 'somethingdve', 'somethingll',
                'somethings', 'thatll', 'thats', 'thatd', 'thered', 'therednt', 'theredntve',
                'theredve', 'therere', 'theres', 'theyd', 'theydnt', 'theydntve', 'theydve',
                'theydvent', 'theyll', 'theyontve', 'theyre', 'theyve', 'theyvent', 'wasnt',
                'wed', 'wedve', 'wednt', 'wedntve', 'well', 'wontve', 'were', 'weve', 'werent',
                'whatd', 'whatll', 'whatre', 'whats', 'whatve', 'whens', 'whered', 'wheres',
                'whereve', 'whod', 'whodve', 'wholl', 'whore', 'whos', 'whove', 'whyd', 'whyre',
                'whys', 'wont', 'wontve', 'wouldve', 'wouldnt', 'wouldntve', 'yall', 'yalldve',
                'yalldntve', 'yallll', 'yallont', 'yallllve', 'yallre', 'yallllvent', 'yaint',
                'youd', 'youdve', 'youll', 'youre', 'yourent', 'youve', 'youvent']

In [27]:
def getCaps(df):
    df['caps'] = df['title'].str.findall(r'[A-Z]').str.len()
    return df

def getNumbers(df):
    df['numbers'] = df['title'].str.findall(r'[0.9]').str.len()
    return df

def getExclamationText(df):
    df['exclamation_text'] = df['text'].str.count('!')
    return df

def getExclamationTittle(df):    
    df['exclamation_title'] = df['title'].str.count('!')
    return df
    
def getLenText(df):
    df['text_len'] = df['text'].str.split().map(len)
    return df
    
def getLenTitle(df):
    df['title_len'] = df['title'].str.split().map(len)
    return df

def num_contract(text):
    s = text.split()
    num = len([word for word in s if word in contractions])
    return num
 
import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)



In [36]:
stop = stopwords.words('english')

def process_text(text):
    result = text.replace('/', '').replace('\n', '')
    result = re.sub(r'(\w)(\1{2,})', r'\1', result)
    result = re.sub(r'(?x)\b(?=\w*\d)\w+\s*', '', result)
    result = ''.join(t for t in result if t not in string.punctuation)
    result = ' '.join([word for word in result.split() if word not in stop])
    result = re.sub(r' +', ' ', result).lower().strip()
    return result

df_union_all = getCaps(df_union_all)
df_union_all = getNumbers(df_union_all)
df_union_all = getExclamationText(df_union_all)
df_union_all = getExclamationTittle(df_union_all)
df_union_all = getLenText(df_union_all)
df_union_all = getLenTitle(df_union_all)
df_union_all['title_to_vector'] = df_union_all['title'].apply(lambda x: text_to_vector(x))
df_union_all['text_to_vector'] = df_union_all['text'].apply(lambda x: text_to_vector(x))
a = []
for idx in range(df_union_all.shape[0]):
    a.append(get_cosine(df_union_all.title_to_vector[idx], df_union_all.text_to_vector[idx]))
df_union_all['cosine'] = a
df_union_all['text'] = df_union_all['text'].apply(process_text)
df_union_all['title'] = df_union_all['title'].apply(process_text)
df_union_all['num_contract_text'] = df_union_all['text'].apply(num_contract)
df_union_all['num_contract_title'] = df_union_all['title'].apply(num_contract)

In [37]:
test = getCaps(test)
test = getNumbers(test)
test = getExclamationText(test)
test = getExclamationTittle(test)
test = getLenText(test)
test = getLenTitle(test)
test['title_to_vector'] = test['title'].apply(lambda x: text_to_vector(x))
test['text_to_vector'] = test['text'].apply(lambda x: text_to_vector(x))
a = []
for idx in range(test.shape[0]):
    a.append(get_cosine(test.title_to_vector[idx], test.text_to_vector[idx]))
test['cosine'] = a
test['text'] = test['text'].apply(process_text)
test['title'] = test['title'].apply(process_text)
test['num_contract_text'] = test['text'].apply(num_contract)
test['num_contract_title'] = test['title'].apply(num_contract)

In [None]:
test = getCaps(test)
test = getNumbers(test)
test = getExclamationText(test)
test = getExclamationTittle(test)
test = getLenText(test)
test = getLenTitle(test)
df_union_all['text'].apply(lambda x: [item for item in x if item not in stop])
df_union_all['title'].apply(lambda x: [item for item in x if item not in stop])
test['text'] = test['text'].apply(process_text)
test['title'] = test['title'].apply(process_text)
test['num_contract_text'] = test['text'].apply(num_contract)
test['num_contract_title'] = test['title'].apply(num_contract)

In [167]:
df_union_all

Unnamed: 0,title,text,label,caps,numbers,exclamation_text,exclamation_title,text_len,title_len,title_to_vector,text_to_vector,cosine,num_contract_text,num_contract_title
0,pumpkin spice condoms and other flavors that a...,everyone knows officially pumpkin spice season...,agree,9,0,0,0,278,10,"{'Pumpkin': 1, 'Spice': 1, 'Condoms': 1, 'And'...","{'Everyone': 1, 'knows': 1, 'it': 8, 's': 6, '...",0.108640,8,0
1,macaulay culkin victim another online death hoax,is macaulay culkin dead no isnt internet would...,agree,7,0,0,0,271,8,"{'Macaulay': 1, 'Culkin': 1, 'Victim': 1, 'of'...","{'Is': 1, 'Macaulay': 3, 'Culkin': 5, 'dead': ...",0.155647,2,0
2,canadian woman captured isis syria says actual...,the canadian woman reportedly kidnapped isis s...,agree,6,0,0,0,486,20,"{'Canadian': 1, 'woman': 1, 'captured': 1, 'by...","{'The': 1, 'Canadian': 3, 'woman': 1, 'who': 4...",0.421827,4,0
3,dhs rebuffs congressman’s claim isis infiltrat...,congressional republicans pushing “isis at bor...,agree,13,0,0,0,249,8,"{'DHS': 1, 'Rebuffs': 1, 'Congressman': 1, 's'...","{'Congressional': 1, 'Republicans': 1, 'have':...",0.160274,0,0
4,tropical spider burrowed mans skin appendix sc...,dylan thomas found spider burrowed way scara y...,agree,10,0,0,0,219,15,"{'Tropical': 1, 'spider': 1, 'burrowed': 1, 'u...","{'Dylan': 2, 'Thomas': 4, 'found': 2, 'the': 6...",0.241160,0,0
5,parents furious seven girls aged fall pregnant...,seven girls aged fallen pregnant going school ...,agree,1,0,0,0,392,14,"{'Parents': 1, 'are': 1, 'furious': 1, 'after'...","{'Seven': 1, 'girls': 9, 'aged': 2, 'between':...",0.326695,3,0
6,no banksy arrested sigh,elusive graffiti artist banksy’s cover blown u...,agree,2,0,0,0,131,7,"{'No': 1, 'Banksy': 1, 'has': 1, 'not': 1, 'be...","{'Elusive': 1, 'graffiti': 1, 'artist': 2, 'Ba...",0.164717,0,0
7,is beheads briton david cawthorne haines threa...,uk prime minister cameron confirmed haines dea...,agree,13,0,0,0,19,13,"{'IS': 1, 'Beheads': 1, 'Briton': 2, 'David': ...","{'UK': 2, 'Prime': 1, 'Minister': 1, 'Cameron'...",0.107676,0,0
8,staff reporter,check photo shows chinese workers used manipul...,agree,2,0,0,0,248,2,"{'Staff': 1, 'Reporter': 1}","{'Check': 1, 'out': 2, 'this': 2, 'photo': 5, ...",0.000000,1,0
9,an indian civil servant just got sacked after ...,reuters even india government jobs considered ...,agree,15,0,0,0,202,16,"{'An': 1, 'Indian': 1, 'Civil': 1, 'Servant': ...","{'Reuters': 1, 'Even': 2, 'in': 8, 'India': 2,...",0.000000,0,0


In [151]:
split_text = df_union_all['text'].str.split()
split_title = df_union_all['title'].str.split()

common_word_text = []
for word in split_text:
    counter = Counter(word)
    most_occur = counter.most_common(5)
    common_word_text.append(most_occur)

common_word_title = []
for word in split_title:
    counter = Counter(word)
    most_occur = counter.most_common(5)
    common_word_title.append(most_occur)



22235

In [153]:
import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)



In [78]:
docs_text = list(nlp.pipe(df_union_all['text']))
docs_title = list(nlp.pipe(df_union_all['title']))

matcher = Matcher(nlp.vocab)
pattern = pattern = [{'POS': 'NOUN'}]

matcher.add("nouns", None, pattern)
#
nouns_counts_text = []
for doc in docs_text:
    matches = matcher(doc)
    nouns_counts_text.append(len(matches))
    
nouns_counts_title = []
for doc in docs_title:
    matches = matcher(doc)
    nouns_counts_title.append(len(matches))

In [67]:
df_union_all['nouns_counts_text'] = nouns_counts_text
df_union_all['nouns_counts_title'] = nouns_counts_title

In [59]:
docs_text = list(nlp.pipe(test['text']))
docs_title = list(nlp.pipe(test['title']))

matcher = Matcher(nlp.vocab)
pattern = pattern = [{'POS': 'NOUN'}]

matcher.add("nouns", None, pattern)
#
nouns_counts_text = []
for doc in docs_text:
    matches = matcher(doc)
    nouns_counts_text.append(len(matches))
    
test['nouns_counts_text'] = nouns_counts_text

KeyboardInterrupt: 

In [38]:
train_data, test_data = train_test_split(df_union_all, test_size=0.2, random_state=42)

In [12]:
train_data

Unnamed: 0,title,text,label,caps,numbers,exclamation_text,exclamation_title,text_len,title_len,title_to_vector,text_to_vector,cosine,num_contract_text,num_contract_title
13494,report a radically redesigned inch macbook air...,reuters a canadian soldier shot canadian war m...,irrelevant,10,0,0,0,80,9,"{'Report': 1, 'A': 1, 'Radically': 1, 'Redesig...","{'Reuters': 1, 'A': 1, 'Canadian': 3, 'soldier...",0.029748,0,0
9124,unverified video shows beheading aid worker da...,the racket report – kfc gets occupational busi...,irrelevant,3,0,0,0,264,9,"{'Unverified': 1, 'video': 1, 'shows': 1, 'beh...","{'The': 2, 'Racket': 1, 'Report': 1, 'KFC': 6,...",0.082342,1,0
1586,homeland security no isil fighters us border,the department homeland security definitively ...,agree,9,2,0,0,433,8,"{'Homeland': 1, 'Security': 1, 'No': 1, 'ISIL'...","{'The': 4, 'Department': 2, 'of': 4, 'Homeland...",0.228409,1,0
6,met police denies reports banksy arrest,the rumour banksy arrested circulating interne...,agree,2,0,0,0,436,7,"{'Met': 1, 'police': 1, 'denies': 1, 'reports'...","{'The': 8, 'rumour': 2, 'that': 5, 'Banksy': 1...",0.193075,6,0
2743,who investigates media reports isis fighters c...,baghdad morning tariq alali mowaffak revealed ...,debate,8,0,0,0,171,8,"{'WHO': 1, 'investigates': 1, 'media': 1, 'rep...","{'BAGHDAD': 1, 'morning': 2, 'Tariq': 1, 'al':...",0.013910,3,0
14480,us confirms death somalia terror group leader,share seven year old superhero abilities nick ...,irrelevant,3,0,0,0,281,8,"{'US': 1, 'confirms': 1, 'death': 1, 'of': 1, ...","{'Share': 1, 'this': 4, 'Seven': 1, 'year': 1,...",0.058471,0,0
11276,watch us marine get shot head taliban sniper l...,a married tv actor met young woman kissed saun...,irrelevant,9,0,0,0,193,16,"{'Watch': 1, 'US': 1, 'Marine': 1, 'get': 1, '...","{'A': 2, 'married': 1, 'TV': 1, 'actor': 2, 'w...",0.164251,0,0
12366,accused boston marathon bomber severely injure...,a yearold canadianisraeli woman traveled iraq ...,irrelevant,14,0,1,0,257,14,"{'Accused': 1, 'Boston': 1, 'Marathon': 1, 'Bo...","{'A': 1, '31': 2, 'year': 1, 'old': 1, 'Canadi...",0.000000,0,0
19403,kim jongun relying ‘cobra wine’ problems trous...,a picture letter parents box lego going viral ...,irrelevant,2,0,1,0,206,11,"{'Kim': 1, 'Jong': 1, 'un': 1, 'relying': 1, '...","{'A': 5, 'picture': 1, 'of': 5, 'a': 7, 'lette...",0.050932,0,0
18809,isis pays tribute cub baghdadi child soldier,apple took leap wearable space announcement ap...,irrelevant,6,0,1,0,386,10,"{'ISIS': 1, 'pays': 1, 'tribute': 1, 'to': 1, ...","{'Apple': 16, 'took': 1, 'the': 32, 'leap': 1,...",0.149417,0,0


In [16]:
# Custom transformer to implement sentence cleaning
class TextCleanerTransformer(TransformerMixin):
    def __init__(self, tokenizer, stemmer, regex_list,
                 lower=True, remove_punct=True):
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        self.regex_list = regex_list
        self.lower = lower
        self.remove_punct = remove_punct
        
    def transform(self, X, *_):
        X = list(map(self._clean_sentence, X))
        return X
    
    def _clean_sentence(self, sentence):
        
        # Replace given regexes
        for regex in self.regex_list:
            sentence = re.sub(regex[0], regex[1], sentence)
            
        # lowercase
        if self.lower:
            sentence = sentence.lower()

        # Split sentence into list of words
        words = self.tokenizer.tokenize(sentence)
            
        # Remove punctuation
        if self.remove_punct:
            words = list(filter(lambda x: x not in string.punctuation, words))

        # Stem words
        if self.stemmer:
            words = map(self.stemmer.stem, words)

        # Join list elements into string
        sentence = " ".join(words)
        
        return sentence
    
    def fit(self, *_):
        return self

In [17]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a column from the dataframe to perform additional transformations on
    """ 
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    

class TextSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def transform(self, X):
        return X[self.key]
    
    
class NumberSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        return X[[self.key]]
    
def get_f1_score(feats, train_data, test_data):
    """
    Return the accuracy on the test_data by using a RandomForestClassifier trained on the 
    train_data with the features described by feats
    """

    pipeline = Pipeline([
        ('features',feats),
        ('classifier', RandomForestClassifier(random_state = 42, n_estimators=10)),
    ])

    pipeline.fit(train_data, train_data.label)

    f1 = f1_score(test_data.label, pipeline.predict(test_data), average='macro')
    
    print("F1_score: {:.4f}".format(f1))
    
    return f1

def get_predictions(feats, train_data, test_data, test):
    """
    Return the accuracy on the test_data by using a RandomForestClassifier trained on the 
    train_data with the features described by feats
    """

    pipeline = Pipeline([
        ('features',feats),
        ('classifier', RandomForestClassifier(random_state = 42, n_estimators=10)),
    ])

    pipeline.fit(train_data, train_data.label)

    y_test = pipeline.predict(test)
    
    return y_test   

In [None]:
tokenizer = WordPunctTokenizer()
stemmer = SnowballStemmer("english", ignore_stopwords=True)
regex_list = [("<[^>]*>", "")]

text = Pipeline([
                ('selector', TextSelector("text")),
                ('transformer',TextCleanerTransformer(tokenizer, stemmer, regex_list)),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 2),stop_words='english'))
            ])

title = Pipeline([
                ('selector', TextSelector("title")),
                ('transformer',TextCleanerTransformer(tokenizer, stemmer, regex_list)),
                ('tfidf', TfidfVectorizer(ngram_range=(1, 2),stop_words='english'))
            ])

numbers = Pipeline([
                ('selector', NumberSelector("numbers")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

caps = Pipeline([
                ('selector', NumberSelector("caps")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

excl_text = Pipeline([
                ('selector', NumberSelector("exclamation_text")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

excl_title = Pipeline([
                ('selector', NumberSelector("exclamation_title")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

text_len = Pipeline([
                ('selector', NumberSelector("text_len")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

title_len = Pipeline([
                ('selector', NumberSelector("title_len")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

title_contractions = Pipeline([
                ('selector', NumberSelector("num_contract_title")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

text_contractions = Pipeline([
                ('selector', NumberSelector("num_contract_text")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

nouns_text = Pipeline([
                ('selector', NumberSelector("nouns_counts_text")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

nouns_title = Pipeline([
                ('selector', NumberSelector("nouns_counts_title")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

cosine = Pipeline([
                ('selector', NumberSelector("cosine")),
                ('tfidf', StandardScaler(with_std=False, with_mean=False))
            ])

feats = FeatureUnion([('text', text),
                      ('title', title),
                      ('caps', caps),
                      ('text_contractions', text_contractions),
                      ('cosine',cosine)])

teste = get_f1_score(feats, train_data, test_data)
#F1_score_1: 0.5473
#F1_score_2: 0.5572
#F1_score: 0.5738 - sem caps
#F1_score: 0.5844 - sem exclamation no dois, texto e titulo com caps e lens.
#F1_score: 0.5616 - com exclamation no titulo
#F1_score: 0.5469 - ngrams 1,3

#F1_score: 0.4971
#F1_score: 0.4924
#F1_score: 0.6023
#F1_score: 0.6530

In [None]:
y_test = get_predictions(feats, train_data, test_data, test)

In [None]:
df_submission = pd.DataFrame(y_test)
df_submission.columns = ['label']
df_submission.index.name = 'News ID'

In [None]:
df_submission.to_csv('submission_5.csv', index=True, header=True)

In [86]:
!ls

data  environment.yml  README.md  submission_1.csv  Untitled.ipynb
