In [103]:
import json
import matplotlib.pyplot as plt
import numpy
import math
import seaborn
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

plt.style.use('ggplot')

In [2]:
instances_file_train = "clickbait17-train-170331/instances.jsonl"
truth_file_train     = "clickbait17-train-170331/truth.jsonl"

In [3]:
# read thr data from files
import json
instances = []
truth = []

with open(instances_file_train, "r") as inf:
    instances = [json.loads(x) for x in inf.readlines()]
with open(truth_file_train, "r") as inf:
    truth = [json.loads(x) for x in inf.readlines()]

In [4]:
# compact relevant data into one list of dicts
dataset = {}

# lists: postText, targetParagraphs, targetCaptions
for i in instances:
    dataset[i['id']] = {'postText': i['postText'], 'targetTitle': i['targetTitle'],
                        'targetDescription': i['targetDescription'], 'targetKeywords': i['targetKeywords'], 
                        'targetParagraphs': i['targetParagraphs'], 'targetCaptions': i['targetCaptions']}

for t in truth:
    dataset[t['id']]['truthMean'] = t['truthMean']

# print(list(dataset.keys())[0])
# print(dataset[list(dataset.keys())[0]])

In [5]:
# get list of scores and a list of the postTexts
cb_id_list = numpy.fromiter(iter(dataset.keys()), dtype=numpy.int64)
cb_scores_list = []
cb_feat_postText = []

for i in cb_id_list:
    cb_feat_postText.append(dataset[str(i)]['postText'][0])
    cb_scores_list.append(dataset[str(i)]['truthMean'])

cb_scores = numpy.asarray(cb_scores_list)

In [42]:
from nltk import download
download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [91]:
# sanitize text with nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
import string

lemmatizer = WordNetLemmatizer()
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
punctuation = string.punctuation

def preprocess_tokenize(text):
    for token in tokenize(text):
        token = preprocess(token)
        yield token

def tokenize(text):
    return [token for token in tknzr.tokenize(text)] # if token not in string.punctuation]

def pos_tokenize(text):
    token_list = [[token] for token in tknzr.tokenize(text) if token not in string.punctuation]
    return token_list
            
def preprocess(string):
    string = string.lower()
    string = lemmatizer.lemmatize(string)
    if string.endswith("'s"):
        string = string[:-2]
    return string

__More possible features:__
* avarage text/word length in characters per Tweet
* known entitiy count
* noun frequency
* POS Tag frequency
* stop word frequency
* punctuation frequency
* tweet sentiment
* you word list

__word 1-grams__

In [104]:
# POS Tag frequencies
from nltk.tag import pos_tag_sents

all_pos_tags = [pos_tag_sents(pos_tokenize(tokens)) for tokens in cb_feat_postText]
tag_list_of_lists = []
for tweets in all_pos_tags:
    tag_list_of_lists.append([elements[0][1] for elements in tweets])

pos_tag_cv = CountVectorizer()
cb_feat_pos_frequencies = pos_tag_cv.fit_transform(tag_list_of_lists)
print(cb_feat_pos_frequencies.shape) # 2459, 8095

AttributeError: 'list' object has no attribute 'lower'

In [41]:
# TODO NER on tweets is not trivial, dont use this feature jet
from nltk.tag import StanfordNERTagger

st = StanfordNERTagger('/home/mike4537/opt/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '/home/mike4537/opt/stanford-ner/stanford-ner.jar',
                       encoding='utf-8')

text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

tokenized_text = tokenize(cb_feat_postText[0])
classified_text = st.tag(tokenized_text)

print(classified_text)

[('Foo', 'O'), ('Fighters', 'O'), ('frontman', 'O'), ('Dave', 'PERSON'), ('Grohl', 'PERSON'), ('falls', 'O'), ('off', 'O'), ('stage', 'O'), (',', 'O'), ('breaks', 'O'), ('leg', 'O'), (',', 'O'), ('finishes', 'O'), ('gig', 'O')]


In [7]:
# Word 1-grams matrix
'''
input=u'content', strip_accents=None, ngram_range=(1, 1) -> all ngrams between (inclusive)
analyzer=u'word' -> {‘word’, ‘char’, ‘char_wb’}
preprocessor=None, tokenizer=None, stop_words=None, token_pattern=u'(?u)\b\w\w+\b', 
'''
# there is a difference in feature dimension(100) if i use my tokenizer vs. the build in one??????
count_vectorizer = CountVectorizer(preprocessor=preprocess, tokenizer=tokenize)
cb_feat_postText_word_onegram = count_vectorizer.fit_transform(cb_feat_postText)
print(cb_feat_postText_word_onegram.shape) # 2459, 8095

(2459, 7405)


__tweet length__

In [8]:
cb_feat_wordLength = []
for tweet in cb_feat_postText:
    cb_feat_wordLength.append(len([x for x in preprocess_tokenize(tweet)]))

__known entity count__

In [12]:
from nltk import ne_chunk, pos_tag
from nltk.chunk import tree2conlltags

# how does this work ????

for tweet in cb_feat_postText:
    chunk_tree = ne_chunk(pos_tag([x for x in preprocess_tokenize(tweet)]))
    iob = tree2conlltags(chunk_tree)
    for tag in iob:
        if tag[2] != 'O':
            print(tag)


('#clinton', 'NNP', 'B-GPE')
('#mosul', 'JJ', 'B-GPE')
('#baiji', 'NNP', 'B-GPE')


In [32]:
for tweet in cb_feat_postText:
    tags = pos_tag([x for x in preprocess_tokenize(tweet)])
    print(tags)


[('apple', 'NN'), ("'s", 'POS'), ('io', 'JJ'), ('9', 'CD'), ("'app", 'POS'), ('thinning', 'VBG'), ('feature', 'NN'), ('will', 'MD'), ('give', 'VB'), ('your', 'PRP$'), ('phone', 'NN'), ("'s", 'POS'), ('storage', 'NN'), ('a', 'DT'), ('boost', 'NN')]
[('rt', 'NN'), ('kenbrown12', 'VBZ'), ('emerging', 'VBG'), ('market', 'NN'), ('investor', 'NN'), ('are', 'VBP'), ('doing', 'VBG'), ('their', 'PRP$'), ('best', 'JJS'), ('monty', 'NN'), ('python', 'NN'), ('--', ':'), ("''", "''"), ('run', 'VB'), ('away', 'RB'), ('run', 'VB'), ('away', 'RP'), ("''", "''")]
[('u.s.', 'JJ'), ('soccer', 'NN'), ('should', 'MD'), ('start', 'VB'), ('answering', 'VBG'), ('tough', 'JJ'), ('question', 'NN'), ('about', 'IN'), ('hope', 'NN'), ('solo', 'NN'), ('eric_adelson', 'NN'), ('writes', 'NNS')]
[('how', 'WRB'), ('theme', 'JJ'), ('park', 'NN'), ('like', 'IN'), ('disney', 'NN'), ('world', 'NN'), ('left', 'VBD'), ('the', 'DT'), ('middle', 'JJ'), ('class', 'NN'), ('behind', 'IN')]
[('13', 'CD'), ('classic', 'JJ'), ('’00s

[('rt', 'NN'), ('cnnbrk', 'JJ'), ('germanwings', 'NNS'), ('co-pilot', 'JJ'), ('andreas', 'NNS'), ('lubitz', 'VBP'), ('visited', 'VBN'), ('seven', 'CD'), ('doctor', 'NN'), ('the', 'DT'), ('month', 'NN'), ('before', 'IN'), ('the', 'DT'), ('crash', 'NN'), ('he', 'PRP'), ('feared', 'VBD'), ('he', 'PRP'), ('wa', 'VBZ'), ('going', 'VBG'), ('blind', 'IN'), ('http', 'JJ'), ('//t.co', 'NN')]
[('gun', 'NN'), ('killing', 'VBG'), ('fell', 'VBD'), ('by', 'IN'), ('40', 'CD'), ('percent', 'NN'), ('after', 'IN'), ('connecticut', 'NN'), ('passed', 'VBD'), ('this', 'DT'), ('law', 'NN')]
[('how', 'WRB'), ('to', 'TO'), ('master', 'VB'), ('the', 'DT'), ('genius', 'NN'), ('bar', 'NN'), ('to', 'TO'), ('get', 'VB'), ('the', 'DT'), ('most', 'RBS'), ('out', 'IN'), ('of', 'IN'), ('your', 'PRP$'), ('trip', 'NN'), ('to', 'TO'), ('the', 'DT'), ('apple', 'NN'), ('store', 'NN')]
[('sony', 'NN'), ('music', 'NN'), ('ceo', 'NN'), ('confirms', 'VBZ'), ('apple', 'NN'), ("'s", 'POS'), ('streaming', 'JJ'), ('service', 'NN')

[('snoop', 'NN'), ('dogg', 'NN'), ('ha', 'NN'), ('volunteered', 'VBD'), ('to', 'TO'), ('lead', 'VB'), ('twitter', 'NN'), ('here', 'RB'), ("'s", 'POS'), ('snoop', 'NN'), ('and', 'CC'), ('everyone', 'NN'), ('else', 'RB'), ('who', 'WP'), ('could', 'MD'), ('run', 'VB'), ('it', 'PRP')]
[('leaked', 'VBN'), ('image', 'NN'), ('reveal', 'NN'), ('nest', 'JJS'), ("'s", 'POS'), ('plan', 'NN'), ('to', 'TO'), ('fully', 'RB'), ('absorb', 'VB'), ('dropcam', 'NN')]
[('nra', 'JJ'), ('watchdog', 'NN'), ('group', 'NN'), ('spar', 'VBD'), ('over', 'IN'), ('claim', 'NN'), ('that', 'IN'), ('gun-rights', 'NNS'), ('association', 'NN'), ('violated', 'VBD'), ('campaign', 'NN'), ('finance', 'NN'), ('and', 'CC'), ('tax', 'NN'), ('law', 'NN'), ('by', 'IN'), ('isikoff', 'NN')]
[('rise', 'NN'), ('in', 'IN'), ('new', 'JJ'), ('case', 'NN'), ('show', 'NN'), ('ebola', 'VBZ'), ('ha', 'VBZ'), ('not', 'RB'), ('released', 'VBN'), ('it', 'PRP'), ('deadly', 'RB'), ('grip', 'VBD')]
[('rt', 'NN'), ('mikedebonis', 'NN'), ('dennis'

[('germany', 'JJ'), ('drop', 'NN'), ('inquiry', 'NN'), ('into', 'IN'), ('claim', 'NN'), ('u.s', 'NN'), ('tapped', 'VBD'), ('merkel’s', 'CD'), ('phone', 'NN')]
[('ronaldo', 'NN'), ("'s", 'POS'), ('hat', 'WP'), ('trick', 'JJ'), ('wa', 'NN'), ('nice', 'JJ'), ('but', 'CC'), ('did', 'VBD'), ('he', 'PRP'), ('do', 'VB'), ('it', 'PRP'), ('all', 'DT'), ('in', 'IN'), ('the', 'DT'), ('final', 'JJ'), ('4', 'CD'), ('minute', 'NN'), ('poland', 'NN'), ("'s", 'POS'), ('robert', 'JJ'), ('lewandowski', 'NN'), ('did', 'VBD'), ('watch', 'VB')]
[('jeb', 'NN'), ('bush', 'NN'), ('plan', 'NN'), ('tough', 'JJ'), ('talk', 'NN'), ('on', 'IN'), ('putin', 'NN'), ('during', 'IN'), ('europe', 'NN'), ('trip', 'NN')]
[('video', 'NN'), ('little', 'JJ'), ('kid', 'NN'), ('break', 'VB'), ('his', 'PRP$'), ('opponent', 'NN'), ('ankle', 'NN'), ('with', 'IN'), ('a', 'DT'), ('nasty', 'JJ'), ('crossover', 'NN'), ('thefuture', 'NN')]
[('this', 'DT'), ('rugby', 'NN'), ('player', 'NN'), ('broke', 'VBD'), ('her', 'PRP$'), ('nose', 

[('what', 'WP'), ('nyt', 'JJ'), ('journalist', 'NN'), ('recommend', 'VBP'), ('reading', 'VBG'), ('from', 'IN'), ('around', 'IN'), ('the', 'DT'), ('web', 'NN')]
[('here', 'RB'), ('come', 'VBN'), ('almost', 'RB'), ('free', 'JJ'), ('money', 'NN')]
[('argentinian', 'JJ'), ('businessman', 'NN'), ('alejandro', 'NN'), ('burzaco', 'NN'), ('arrested', 'VBN'), ('in', 'IN'), ('italy', 'NN'), ('over', 'IN'), ('fifa', 'NN'), ('scandal', 'NN')]
[('rt', 'NN'), ('wsjny', 'NN'), ('district', 'NN'), ('attorney', 'NN'), ('say', 'VBP'), ('prison', 'NN'), ('worker', 'NN'), ('may', 'MD'), ('have', 'VB'), ('provided', 'VBN'), ('contraband', 'NN'), ('to', 'TO'), ('escapee', 'VB')]
[('rt', 'NN'), ('nytimeswell', 'NN'), ('dating', 'VBG'), ('with', 'IN'), ('schizophrenia', 'NN')]
[('rt', 'NN'), ('buzzfeedceleb', 'VBZ'), ('a', 'DT'), ('full', 'JJ'), ('history', 'NN'), ('of', 'IN'), ('taylor', 'NN'), ('swift’s', 'NN'), ('alleged', 'VBD'), ('feud', 'NN'), ('with', 'IN'), ('katy', 'NN'), ('perry', 'NN')]
[('rt', 'NN

[('7', 'CD'), ('coworking', 'VBG'), ('space', 'NN'), ('with', 'IN'), ('childcare', 'NN'), ('across', 'IN'), ('the', 'DT'), ('u.s', 'NN')]
[('rt', 'NN'), ('nbcsports', 'NNS'), ('andre', 'VBP'), ('iguodala', 'NN'), ('and', 'CC'), ('the', 'DT'), ('warrior', 'NN'), ('have', 'VBP'), ('evened', 'VBN'), ('the', 'DT'), ('series', 'NN'), ('with', 'IN'), ('the', 'DT'), ('cavs', 'NN'), ('a', 'DT'), ('we', 'PRP'), ('now', 'RB'), ('head', 'VBP'), ('to', 'TO'), ('oakland', 'VB'), ('for', 'IN'), ('game', 'NN'), ('5', 'CD')]
[('rt', 'NN'), ('nytimesphoto', 'JJ'), ('photo', 'NN'), ('of', 'IN'), ('the', 'DT'), ('day', 'NN')]
[('this', 'DT'), ('batman-themed', 'JJ'), ('family', 'NN'), ('photo', 'NN'), ('shoot', 'NN'), ('will', 'MD'), ('make', 'VB'), ('every', 'DT'), ('superhero', 'NN'), ('fan', 'NN'), ('want', 'VBP'), ('to', 'TO'), ('have', 'VB'), ('a', 'DT'), ('baby', 'NN')]
[('charge', 'NN'), ('dropped', 'VBD'), ('against', 'IN'), ('family', 'NN'), ('that', 'WDT'), ('cheered', 'VBD'), ('too', 'RB'), ('

[('apple', 'NN'), ("'s", 'POS'), ('most', 'RBS'), ('important', 'JJ'), ('service', 'NN'), ('is', 'VBZ'), ('about', 'IN'), ('to', 'TO'), ('get', 'VB'), ('so', 'RB'), ('much', 'JJ'), ('better', 'JJR')]
[('rt', 'NN'), ('nytimesworld', 'NN'), ('south', 'NN'), ('korea', 'JJ'), ('ha', 'NN'), ('become', 'VBD'), ('the', 'DT'), ('worst-afflicted', 'JJ'), ('country', 'NN'), ('besides', 'IN'), ('saudi', 'JJ'), ('arabia', 'NN'), ('with', 'IN'), ('mers', 'NNS')]
[('prison', 'NN'), ('worker', 'NN'), ('joyce', 'NN'), ('mitchell', 'NN'), ('did', 'VBD'), ('not', 'RB'), ('give', 'VB'), ('inmate', 'NN'), ('power', 'NN'), ('tool', 'NN'), ('prosecutor', 'NN'), ('say', 'VBP')]
[('footballer', 'NN'), ('dy', 'NN'), ('after', 'IN'), ('collapsing', 'VBG'), ('on', 'IN'), ('pitch', 'NN'), ('during', 'IN'), ('match', 'NN')]
[('lethalweapon', 'NN'), ('and', 'CC'), ('thegoonies', 'NNS'), ('actress', 'VBP'), ('mary', 'JJ'), ('ellen', 'NNS'), ('trainor', 'VBP'), ('ha', 'NN'), ('died', 'VBD'), ('at', 'IN'), ('age', 'NN

[('snoop', 'NN'), ('dogg', 'VBZ'), ('the', 'DT'), ('next', 'JJ'), ('ceo', 'NN'), ('of', 'IN'), ('twitter', 'NN'), ('snoopforceo', 'NN')]
[('road', 'NN'), ('closed', 'VBD'), ('3', 'CD'), ('mile', 'NN'), ('from', 'IN'), ('ny', 'JJ'), ('prison', 'NN'), ('a', 'DT'), ('cop', 'NN'), ('chase', 'JJ'), ('lead', 'NN'), ('on', 'IN'), ('escapee', 'JJ'), ('richard', 'NN'), ('matt', 'NN'), ('and', 'CC'), ('david', 'JJ'), ('sweat', 'NN')]
[('here', 'RB'), ("'s", 'VBZ'), ('why', 'WRB'), ('these', 'DT'), ('ikea', 'JJ'), ('item', 'NN'), ('are', 'VBP'), ('cheaper', 'JJR'), ('now', 'RB'), ('than', 'IN'), ('they', 'PRP'), ('were', 'VBD'), ('30', 'CD'), ('year', 'NN'), ('ago', 'RB'), ('hint', 'NN'), ('cardboard', 'NN')]
[('rt', 'NN'), ('mashabletech', 'NN'), ('former', 'JJ'), ('yahoo', 'NN'), ('exec', 'NN'), ('anandc', 'NN'), ('is', 'VBZ'), ('joining', 'VBG'), ('snapdeal', 'VB'), ('a', 'DT'), ('billion-dollar', 'JJ'), ('indian', 'JJ'), ('marketplace', 'NN')]
[('video', 'NN'), ('cavs', 'NN'), ('fan', 'NN'), 

__Learn from the extracted features from here on__

In [10]:
from sklearn.model_selection import train_test_split

# split x,y dataset in train/test:
x_train, x_test, y_train, y_test = train_test_split(cb_feat_postText_word_onegram, cb_scores.T, random_state=42)

In [11]:
# model evaluation
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

model = RandomForestRegressor()
model.fit(x_train, y_train)

y_predicted = model.predict(x_test)

# NOTE: mean square is bullshit if values are < 1
# cross entropy für differenz zwischen verteilungen 

print("Explained variance score: " + str(explained_variance_score(y_test, y_predicted)))  # lower means: accounts less for variance in data
print("Mean squared error: " + str(mean_squared_error(y_test, y_predicted)))  # how close does the model get on average
print("Mean absolute error: " + str(mean_absolute_error(y_test, y_predicted)))  # are we biased?
print("Median absolute error: " + str(median_absolute_error(y_test, y_predicted)))  # outliers?
print("R^2: " + str(r2_score(y_test, y_predicted)))  # future predictions

score = cross_val_score(model, x_train, y_train, cv=5)
print("cross_val " + str(score.mean()) + " +- " + str(score.std()))


Explained variance score: 0.227639273722
Mean squared error: 0.0436134462009
Mean absolute error: 0.166481485082
Median absolute error: 0.146666662
R^2: 0.207216572414
cross_val 0.16211296821 +- 0.0364715172917
