In [2]:
import json
import matplotlib.pyplot as plt
import numpy
import math
import seaborn
%matplotlib inline

plt.style.use('ggplot')

In [3]:
INSTANCES_FILE = "clickbait17-train-170331/instances.jsonl"
TRUTH_FILE     = "clickbait17-train-170331/truth.jsonl"

In [4]:
# read thr data from files
import json
instances = []
truth = []

with open(INSTANCES_FILE, "r") as inf:
    instances = [json.loads(x) for x in inf.readlines()]
with open(TRUTH_FILE, "r") as inf:
    truth = [json.loads(x) for x in inf.readlines()]

In [5]:
# compact relevant data into one list of dicts
dataset = {}

# lists: postText, targetParagraphs, targetCaptions
for i in instances:
    dataset[i['id']] = {'postText': i['postText'], 'targetTitle': i['targetTitle'],
                        'targetDescription': i['targetDescription'], 'targetKeywords': i['targetKeywords'], 
                        'targetParagraphs': i['targetParagraphs'], 'targetCaptions': i['targetCaptions']}

for t in truth:
    dataset[t['id']]['truthMean'] = t['truthMean']

# print(list(dataset.keys())[0])
# print(dataset[list(dataset.keys())[0]])

In [6]:
# get list of scores and a list of the postTexts
cb_id_list = numpy.fromiter(iter(dataset.keys()), dtype=numpy.int64)
cb_scores_list = []
cb_feat_postText = []

for i in cb_id_list:
    cb_feat_postText.append(dataset[str(i)]['postText'][0])
    cb_scores_list.append(dataset[str(i)]['truthMean'])

cb_scores = numpy.asarray(cb_scores_list)

In [8]:
from nltk import download
download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [30]:
# sanitize text with nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
import string

lemmatizer = WordNetLemmatizer()
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
punctuation = string.punctuation

def preprocess_tokenize(text):
    for token in tokenize(text):
        token = preprocess(token)
        yield token

def tokenize(text):
    return [token for token in tknzr.tokenize(text)] # if token not in string.punctuation]
            
def preprocess(string):
    string = string.lower()
    string = lemmatizer.lemmatize(string)
    if string.endswith("'s"):
        string = string[:-2]
    return string

__More possible features:__
* avarage text/word length in characters per Tweet
* known entitiy count
* noun frequency
* POS Tag frequency
* stop word frequency
* punctuation frequency
* tweet sentiment
* you word list

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

'''
input=u'content', strip_accents=None, ngram_range=(1, 1) -> all ngrams between (inclusive)
analyzer=u'word' -> {‘word’, ‘char’, ‘char_wb’}
preprocessor=None, tokenizer=None, stop_words=None, token_pattern=u'(?u)\b\w\w+\b', 
'''
# there is a difference in feature dimension(100) if i use my tokenizer vs. the build in one??????
count_vectorizer = CountVectorizer(preprocessor=preprocess, tokenizer=tokenize)
cb_feat_postText_word_onegram = count_vectorizer.fit_transform(cb_feat_postText)
print(cb_feat_postText_word_onegram.shape) # 2459, 8095

(2459, 7405)


__Learn from the extracted features from here on__

In [27]:
from sklearn.model_selection import train_test_split

# split x,y dataset in train/test:
x_train, x_test, y_train, y_test = train_test_split(cb_feat_postText_word_onegram, cb_scores.T, random_state=42)

In [28]:
# model evaluation
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

model = RandomForestRegressor()
model.fit(x_train, y_train)

y_predicted = model.predict(x_test)

# NOTE: mean square is bullshit if values are < 1

print("Explained variance score: " + str(explained_variance_score(y_test, y_predicted)))  # lower means: accounts less for variance in data
print("Mean squared error: " + str(mean_squared_error(y_test, y_predicted)))  # how close does the model get on average
print("Mean absolute error: " + str(mean_absolute_error(y_test, y_predicted)))  # are we biased?
print("Median absolute error: " + str(median_absolute_error(y_test, y_predicted)))  # outliers?
print("R^2: " + str(r2_score(y_test, y_predicted)))  # future predictions

score = cross_val_score(model, x_train, y_train, cv=5)
print("cross_val " + str(score.mean()) + " +- " + str(score.std()))


Explained variance score: 0.234685725047
Mean squared error: 0.0430289501807
Mean absolute error: 0.166686182348
Median absolute error: 0.140000002
R^2: 0.217841248945
cross_val 0.193093162876 +- 0.0318171250821
