In [1]:
import json
import matplotlib.pyplot as plt
import numpy
import math
import seaborn
%matplotlib inline

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [2]:
# Input files of the clickbait17 training dump; both live in the same directory.
DATA_DIR = "clickbait17-train-170331"
INSTANCES_FILE = DATA_DIR + "/instances.jsonl"  # one JSON record per post
TRUTH_FILE = DATA_DIR + "/truth.jsonl"          # one JSON record per annotation result

In [3]:
# read the data from files (JSON Lines: one JSON document per line)
import json


def read_jsonl(path):
    """Parse a JSON Lines file into a list holding one decoded object per line.

    The file is opened as UTF-8 explicitly so decoding does not depend on the
    platform's default locale encoding, and blank lines (e.g. a trailing empty
    line) are skipped instead of making ``json.loads`` raise.

    Parameters:
        path: location of the ``.jsonl`` file to read.

    Returns:
        list of the decoded JSON values, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as inf:
        # iterate the file lazily rather than materialising readlines() first
        return [json.loads(line) for line in inf if line.strip()]


instances = read_jsonl(INSTANCES_FILE)
truth = read_jsonl(TRUTH_FILE)

In [4]:
# Merge the relevant instance fields and the truth label into one dict of dicts,
# keyed by instance id: dataset[id] -> {field: value, ..., 'truthMean': value}.
# lists: postText, targetParagraphs, targetCaptions
kept_fields = ('postText', 'targetTitle', 'targetDescription', 'targetKeywords',
               'targetParagraphs', 'targetCaptions')

dataset = {
    instance['id']: {field: instance[field] for field in kept_fields}
    for instance in instances
}

# attach the label to the record with the matching id
for label in truth:
    dataset[label['id']]['truthMean'] = label['truthMean']

# to inspect a single record while debugging: next(iter(dataset.items()))

In [5]:
# get feature matrix
# Instance ids converted to int64; entry k belongs to row k of the lists built below.
cb_id_list = numpy.fromiter(dataset.keys(), dtype=numpy.int64)

# Walk the records directly instead of looking each one up again through
# dataset[str(int_id)]: that str -> int -> str round-trip raises KeyError for any id
# whose text is not in canonical integer form (e.g. leading zeros) and repeats work.
# A dict iterates keys() and values() in the same order, so rows stay aligned with
# cb_id_list.
cb_feat_postText = [record['postText'][0] for record in dataset.values()]  # first postText entry only
cb_scores_list = [record['truthMean'] for record in dataset.values()]

cb_scores = numpy.asarray(cb_scores_list)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Relevant CountVectorizer options, for reference. These are real comments rather
# than a bare string literal: in a non-raw string '\b' is a backspace character and
# '\w' is an invalid escape sequence that newer Python versions warn about.
#   input=u'content', strip_accents=None, ngram_range=(1, 1) -> all ngrams between (inclusive)
#   analyzer=u'word' -> {‘word’, ‘char’, ‘char_wb’}
#   preprocessor=None, tokenizer=None, stop_words=None, token_pattern=u'(?u)\b\w\w+\b'
count_vectorizer = CountVectorizer()  # all options left at their defaults
cb_feat_postText_word_onegram = count_vectorizer.fit_transform(cb_feat_postText)
print(cb_feat_postText_word_onegram.shape)  # (number of posts, vocabulary size)

(2459, 7493)


In [9]:
# Hold out a test split for evaluation; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

features = cb_feat_postText_word_onegram  # x: word-count matrix of the post texts
targets = cb_scores.T                     # y: truthMean score per post

# no explicit sizes are given, so train_test_split uses its default proportions
x_train, x_test, y_train, y_test = train_test_split(features, targets, random_state=0)

In [10]:
# model evaluation
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Seed the forest: it is a randomized estimator, so without a fixed random_state the
# numbers printed below change on every run (the train/test split already uses seed 0).
model = RandomForestRegressor(random_state=0)
model.fit(x_train, y_train)

y_predicted = model.predict(x_test)

# (label, metric function) pairs, each evaluated on the held-out test split
test_metrics = [
    ("Explained variance score", explained_variance_score),  # share of target variance captured; 1.0 is best
    ("Mean squared error", mean_squared_error),  # average squared error; penalizes large misses
    ("Mean absolute error", mean_absolute_error),  # average magnitude of the error
    ("Median absolute error", median_absolute_error),  # typical error; robust to outliers
    ("R^2", r2_score),  # 1.0 is perfect; 0.0 is no better than always predicting the mean
]
for label, metric in test_metrics:
    print(label + ": " + str(metric(y_test, y_predicted)))

# 5-fold cross-validation on the training split. cross_val_score fits clones of the
# estimator, and with no `scoring` argument it uses the regressor's own score (R^2).
score = cross_val_score(model, x_train, y_train, cv=5)
print("cross_val " + str(score.mean()) + " +- " + str(score.std()))


Explained variance score: 0.193781134777
Mean squared error: 0.0435752720845
Mean absolute error: 0.164851726352
Median absolute error: 0.140000003
R^2: 0.176820714987
cross_val 0.126930129933 +- 0.043693390432
