In [1]:
import json
import numpy as np

path = "../tweet_data/"

files = ["tweets_#gohawks.txt", "tweets_#gopatriots.txt", \
        "tweets_#nfl.txt", "tweets_#patriots.txt", \
        "tweets_#sb49.txt", "tweets_#superbowl.txt"]

for i, fl in enumerate(files):
    print "files[" + str(i) + "] => " + fl

files[0] => tweets_#gohawks.txt
files[1] => tweets_#gopatriots.txt
files[2] => tweets_#nfl.txt
files[3] => tweets_#patriots.txt
files[4] => tweets_#sb49.txt
files[5] => tweets_#superbowl.txt


In [16]:
# transform the 2015 Feb. 1, 8:00 am and 8:00 pm into timestamp form
import datetime, time
import pytz

start_time = time.mktime(time.strptime("2015-02-01 08:00:00",'%Y-%m-%d %H:%M:%S'))
end_time = time.mktime(time.strptime("2015-02-01 20:00:00",'%Y-%m-%d %H:%M:%S'))
# used as the zero point of the time
base_time = time.mktime(time.strptime("2015-01-01 00:00:00",'%Y-%m-%d %H:%M:%S'))

In [None]:
def floor_date(date):
    return date // 3600

In [24]:
# the extracted data contains
# 0. the citation date 1. the favourite_count 2. the followers number
# 3. the length of the title 4. the number of the twitters
def load_file(file):
    '''
    load the time, follows, and retweets
    '''
    # 0. before Feb. 1, 8:00 a.m. 1. between 2. after 8:00 p.m.
    result = [[], [], []]
    for line in open(path + file, 'r') :
        tmp = []
        a = json.loads(line)
        citation_date = a['citation_date']
        tmp.append(int((citation_date - base_time) // 3600))
        tmp.append(a['author']['followers'])
        tmp.append(a['tweet']['user']['favourites_count'])
        tmp.append(len(a['title']))
        tmp.append(1)
        if citation_date < start_time:
            result[0].append(tmp)
        elif citation_date < end_time:
            result[1].append(tmp)
        else:
            result[2].append(tmp)
    return result

In [25]:
gohawks = load_file(files[0])

In [26]:
print gohawks[0][0]

[394, 1752.0, 9490, 52, 1]


In [70]:
def group_data(dataset):
    '''
    parameter dataset is one of the three time slot defined above, namely, data[0], data[1], data[2]
    count all the features in 1-hour slot
    
    return the train feature (train_set[0:4]) and the result (train_set[5])
    '''
    # get the maximum and minimum time
    max_time = dataset[0][0]
    min_time = dataset[0][0]
    for p in dataset:
        time = p[0]
        if max_time < time:
            max_time = time
        if min_time > time:
            min_time = time
    
    train_set = []
    
    for i in range(min_time, max_time + 1):
        train_set.append([0] * 5)
    
    for p in dataset:
        index = p[0] - min_time
        train_set[index][0] = p[0]
        for i in range(1, 5):
            train_set[index][i] += p[i]
    
    result = []
    for i in range(0, max_time - min_time):
        result.append(train_set[i + 1][4])
    
    return train_set[:-1], result

In [75]:
X, y = group_data(gohawks[0])

In [78]:
def get_error(predict, target):
    '''
        used to get the average error between predict and target
    '''
    length = len(target)
    return sum([abs(predict[i] - target[i]) for i in range(length)]) / float(length)

In [79]:
# using k-fold to divide the train and test set 
from sklearn.model_selection import KFold

n_splits = 10
kf = KFold(n_splits = n_splits, shuffle = True)

def k_fold(X, y, train_model):
    '''
        use k-fold to split the data and use train_model to train and predict
        return the average error of the model
    '''
    error = 0
    for train_index, test_index in kf.split(X):
        train_feature = []
        test_feature = []
        train_result = []
        test_result = []
        for index in train_index:
            train_feature.append(X[index])
            train_result.append(y[index])
        for index in test_index:
            test_feature.append(X[index])
            test_result.append(y[index])
            
        model = train_model.fit(train_feature, train_result)
        predict = model.predict(test_feature)
        error += get_error(predict, test_result)
    return error / n_splits

In [117]:
%%capture
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor

lr = linear_model.LinearRegression()
clf = MLPRegressor(alpha = 0.001, hidden_layer_sizes = (500,), \
                           activation = 'tanh', verbose = 'True', learning_rate = 'adaptive')

In [123]:
%%capture
time_range = {0: "Before Feb. 1, 8:00 a.m. ", 1: "Between Feb. 1, 8:00 a.m. and 8:00 p.m. ", \
              2: "After Feb. 1, 8:00 p.m."}
final_result = []
for f in files:
    file_data = load_file(f)
    for i in range(3):
        X, y = group_data(file_data[i])
        a = "the file is " + f + " , the time range is " + time_range[i] + \
        " and the error is " + str(k_fold(X, y, lr)) + \
        " and " + str(k_fold(X, y, clf))
        final_result.append(a)

In [124]:
print final_result

['the file is tweets_#gohawks.txt , the time range is Before Feb. 1, 8:00 a.m.  and the error is 298.78617575083246 and 229.96939577973026', 'the file is tweets_#gohawks.txt , the time range is Between Feb. 1, 8:00 a.m. and 8:00 p.m.  and the error is 5954.150163060607 and 5598.004875977421', 'the file is tweets_#gohawks.txt , the time range is After Feb. 1, 8:00 p.m. and the error is 24.45815507934528 and 37.66107208859721', 'the file is tweets_#gopatriots.txt , the time range is Before Feb. 1, 8:00 a.m.  and the error is 14.079448973180286 and 14.157146051560655', 'the file is tweets_#gopatriots.txt , the time range is Between Feb. 1, 8:00 a.m. and 8:00 p.m.  and the error is 871.3723275871664 and 1474.0190328947078', 'the file is tweets_#gopatriots.txt , the time range is After Feb. 1, 8:00 p.m. and the error is 3.6269646255435632 and 3.462097063680017', 'the file is tweets_#nfl.txt , the time range is Before Feb. 1, 8:00 a.m.  and the error is 125.22197523254142 and 189.13199051240

In [80]:
from sklearn import linear_model

lr = linear_model.LinearRegression()
print k_fold(X, y, lr)

303.02162181459596


In [115]:
%%capture
from sklearn.neural_network import MLPRegressor

clf = MLPRegressor(alpha = 0.001, hidden_layer_sizes = (500,), \
                           activation = 'tanh', verbose = 'True', learning_rate = 'adaptive')
x = k_fold(X, y, clf)

In [116]:
x

223.57008211494954