In [1]:
import os
import datetime as dt
from datetime import datetime, timedelta
import time
import sqlite3
import re
import math
import numpy as np
from scipy import stats
from sklearn.naive_bayes import MultinomialNB, GaussianNB
import sklearn.linear_model as lm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import metrics,preprocessing,cross_validation
from sklearn.metrics import f1_score
import pandas as pd

###Modeling

In [2]:
def data_clean():
    
    ## ==== Clean data ==== ##
    
    URLS, fulltime, date, time, duration = pull_history(fakehistory)
    df=pd.DataFrame(URLS,columns=['URL'])
    df['time'] = time
    df['freq'] = df.groupby('URL')['URL'].transform('count')
    df['duration'] = duration

    ##Only considering top 16 visited sites for now
    ##Get rid of any rows with time duration of 0

    filt = []
    for i in df.freq:
        filt.append(i)
        
    filt = list(set(filt))
    filt.sort(reverse = True)
    
    df = df[df.duration != 0]
    df = df[df.freq > np.amin(filt[0:18])]
    df = df[['URL', 'time']]
    df = df.reset_index(drop=True)
    
    return df

In [3]:
def data_prep(cleandf):
    
    ## ==== Prep data for modeling ==== ##
    
    sites = list(cleandf.URL)
    unique_sites = list(set(cleandf.URL))
    train_set = {}
    train_inverse_set = {}
    count = 0
    for i in unique_sites:
        count += 1
        train_set[i] = count
        train_inverse_set[count] = i
        
    X = np.array(list(cleandf.time))
    X = train_x.reshape((len(train_x),1))
    y = [train_set[i] for i in sites]
    
    return X, y, train_inverse_set

In [4]:
SEED = 23
bestmodel = {}

## ==== Training & Metrics ==== ##

cleandf = data_clean()
X, y, train_inverse_set = data_prep(cleandf)

n = 10  #repeat 10 times for more precise results
for i in range(n):
    
    #split dataset into training, validation, testing with ratio of 3:1:1
    
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=.4, random_state=i*SEED)
    
    X_test, X_val, y_test, y_val = cross_validation.train_test_split(
        X_test, y_test, test_size=.5, random_state=i*SEED)

################################################################################
## The following tests potential models via cross validation 
## Want to find the optimal values for the tuning parameters
## Returns parameter with best performance(using a F score as the cross-validation metric)

#** Used F-score because it has a nice balance between precision and recall in multiclass classification
        
# pick model
modelname = "randomforest"

################################################################################

if modelname == "randomforest":
    C = np.linspace(10, 300, num = 30)
    models = [RandomForestClassifier(n_estimators = int(c)) for c in C]

if modelname == "gradientboost":
    C = np.linspace(10, 300, num = 30)
    models = [GradientBoostingClassifier(n_estimators = int(c)) for c in C]
    
#### Testing other models in the future

## calculate scores 
cv_scores = [0] * len(models)
for i, model in enumerate(models):
    cv_scores[i] = np.mean(cross_validation.cross_val_score(
            model, X_val, y_val, cv=5, scoring='f1_weighted'))

best = cv_scores.index(max(cv_scores))
best_c = C[best]
best_cv = cv_scores[best]

bestmodel[modelname] = [best_c, best_cv]

for k,v in bestmodel.items():
    print "For %s: Best C = %f; F-score = %f" %(k, v[0], v[1]) 

For randomforest: Best C = 170.000000; F-score = 0.628359
For gradientboost: Best C = 10.000000; F-score = 0.682377


-----------------------------
#### F-scores were not very good.
Went ahead and used best model(gradient boosting) for now.

(ideas for improving: combining models, one-hot encoding with a different model/approach?)

In [5]:
## ==== Training & Metrics cont'd ==== ##

gb = GradientBoostingClassifier(n_estimators = 10)
eval_model = gb.fit(X_train,y_train)
eval_predy = eval_model.predict(X_test)

# Compute F-score metric
print f1_score(y_test, eval_predy, average='weighted')

0.686194393123


In [6]:
## ==== Predictions ==== ##

#get current time
time_now = str(time.strftime('%H:%M'))
adjusted_time_now = [adjusted_time(time.strftime('%H:%M'))]

#retrained the model on the whole dataset when making actual prediction
model_all = gb.fit(X,y)
prediction = model_all.predict(adjusted_time_now)

for p in prediction:
    print "Recommended site at %s: %s" %(adjusted_time_now, train_inverse_set[p])

Recommended site at 18.49: www.reddit.com


--------------------------
####Related work for future

In [None]:
## === Return the most frequently visited website on a specific day. === ##  

df=pd.DataFrame(URLS,columns=['URL'])
df['freq'] = df.groupby('URL')['URL'].transform('count')
df['dates'] = dates
df =  df.sort('freq', ascending = False)
df = df.reset_index(drop=True) 
mode = lambda x: x.mode() if len(x) > 2 else np.array(x)
df = df.groupby('dates')['URL'].agg(mode)
days = dict(df)

today = str(time.strftime("%A"))
print "Most visited site on today(%s) is: %s" %(today, days[today])

Most visited site on today(Monday) is: youtube.com


In [None]:
## === One-hot encoding === ##
#encode the category IDs

X, prep_y, train_inverse_set = data_prep(cleandf)

y = [[prep_y[i]] for i in xrange(len(prep_y))]

encoder = preprocessing.OneHotEncoder()
encoder.fit(y)
y = encoder.transform(y)