In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

data_path = '../raw_data/100k_data.csv'
df_all = pd.read_csv(data_path)
df_sample = df_all.dropna().sample(10000,random_state=0)

In [2]:
import pandas as pd
import numpy as np

def binary_cat_upvotes(original_df, threshold=30):
    """
    Takes column from df called 'upvotes' and returns df with new column
    'cat_upvotes' which is 1 if upvotes is above threshold, and 0 otherwise.
    """
    df = original_df.copy()
    if 'upvotes' not in original_df.columns:
        raise ValueError("df has no column named 'upvotes'")
    def trans(number):
        if number >= threshold:
            return 1
        else:
            return 0
    df['cat_upvotes'] = df['upvotes'].apply(trans)
    return df

def multi_cat_upvotes(original_df, int_list=[10,100,1000]):
    """
    Takes column from df and returns df with new
    column 'cat_upvotes' based on list passed as an argument
    """
    df = original_df.copy()
    def trans(number):
        for index, integer in enumerate(int_list):
            if number < integer:
                return index
        return len(int_list)
    df['cat_upvotes'] = df['upvotes'].apply(trans)
    return df

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import time
import math

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

def basic(original_df,keep_timestamp=False):
    """
    Transforms 'time_stamp' column from df into individual components 'year',
    'month','day','weekday','hour','minute'
    """
    df = original_df.copy()

    if 'time_stamp' not in df.columns:
        raise ValueError("df has no column named 'time_stamp'")
    df['time_stamp'] = pd.to_datetime(df['time_stamp'], unit='s')

    df['year'] = df.time_stamp.dt.year
    df['month'] = df.time_stamp.dt.month
    df['day'] = df.time_stamp.dt.day
    df['weekday'] = df.time_stamp.dt.weekday
    df['hour'] = df.time_stamp.dt.hour
    df['minute'] = df.time_stamp.dt.minute

    if keep_timestamp is False:
        df = df.drop(columns='time_stamp')
    return df

def cyclize(original_df):
    """
    Transforms columns named 'month','day','hour','minute' into sin and cos
    cyclic values for use with machine learning models
    """
    df = original_df.copy()

    need_list = ['month','day','hour','minute']
    max_dict = {
        'month':12,
        'day': 31,
        'hour': 23,
        'minute': 59
    }

    for column in need_list:
        if column in df.columns:
            def sin_trans(number):
                return math.sin(number * (2. * math.pi / max_dict[column]))
            def cos_trans(number):
                return math.cos(number * (2. * math.pi / max_dict[column]))
            df['sin_' + column] = df[column].apply(sin_trans)
            df['cos_' + column] = df[column].apply(cos_trans)
            df = df.drop(columns=column, axis=1)

    return df

def encode_weekday(original_df, keep_weekday_column=False):
    """
    OneHotEncodes column from df column named 'weekday'
    """
    df = original_df.copy()

    enc = OneHotEncoder(handle_unknown='ignore')
    df_wkdy = pd.DataFrame(enc.fit_transform(df[['weekday']]).toarray())
    df = pd.concat([df.reset_index(), df_wkdy], axis=1)
    df = df.set_index('index')
    if keep_weekday_column==False:
        df = df.drop('weekday', axis=1)
    return df

def transform_timestamp(original_df):
    """
    Takes 'time_stamp' column from df and returns df preprocessed and
    ready for machine learning
    """
    df = original_df.copy()
    df = basic(df)
    df = cyclize(df)
    df = encode_weekday(df)
    if 'year' in df.columns:
        scaler = MinMaxScaler()
        df['year'] = scaler.fit_transform(df[['year']].copy())
    return df


In [3]:
df_base = df_all[['time_stamp','upvotes']]
df_base

Unnamed: 0,time_stamp,upvotes
0,1.668520e+09,1
1,1.668519e+09,16
2,1.668519e+09,31
3,1.668519e+09,13
4,1.668519e+09,1
...,...,...
99261,1.495833e+09,102
99262,1.495830e+09,12
99263,1.495830e+09,139
99264,1.495828e+09,30


In [4]:
df_base = transform_timestamp(df_base)
df_base

Unnamed: 0_level_0,upvotes,year,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_minute,cos_minute,0,1,2,3,4,5,6
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.940700,-0.339239,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,16,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.638244,-0.769834,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,31,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.461093,-0.887352,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,13,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,0.263103,-0.964768,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,0.364161,-0.931336,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99261,102,0.0,0.5,-0.866025,-0.848644,0.528964,-0.519584,0.854419,0.678312,0.734774,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99262,12,0.0,0.5,-0.866025,-0.848644,0.528964,-0.730836,0.682553,0.638244,-0.769834,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99263,139,0.0,0.5,-0.866025,-0.848644,0.528964,-0.730836,0.682553,0.999646,-0.026621,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99264,30,0.0,0.5,-0.866025,-0.848644,0.528964,-0.887885,0.460065,-0.899312,-0.437307,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
df_base = binary_cat_upvotes(df_base)
df_base

Unnamed: 0_level_0,upvotes,year,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_minute,cos_minute,0,1,2,3,4,5,6,cat_upvotes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.940700,-0.339239,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,16,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.638244,-0.769834,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,31,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.461093,-0.887352,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,13,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,0.263103,-0.964768,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,1,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,0.364161,-0.931336,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99261,102,0.0,0.5,-0.866025,-0.848644,0.528964,-0.519584,0.854419,0.678312,0.734774,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
99262,12,0.0,0.5,-0.866025,-0.848644,0.528964,-0.730836,0.682553,0.638244,-0.769834,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
99263,139,0.0,0.5,-0.866025,-0.848644,0.528964,-0.730836,0.682553,0.999646,-0.026621,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
99264,30,0.0,0.5,-0.866025,-0.848644,0.528964,-0.887885,0.460065,-0.899312,-0.437307,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1


In [6]:
df_base = df_base.drop('upvotes',axis=1)
df_base

Unnamed: 0_level_0,year,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_minute,cos_minute,0,1,2,3,4,5,6,cat_upvotes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.940700,-0.339239,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.638244,-0.769834,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.461093,-0.887352,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,0.263103,-0.964768,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,0.364161,-0.931336,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99261,0.0,0.5,-0.866025,-0.848644,0.528964,-0.519584,0.854419,0.678312,0.734774,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
99262,0.0,0.5,-0.866025,-0.848644,0.528964,-0.730836,0.682553,0.638244,-0.769834,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
99263,0.0,0.5,-0.866025,-0.848644,0.528964,-0.730836,0.682553,0.999646,-0.026621,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
99264,0.0,0.5,-0.866025,-0.848644,0.528964,-0.887885,0.460065,-0.899312,-0.437307,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1


In [7]:
X = df_base.drop('cat_upvotes',axis=1)
y = df_base['cat_upvotes']

In [8]:
y.head()

index
0    0
1    0
2    1
3    0
4    0
Name: cat_upvotes, dtype: int64

In [9]:
X.columns

Index([      'year',  'sin_month',  'cos_month',    'sin_day',    'cos_day',
         'sin_hour',   'cos_hour', 'sin_minute', 'cos_minute',            0,
                  1,            2,            3,            4,            5,
                  6],
      dtype='object')

In [10]:
X = X.rename(columns={0:'0',1:'1',2:'2',3:'3',4:'4',5:'5',6:'6'})

In [13]:
above_threshold = df_base['cat_upvotes'].sum()

In [14]:
total = df_base['cat_upvotes'].count()

In [15]:
(total - above_threshold) / total #baseline score

0.7432252735075454

In [18]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression().fit(X, y)
clf.score(X, y)

0.009033480697797636

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

knn_pipe = Pipeline([('knn', KNeighborsClassifier())])

for k in range(3,10):
    params = [{'knn__n_neighbors': [k],
                 'knn__weights': ['distance'],
                 'knn__leaf_size': [10]}]

    gs_knn = GridSearchCV(knn_pipe,
                              param_grid=params,
                              scoring='accuracy',
                              cv=2)

    gs_knn.fit(X_train, y_train)
    print(f"k = {k}, Score: {round(gs_knn.score(X_train, y_train),3)}, {round(gs_knn.score(X_test, y_test),3)}")

k = 3, Score: 0.992, 0.673
k = 4, Score: 0.992, 0.679
k = 5, Score: 0.992, 0.692
k = 6, Score: 0.992, 0.697
k = 7, Score: 0.992, 0.702
k = 8, Score: 0.992, 0.705
k = 9, Score: 0.992, 0.708


In [87]:
gs_knn.predict(X_train).sum()

3770

In [46]:
X_train

Unnamed: 0_level_0,year,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_minute,cos_minute,0,1,2,3,4,5,6
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
41002,0.6,-8.660254e-01,-0.500000,0.988468,0.151428,2.697968e-01,0.962917,0.921312,0.388824,0.0,0.0,0.0,0.0,1.0,0.0,0.0
37450,0.6,-8.660254e-01,0.500000,-0.299363,-0.954139,-9.790841e-01,0.203456,0.818303,0.574787,0.0,0.0,0.0,0.0,0.0,1.0,0.0
34515,0.6,-2.449294e-16,1.000000,0.299363,-0.954139,-9.790841e-01,0.203456,0.847734,-0.530421,1.0,0.0,0.0,0.0,0.0,0.0,0.0
35373,0.6,-5.000000e-01,0.866025,-0.571268,0.820763,-2.697968e-01,0.962917,-0.106293,0.994335,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10636,1.0,8.660254e-01,-0.500000,-0.485302,-0.874347,7.308360e-01,0.682553,0.678312,0.734774,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28296,0.8,8.660254e-01,-0.500000,-0.571268,0.820763,-2.449294e-16,1.000000,-0.507666,0.861554,0.0,0.0,1.0,0.0,0.0,0.0,0.0
91191,0.0,-5.000000e-01,0.866025,0.571268,0.820763,-5.195840e-01,0.854419,-0.053222,-0.998583,0.0,0.0,0.0,0.0,1.0,0.0,0.0
74249,0.2,-8.660254e-01,0.500000,0.790776,-0.612106,-9.976688e-01,-0.068242,-0.263103,-0.964768,0.0,0.0,0.0,1.0,0.0,0.0,0.0
24352,0.8,-5.000000e-01,-0.866025,-0.299363,-0.954139,2.697968e-01,0.962917,0.982684,0.185289,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [54]:
gs_knn.best_params_

{'knn__leaf_size': 12, 'knn__n_neighbors': 6, 'knn__weights': 'distance'}