In [4]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import time
import math

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

data_path = '../raw_data/100k_data.csv'
df_all = pd.read_csv(data_path)
df_all = df_all.dropna()

def binary_cat_upvotes(original_df, threshold=30):
    """
    Takes column from df called 'upvotes' and returns df with new column
    'cat_upvotes' which is 1 if upvotes is above threshold, and 0 otherwise.
    """
    df = original_df.copy()
    if 'upvotes' not in original_df.columns:
        raise ValueError("df has no column named 'upvotes'")
    def trans(number):
        if number >= threshold:
            return 1
        else:
            return 0
    df['cat_upvotes'] = df['upvotes'].apply(trans)
    return df

def multi_cat_upvotes(original_df, int_list=[10,100,1000]):
    """
    Takes column from df and returns df with new
    column 'cat_upvotes' based on list passed as an argument
    """
    df = original_df.copy()
    def trans(number):
        for index, integer in enumerate(int_list):
            if number < integer:
                return index
        return len(int_list)
    df['cat_upvotes'] = df['upvotes'].apply(trans)
    return df

def basic(original_df,keep_timestamp=False):
    """
    Transforms 'time_stamp' column from df into individual components 'year',
    'month','day','weekday','hour','minute'
    """
    df = original_df.copy()

    if 'time_stamp' not in df.columns:
        raise ValueError("df has no column named 'time_stamp'")
    df['time_stamp'] = pd.to_datetime(df['time_stamp'], unit='s')

    df['year'] = df.time_stamp.dt.year
    df['month'] = df.time_stamp.dt.month
    df['day'] = df.time_stamp.dt.day
    df['weekday'] = df.time_stamp.dt.weekday
    df['hour'] = df.time_stamp.dt.hour
    df['minute'] = df.time_stamp.dt.minute

    if keep_timestamp is False:
        df = df.drop(columns='time_stamp')
    return df

def cyclize(original_df):
    """
    Transforms columns named 'month','day','hour','minute' into sin and cos
    cyclic values for use with machine learning models
    """
    df = original_df.copy()

    need_list = ['month','day','hour','minute']
    max_dict = {
        'month':12,
        'day': 31,
        'hour': 23,
        'minute': 59
    }

    for column in need_list:
        if column in df.columns:
            def sin_trans(number):
                return math.sin(number * (2. * math.pi / max_dict[column]))
            def cos_trans(number):
                return math.cos(number * (2. * math.pi / max_dict[column]))
            df['sin_' + column] = df[column].apply(sin_trans)
            df['cos_' + column] = df[column].apply(cos_trans)
            df = df.drop(columns=column, axis=1)

    return df

def encode_weekday(original_df, keep_weekday_column=False):
    """
    OneHotEncodes column from df column named 'weekday'
    """
    df = original_df.copy()

    enc = OneHotEncoder(handle_unknown='ignore')
    df_wkdy = pd.DataFrame(enc.fit_transform(df[['weekday']]).toarray())
    df = pd.concat([df.reset_index(), df_wkdy], axis=1)
    df = df.set_index('index')
    if keep_weekday_column==False:
        df = df.drop('weekday', axis=1)
    return df

def transform_timestamp(original_df):
    """
    Takes 'time_stamp' column from df and returns df preprocessed and
    ready for machine learning
    """
    df = original_df.copy()
    df = basic(df)
    df = cyclize(df)
    df = encode_weekday(df)
    if 'year' in df.columns:
        scaler = MinMaxScaler()
        df['year'] = scaler.fit_transform(df[['year']].copy())
    return df

df = df_all[['time_stamp','upvotes']]
df = transform_timestamp(df)
df = binary_cat_upvotes(df, threshold=30)
df = df.drop(columns='upvotes')

df

Unnamed: 0_level_0,year,sin_month,cos_month,sin_day,cos_day,sin_hour,cos_hour,sin_minute,cos_minute,0,1,2,3,4,5,6,cat_upvotes
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.940700,-0.339239,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.638244,-0.769834,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,-0.461093,-0.887352,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
3,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,0.263103,-0.964768,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,1.0,-0.5,0.866025,0.101168,-0.994869,-0.398401,-0.917211,0.364161,-0.931336,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99260,0.0,0.5,-0.866025,-0.848644,0.528964,-0.519584,0.854419,0.874763,0.484551,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
99262,0.0,0.5,-0.866025,-0.848644,0.528964,-0.730836,0.682553,0.638244,-0.769834,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
99263,0.0,0.5,-0.866025,-0.848644,0.528964,-0.730836,0.682553,0.999646,-0.026621,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
99264,0.0,0.5,-0.866025,-0.848644,0.528964,-0.887885,0.460065,-0.899312,-0.437307,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1


In [5]:
X = df.drop(columns='cat_upvotes')
y = df.cat_upvotes

In [7]:
filename = 'timestamp_model.pickle'

In [9]:
import pickle
loaded_model = pickle.load(open(filename, 'rb'))

Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2022-11-25 10:09:38         5218
metadata.json                                  2022-11-25 10:09:38           64
variables.h5                                   2022-11-25 10:09:38       218992


ValueError: All `axis` values to be kept must have known shape. Got axis: (-1,), input shape: [None, None], with unknown axis at index: 1

In [None]:
result = loaded_model.score(X_test, Y_test)
print(result)