In [1]:
import datetime
import numpy as np
import pandas as pd
import math

# Prediction periods, in days. They are used further down both as
# datetime.timedelta(days=...) offsets and as selectors for the matching
# 'Pre<N>Month...' / 'next<N>Month...' columns.
NEXT_ONE_MONTH = 30
NEXT_TWO_MONTH = 60
NEXT_THREE_MONTH = 90
NEXT_SIX_MONTH = 180

# Number of in-range class labels (1..NUM_LABELS). Labels 0 and NUM_LABELS+1
# are reserved for values below / above the classification range.
NUM_LABELS = 10

# Data frame we need to process (presumably the output of the previous step — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
dataset = pd.read_pickle("Step1_RawY_variable_and_processing.pkl")

In [2]:
# Params:
# val: is the value that should be classified
# num_labels: is the number of labels we will use for classification
# min_val: is the minimal value of the range
# max_val: is the maximal value of the range

# Example:
# num_labels = 5
# min_val = 0
# max_val = 1
# Then the ranges will be [0, 0.2, 0.4, 0.6, 0.8, 1.0]
#       labels are       0   1    2   3    4    5     6  where label 1 is for range [0, 0.2)

# Only labels 1 to 5 are valid (in range).
# If the returned label is < 1 or > num_labels, the given val is outside the classifying range.

def label(val, num_labels, min_val, max_val):
    """Classify ``val`` into one of ``num_labels`` equal-width buckets.

    The range [min_val, max_val) is split into ``num_labels`` half-open
    buckets numbered 1..num_labels; bucket ``k`` covers
    [min_val + (k-1)*width, min_val + k*width).

    Params:
        val: the value to classify.
        num_labels: number of in-range buckets.
        min_val: lower edge of the range (inclusive).
        max_val: upper edge of the range (exclusive).

    Returns:
        0 if ``val`` is below ``min_val``; ``num_labels + 1`` if ``val`` is
        at or above ``max_val`` (also returned for NaN, because every
        ``<`` comparison against NaN is False); otherwise the bucket
        number in 1..num_labels.
    """
    span = max_val - min_val

    for i in range(num_labels + 1):
        if i == num_labels:
            # Pin the top edge to max_val itself so rounding can never move it.
            edge = max_val
        else:
            # Compute each edge directly from its index. Repeatedly adding the
            # bucket width accumulates rounding error (0.2+0.2+0.2 gives
            # 0.6000000000000001), which pushed values sitting exactly on an
            # edge into the bucket below.
            edge = min_val + span * i / num_labels
        if val < edge:
            return i

    return num_labels + 1
 
# test: one value below, one inside and one above the range [0, 1) with 5 labels;
# expected output is 0 (below range), 1 (first bucket) and 6 (above range)
print(label(-0.1, 5, 0, 1))
print(label(0.1, 5, 0, 1))
print(label(1.1, 5, 0, 1))
    

0
1
6


In [3]:
# Params:
# label: the label to decode, normally a number from 1 to num_labels.
#        label = 0 means the value was below the base range; label = num_labels+1 means it was above the base range.
# num_labels: the number of in-range labels in use
# base_val_range: a tuple representing the value range, where the first number is the min and the second is the max.

# Returns:
# the exact value range that the given label represents, as a tuple (lower bound, upper bound).
def deLabel(label, num_labels, base_val_range):
    """Translate a class label back into the value range it stands for.

    Params:
        label: 1..num_labels for an in-range bucket, 0 for "below the base
            range", num_labels + 1 for "above the base range".
        num_labels: number of in-range buckets the base range is split into.
        base_val_range: indexable pair (min, max) describing the base range.

    Returns:
        A (lower, upper) tuple. Out-of-range labels are bounded by the
        fixed sentinels -1 (below) and 1 (above).
    """
    if label == 0:
        # Everything below the base range: from the -1 sentinel up to the minimum.
        return (-1, base_val_range[0])
    elif label == num_labels + 1:
        # Everything above the base range: from the maximum up to the 1 sentinel.
        return (base_val_range[1], 1)
    else:
        range_max, range_min = base_val_range[1], base_val_range[0]
        bucket_width = (range_max - range_min) / num_labels
        bucket_start = range_min + (label - 1) * bucket_width
        bucket_end = range_min + label * bucket_width
        return (bucket_start, bucket_end)

# Test: with 5 labels over the base range (0, 0.1), label 3 decodes to the third bucket
print(deLabel(3, 5, (0,0.1)))
    

(0.04, 0.06)


In [5]:
# Classifying Y into classes

# How we define the classifier:
# If we predict for the next month, the output (Y) for the training data is classified according to the price
# range of the previous one month.

def labelY(dataset, predict_period):
    """Add the class-label column for ``predict_period`` to ``dataset`` in place.

    Each row's 'Daily_Return' is classified with ``label`` into NUM_LABELS
    buckets spanning that row's own 'Pre<N>MonthDailyPriceRange' value
    (indexed as [0] = min, [1] = max). Rows whose return is NaN or whose
    range is null get a NaN label.

    Params:
        dataset: DataFrame holding 'Daily_Return' and the
            'Pre<N>MonthDailyPriceRange' column for the chosen period.
            A 'next<N>Month_DailyRet_Label' column is added or overwritten.
        predict_period: one of NEXT_ONE_MONTH, NEXT_TWO_MONTH,
            NEXT_THREE_MONTH, NEXT_SIX_MONTH.

    Returns:
        None; ``dataset`` is modified in place.

    Raises:
        ValueError: if ``predict_period`` is not one of the supported periods.
    """
    # (range column to classify against, label column to write) per period.
    # NOTE: an alternative target, 'next<N>MonthAvgDailyRet' with label column
    # 'next<N>MonthAvgDailyRet_Label', was considered earlier and is not used.
    columns_by_period = {
        NEXT_ONE_MONTH: ('Pre1MonthDailyPriceRange', 'next1Month_DailyRet_Label'),
        NEXT_TWO_MONTH: ('Pre2MonthDailyPriceRange', 'next2Month_DailyRet_Label'),
        NEXT_THREE_MONTH: ('Pre3MonthDailyPriceRange', 'next3Month_DailyRet_Label'),
        NEXT_SIX_MONTH: ('Pre6MonthDailyPriceRange', 'next6Month_DailyRet_Label'),
    }
    if predict_period not in columns_by_period:
        # Previously this fell through the if/elif chain and failed later with
        # an UnboundLocalError on range_name.
        raise ValueError("unsupported predict_period: {!r}".format(predict_period))
    range_name, label_name = columns_by_period[predict_period]
    val_name = 'Daily_Return'

    # Start from all-NaN so rows with missing inputs need no explicit assignment.
    labels = np.full(len(dataset), np.nan)
    for i, (val, val_range) in enumerate(zip(dataset[val_name], dataset[range_name])):
        if math.isnan(val) or pd.isnull(val_range):
            continue
        labels[i] = label(val, NUM_LABELS, val_range[0], val_range[1])

    dataset[label_name] = labels

    
# Labeling Y outputs for all dates and for every prediction period (each labelY call
# adds one label column to `dataset`), then store the result in 'Step2_Y_variable.pkl'
# and, as a second copy, in 'Step2_Y_variable.csv'
labelY(dataset, NEXT_ONE_MONTH)
labelY(dataset, NEXT_TWO_MONTH)
labelY(dataset, NEXT_THREE_MONTH)
labelY(dataset, NEXT_SIX_MONTH)
dataset.to_pickle('./Step2_Y_variable.pkl')
dataset.to_csv('./Step2_Y_variable.csv')

In [6]:
# Params:
# start_date: String, is the start date (included) of the prediction period
# predict_period: Int, is the time range we want to predict. 
#                 e.g. next month, next 3 months, next 6 months, etc.
# time_range_len: Int, is the time range of the training data, e.g. 365 stands for 1 year.
# dataset: DataFrame, is the cleaned-up data frame, i.e. "Step2_Y_variable.pkl"

# Returns:
# training_Ys: DataFrame, training data with Y marked with classifier for the given 'time_range_len', 
# i.e each training output is classified according to the previous 'predict_period' output range.

# Example:
# start_date = 2018-10-01
# predict_period = 30 (next month)
# time_range_len = 365 (1 year)
# data set = cleaned data frame
# Then the time period we want to predict is : 2018-10-01 to 2018-10-30,
# and the training data should be in range: 2017-09-01 to 2018-09-01
# Note: we can't train on 2018-09-02 to 2018-09-30, because their next-month classification needs data from
# the dates in our prediction period, which is clearly not available for training.
# 


def getTrainingYs(start_date, predict_period, time_range_len, dataset):
    """Return the training label column for the window preceding a prediction.

    The training window ends ``predict_period`` days before ``start_date``
    and spans ``time_range_len`` days; both ends are inclusive.

    Params:
        start_date: str, "%Y-%m-%d", first day of the prediction period.
        predict_period: one of NEXT_ONE_MONTH, NEXT_TWO_MONTH,
            NEXT_THREE_MONTH, NEXT_SIX_MONTH (a number of days).
        time_range_len: int, length of the training window in days.
        dataset: DataFrame whose index is compared against datetimes and
            which holds the 'next<N>Month_DailyRet_Label' columns.

    Returns:
        A single-column DataFrame with the label column for
        ``predict_period``, restricted to the training window.

    Raises:
        ValueError: if ``predict_period`` is not one of the supported periods,
            or if ``start_date`` does not match "%Y-%m-%d".
    """
    label_by_period = {
        NEXT_ONE_MONTH: 'next1Month_DailyRet_Label',
        NEXT_TWO_MONTH: 'next2Month_DailyRet_Label',
        NEXT_THREE_MONTH: 'next3Month_DailyRet_Label',
        NEXT_SIX_MONTH: 'next6Month_DailyRet_Label',
    }
    if predict_period not in label_by_period:
        # Fail early with a clear message; previously an unsupported period
        # left label_name unbound and raised UnboundLocalError after filtering.
        raise ValueError("unsupported predict_period: {!r}".format(predict_period))
    label_name = label_by_period[predict_period]

    startdate = datetime.datetime.strptime(start_date, "%Y-%m-%d")

    # compute training data time range
    training_end_date = startdate - datetime.timedelta(days=predict_period)
    training_start_date = training_end_date - datetime.timedelta(days=time_range_len)

    in_window = (dataset.index <= training_end_date) & (dataset.index >= training_start_date)
    # Double brackets keep the result a one-column DataFrame rather than a Series.
    training_Ys = dataset.loc[in_window, [label_name]]
    return training_Ys

# Test: training labels for a two-month prediction starting 2018-05-01 with a
# 365-day training window, read from the 'Step2_Y_variable.pkl' file
sourcedata = pd.read_pickle('Step2_Y_variable.pkl')
training_data = getTrainingYs("2018-05-01", NEXT_TWO_MONTH, 365, sourcedata)
print(training_data)


            next2Month_DailyRet_Label
Date                                 
2017-03-02                        3.0
2017-03-03                        8.0
2017-03-06                        6.0
2017-03-07                        5.0
2017-03-08                        0.0
2017-03-09                        5.0
2017-03-10                        5.0
2017-03-13                        7.0
2017-03-14                        6.0
2017-03-15                       11.0
2017-03-16                        7.0
2017-03-17                        8.0
2017-03-20                        6.0
2017-03-21                        5.0
2017-03-22                        8.0
2017-03-23                        7.0
2017-03-24                        8.0
2017-03-27                        7.0
2017-03-28                       11.0
2017-03-29                       10.0
2017-03-30                        9.0
2017-03-31                        8.0
2017-04-03                        6.0
2017-04-04                        9.0
2017-04-05  

In [7]:
# Get the exact value range for the Y value of the given date

# Params:
# date: is a DateTime
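# y: is the label (class number) for the given date; it is decoded back into a value range with deLabel
# predict_period: is the prediction period (e.g. NEXT_ONE_MONTH) that selects which price-range column is used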
# dataset: is the dataframe storing the labeling info

# Returns:
# rangeForY: a tuple representing the exact value range, [min, max)

def getDailyRetRange(date, y, predict_period, dataset):
    """Decode label ``y`` into the value range it represents on ``date``.

    Looks up the 'Pre<N>MonthDailyPriceRange' entry of the first row whose
    index equals ``date`` and passes it to ``deLabel`` as the base range.

    Params:
        date: value compared against ``dataset.index`` (a datetime in the
            notebook's own usage).
        y: the label to decode (0..NUM_LABELS+1).
        predict_period: one of NEXT_ONE_MONTH, NEXT_TWO_MONTH,
            NEXT_THREE_MONTH, NEXT_SIX_MONTH.
        dataset: DataFrame holding the 'Pre<N>MonthDailyPriceRange' columns.

    Returns:
        rangeForY: the (lower, upper) tuple produced by ``deLabel``.

    Raises:
        ValueError: if ``predict_period`` is not one of the supported periods.
        IndexError: if ``dataset`` has no row for ``date``.
    """
    range_by_period = {
        NEXT_ONE_MONTH: 'Pre1MonthDailyPriceRange',
        NEXT_TWO_MONTH: 'Pre2MonthDailyPriceRange',
        NEXT_THREE_MONTH: 'Pre3MonthDailyPriceRange',
        NEXT_SIX_MONTH: 'Pre6MonthDailyPriceRange',
    }
    if predict_period not in range_by_period:
        # Previously an unsupported period left range_name unbound and
        # surfaced as an UnboundLocalError.
        raise ValueError("unsupported predict_period: {!r}".format(predict_period))
    range_name = range_by_period[predict_period]

    matches = dataset.loc[dataset.index == date, range_name]
    if len(matches) == 0:
        # Same exception type as the former `.values[0]` failure, but with a
        # message that says which date was missing.
        raise IndexError("no row in dataset for date {}".format(date))

    rangeForY = deLabel(y, NUM_LABELS, matches.values[0])
    return rangeForY
    
# test: decode label 1 for the one-month prediction period on 2018-05-01
datestr = "2018-05-01"
date = datetime.datetime.strptime(datestr, "%Y-%m-%d")

# NOTE: this rebinds the notebook-level `dataset` to the frame stored in 'Step2_Y_variable.pkl'
dataset = pd.read_pickle('Step2_Y_variable.pkl')

# Last expression of the cell, so the returned tuple is shown as the cell output
getDailyRetRange(date, 1, NEXT_ONE_MONTH, dataset)
    

(-0.028056112224449037, -0.021969743904212323)