In [7]:
import numpy as np
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta, TH
from sklearn import preprocessing, cross_validation, svm,linear_model
from sklearn.preprocessing import MinMaxScaler

# Disabled alternative: load a different extract directly into `data`
# (presumably a 5-year subset, judging by the file name -- TODO confirm).
#data = pd.read_csv('Input/tv18_5years.csv', header=0,index_col=False)

# Relative path of the CSV that predict_value_for() hands to define_input().
filepath = 'Input/tv18_all.csv'


def define_input(filepath):
    """Load the price CSV and return only the columns used downstream.

    Parameters
    ----------
    filepath : str or file-like
        Anything ``pd.read_csv`` accepts; the first row is the header.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date, Open, High, Low, Close, Volume`` (in that order) with
        every missing value replaced by the sentinel ``-99999``.
    """
    wanted_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']
    raw_frame = pd.read_csv(filepath, header=0, index_col=False)
    # Select first, then fill: the returned frame is a fresh object and the
    # frame produced by read_csv is never modified.
    return raw_frame[wanted_columns].fillna(value=-99999)

def calc_last_thursday(year_mon):
    """Return the last Thursday of a month as a ``'YYYY-MM-DD'`` string.

    Parameters
    ----------
    year_mon : str
        Month in ``'YYYY-MM'`` form; ``'-01'`` is appended and the result is
        parsed with ``pd.to_datetime``.

    Returns
    -------
    str
        Date of the last Thursday that falls inside that month.

    Notes
    -----
    The date is computed directly from the month's final day instead of
    probing the 1st..5th Thursday and stepping back one week after
    overshooting, so four- and five-Thursday months share one code path.
    """
    thursday = 3  # Timestamp.weekday(): Monday == 0 ... Sunday == 6

    month_start = pd.to_datetime(year_mon + '-01')
    month_end = month_start.replace(day=month_start.days_in_month)

    # Days to walk back from the month's last day to reach a Thursday
    # (0 when the month already ends on one).
    days_back = (month_end.weekday() - thursday) % 7
    last_thursday = month_end - pd.Timedelta(days=days_back)
    return last_thursday.strftime('%Y-%m-%d')


def build_split_data(data,input_field):
    """Build the supervised-learning matrix for one target price field.

    Each output row pairs the target value of one row of ``data`` with the
    other price fields and the volume of the row immediately before it, plus
    a 0/1 flag telling whether the row's date is the last Thursday of its
    month (as computed by ``calc_last_thursday``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with columns ``Date, Open, High, Low, Close, Volume``. ``Date``
        must hold strings whose first seven characters are ``YYYY-MM``. Rows
        are paired by position (the frame is expected to carry the default
        0..n-1 index that ``define_input`` produces).
    input_field : str
        Target column: ``'Close'``, ``'High'``, ``'Low'`` or ``'Open'``.

    Returns
    -------
    numpy.ndarray
        Shape ``(rows, 6)``. Column 0 is the target, columns 1-4 are the
        previous row's remaining price fields and ``Volume``, column 5 is the
        last-Thursday indicator.

    Raises
    ------
    ValueError
        If ``input_field`` is not one of the four supported columns.
    """
    # Previous-row feature columns, in output order, for every supported target.
    feature_columns = {
        'Close': ['Open', 'High', 'Low', 'Volume'],
        'High': ['Open', 'Close', 'Low', 'Volume'],
        'Low': ['Open', 'Close', 'High', 'Volume'],
        'Open': ['Close', 'High', 'Low', 'Volume'],
    }
    if input_field not in feature_columns:
        # Fail with a clear message instead of an unbound-local error later on.
        raise ValueError(
            "input_field must be one of %s, got %r"
            % (sorted(feature_columns), input_field)
        )
    features = feature_columns[input_field]

    # The first row has no predecessor, so samples start at position 1.
    sample_dates = data['Date'].iloc[1:]
    months = sample_dates.str[0:7].drop_duplicates()
    expiry_dates = [calc_last_thursday(month) for month in months]

    target = data[input_field].iloc[1:].values
    previous_row = data[features].iloc[:-1].values
    expiry_flag = sample_dates.astype(str).isin(expiry_dates).values.astype(int)

    # Keep a sample only when the previous row's target field is neither the
    # -99999 sentinel nor the literal "null". The two tests must be AND-ed:
    # joined with OR the condition is always true and nothing is filtered.
    previous_target = data[input_field].iloc[:-1]
    keep = ((previous_target != -99999) & (previous_target != "null")).values

    split_data = np.column_stack([target, previous_row, expiry_flag])
    return split_data[keep]

def cross_validate(split_data):
    """Split a 2-D array into leading 90% training rows and trailing test rows.

    Despite the name, no folds are built: the first ``floor(0.9 * n)`` rows
    become the training set and the remaining rows the test set, in their
    original order. Column 0 is the target; the other columns are features.

    Parameters
    ----------
    split_data : numpy.ndarray
        Two-dimensional array with the target in column 0.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    total_rows = len(split_data)
    cut = int(np.floor(0.9*total_rows))

    # Copy both partitions so the returned arrays never alias the input.
    train_rows = split_data[:cut].copy()
    test_rows = split_data[cut:total_rows].copy()

    return train_rows[:, 1:], train_rows[:, 0], test_rows[:, 1:], test_rows[:, 0]

def run_classifier(X_train,y_train,X_test,y_test):
    """Fit a fixed list of scikit-learn regressors and print test metrics.

    For every model this prints, in order: the model's repr, the rounded
    prediction and true value for the last test sample, the mean squared
    error over the test set, and the value of ``model.score`` on the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test, y_test
        Test features and targets used for ``predict`` and ``score``.

    Returns
    -------
    None
        Results are only printed.
    """
    models = (
        svm.SVR(),
        linear_model.LinearRegression(),
        linear_model.BayesianRidge(),
        linear_model.ARDRegression(),
        linear_model.TheilSenRegressor(),
    )

    # Only the final test sample is echoed next to its prediction.
    last = len(y_test) - 1

    for model in models:
        print("running for: %s" % model)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        print(np.round(predicted[last], 2), np.round(y_test[last], 2))
        squared_errors = (predicted - y_test) ** 2
        print("Mean squared error: %.2f" % np.mean(squared_errors))
        print('Variance score: %.2f' % model.score(X_test, y_test))

# Training and test data preparation
#df3=df1.join(df2.set_index('Date'), on='Date')

def predict_value_for(input_field):
    """Run the whole load -> split -> scale -> fit pipeline for one target.

    Reads the CSV at the module-level ``filepath``, builds the feature matrix
    for ``input_field``, splits it into train and test parts, min-max scales
    the features using statistics from the training part only, and hands
    everything to ``run_classifier``, which prints the results.

    Parameters
    ----------
    input_field : str
        Target column name forwarded to ``build_split_data``.
    """
    raw_frame = define_input(filepath)
    print('Running For Input Field:  ', input_field)

    matrix = build_split_data(raw_frame, input_field)
    X_train, y_train, X_test, y_test = cross_validate(matrix)

    # Fit the scaler on training rows only, then apply it to both partitions.
    scaler = MinMaxScaler().fit(X_train)
    run_classifier(
        scaler.transform(X_train),
        y_train,
        scaler.transform(X_test),
        y_test,
    )
    
# Entry point: run the full pipeline once per target price column.
# Each call re-reads the CSV at `filepath` through define_input().
if __name__ =='__main__' :
    values = ['Close','High','Low','Open']
    for v in values:        
        predict_value_for(v)      

Running For Input Field:   Close
running for: SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
46.57 48.25
Mean squared error: 3.36
Variance score: 0.96
running for: LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
48.1 48.25
Mean squared error: 2.24
Variance score: 0.98
running for: BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)
48.1 48.25
Mean squared error: 2.24
Variance score: 0.98
running for: ARDRegression(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, threshold_lambda=10000.0, tol=0.001, verbose=False)
48.08 48.25
Mean squared error: 2.22
Variance score: 0.98
running for: TheilSenRegressor(copy_X=

In [9]:
# Reload the input frame and keep only the Date column from index label 1
# onward -- the same slice build_split_data() starts from.
data = define_input(filepath)
y_label = data[['Date']].loc[1:]

In [10]:
# Show the first rows of the Date slice as plain text.
print(y_label.head())

         Date
1  2006-12-29
2  2007-01-02
3  2007-01-03
4  2007-01-04
5  2007-01-05
