In [80]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [81]:
# Load the labelled training set and the unlabelled test set from the
# local `dataset/` directory (paths are relative to the notebook).
train_data = pd.read_csv(filepath_or_buffer='dataset/train.csv')
test_data = pd.read_csv(filepath_or_buffer='dataset/test.csv')

In [82]:
# (rows, columns) of the training frame as loaded from disk.
train_data.shape

(11997, 69)

In [83]:
# (rows, columns) of the test frame; one column fewer than train (no 'Price').
test_data.shape

(4161, 68)

In [84]:
# Column labels of the training frame. Note the trailing space in 'Company '
# and that the target column is 'Price'.
train_data.columns

Index(['ID', 'Date', 'Company ', 'SMA', 'EMA', 'WMA', 'DEMA', 'TEMA', 'TRIMA',
       'KAMA', 'FAMA', 'MAMA', 'T3', 'MACD', 'MACD_Hist', 'MACD_Signal', 'MAC',
       'MAC_Hist', 'MAC_Signal', 'SlowD', 'SlowK', 'FastD', 'FastK', 'RSI',
       'FatD', 'FatK', 'WILLR', 'ADX', 'ADXR', 'APO', 'PPO', 'MOM', 'BOP',
       'CCI', 'CMO', 'ROC', 'ROCR', 'Aroon Down', 'Aroon Up', 'AROONOSC',
       'MFI', 'TRIX', 'ULTOSC', 'DX', 'MINUS_DI', 'PLUS_DI', 'MINUS_DM',
       'PLUS_DM', 'Real Lower Band', 'Real Middle Band', 'Real Upper Band',
       'MIDPOINT', 'MIDPRICE', 'SAR', 'TRANGE', 'ATR', 'NATR', 'Chaikin A/D',
       'ADOSC', 'OBV', 'HT_TRENDLINE', 'LEAD SINE', 'SINE', 'TRENDMODE',
       'DCPERIOD', 'HT_DCPHASE', 'PHASE', 'QUADRATURE', 'Price'],
      dtype='object')

In [85]:
# Column labels of the test frame. There is no 'Price' column, and the column
# that train calls 'MAC_Hist' appears here as 'MACDHist'.
test_data.columns

Index(['ID', 'Date', 'Company ', 'SMA', 'EMA', 'WMA', 'DEMA', 'TEMA', 'TRIMA',
       'KAMA', 'FAMA', 'MAMA', 'T3', 'MACD', 'MACD_Hist', 'MACD_Signal', 'MAC',
       'MACDHist', 'MAC_Signal', 'SlowD', 'SlowK', 'FastD', 'FastK', 'RSI',
       'FatD', 'FatK', 'WILLR', 'ADX', 'ADXR', 'APO', 'PPO', 'MOM', 'BOP',
       'CCI', 'CMO', 'ROC', 'ROCR', 'Aroon Down', 'Aroon Up', 'AROONOSC',
       'MFI', 'TRIX', 'ULTOSC', 'DX', 'MINUS_DI', 'PLUS_DI', 'MINUS_DM',
       'PLUS_DM', 'Real Lower Band', 'Real Middle Band', 'Real Upper Band',
       'MIDPOINT', 'MIDPRICE', 'SAR', 'TRANGE', 'ATR', 'NATR', 'Chaikin A/D',
       'ADOSC', 'OBV', 'HT_TRENDLINE', 'LEAD SINE', 'SINE', 'TRENDMODE',
       'DCPERIOD', 'HT_DCPHASE', 'PHASE', 'QUADRATURE'],
      dtype='object')

In [86]:
# Keep only the columns the model needs. Selecting them by name (instead of
# building a drop-list from train and hand-patching it for the
# 'MAC_Hist' / 'MACDHist' naming mismatch in test) makes this cell
# independent of the other columns and safe to re-run.
# NOTE: the company column label really does end with a space.
keep_train_cols = ['Company ', 'Real Lower Band', 'Price']
keep_test_cols = ['ID', 'Company ', 'Real Lower Band']

# `.copy()` gives each reduced frame its own data rather than a view of the
# wide frame that was loaded from disk.
train_data = train_data[keep_train_cols].copy()
test_data = test_data[keep_test_cols].copy()

In [87]:
# Preview the first rows of the reduced training frame
# (company, single feature, target).
train_data.head()

Unnamed: 0,Company,Real Lower Band,Price
0,ABC,,18.81
1,DEF,,57.5
2,GHI,,135.25
3,ABC,0.75,19.06
4,DEF,2.5,59.44


In [88]:
# Preview the first rows of the reduced test frame (ID, company, single feature).
test_data.head()

Unnamed: 0,ID,Company,Real Lower Band
0,12001,ABC,516.9314
1,12002,DEF,372.7517
2,12003,GHI,37.3269
3,12004,ABC,527.1947
4,12005,DEF,376.2984


In [89]:
def _rows_for_company(frame, company):
    """Return the rows of `frame` belonging to `company`, without the company column.

    `drop` is used in its non-inplace form so the result is a new DataFrame.
    Dropping in place on a `.loc` selection is what raises pandas'
    SettingWithCopyWarning.
    """
    # The company column label ends with a space in the source CSVs.
    return frame.loc[frame['Company '] == company].drop(columns='Company ')


# One train/test pair per company; each company gets its own model later on.
train_abc = _rows_for_company(train_data, 'ABC')
test_abc = _rows_for_company(test_data, 'ABC')

train_def = _rows_for_company(train_data, 'DEF')
test_def = _rows_for_company(test_data, 'DEF')

train_ghi = _rows_for_company(train_data, 'GHI')
test_ghi = _rows_for_company(test_data, 'GHI')

# The combined frames are not needed once they have been split.
# Re-running this cell therefore requires re-running the load cells first.
del train_data, test_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [90]:
def handle_nan(df):
    """Return a new frame in which every missing value of `df` is replaced by 0.

    The input frame is left unmodified.
    """
    return df.fillna(value=0)

# Zero-fill missing values in each per-company training frame.
# The test frames are not passed through `handle_nan`.
train_abc, train_def, train_ghi = (
    handle_nan(company_frame)
    for company_frame in (train_abc, train_def, train_ghi)
)

In [91]:
def build_model(df_tr, df_te, eval=False, *, feature_col='Real Lower Band',
                target_col='Price', test_size=0.2, random_state=42):
    """Fit a single-feature linear regression and either score it or predict.

    Parameters
    ----------
    df_tr : pandas.DataFrame
        Training rows; must contain `feature_col` and `target_col`.
    df_te : pandas.DataFrame
        Rows to predict for; must contain `feature_col`. Only read when
        `eval` is false.
    eval : bool, default False
        When true, hold out `test_size` of `df_tr`, fit on the remainder and
        return the hold-out mean absolute error. (The name shadows the
        builtin `eval` inside this function; it is kept because callers pass
        it by keyword.)
    feature_col, target_col : column labels, keyword-only
        Predictor and target columns. The defaults are the columns used in
        this notebook.
    test_size, random_state : keyword-only
        Forwarded to `train_test_split` in evaluation mode.

    Returns
    -------
    float or pandas.DataFrame
        In evaluation mode, the mean absolute error on the hold-out split.
        Otherwise a new frame: `df_te` without `feature_col` and with the
        predictions stored under `target_col`. Neither input frame is
        modified.
    """
    # Double brackets keep the feature two-dimensional, shape (n_samples, 1),
    # which is what the scikit-learn estimator expects.
    x = df_tr[[feature_col]].to_numpy()
    y = df_tr[target_col].to_numpy()

    if eval:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=random_state
        )
        model = LinearRegression().fit(x_train, y_train)
        return mean_absolute_error(y_test, model.predict(x_test))

    # Prediction mode: fit on every training row, then score the test rows.
    model = LinearRegression().fit(x, y)
    x_pred = df_te[[feature_col]].to_numpy()

    # `drop` returns a new frame, so the caller's test frame stays untouched.
    result = df_te.drop(columns=[feature_col])
    result[target_col] = model.predict(x_pred)
    return result

In [92]:
# Hold-out mean absolute error for each company's model.
error_abc, error_def, error_ghi = (
    build_model(train_part, test_part, eval=True)
    for train_part, test_part in (
        (train_abc, test_abc),
        (train_def, test_def),
        (train_ghi, test_ghi),
    )
)

# Unweighted average of the three per-company errors.
mean_error = (error_abc + error_def + error_ghi) / 3
print(mean_error)

5.533924928714817


In [93]:
# Refit each company's model on all of its training rows and predict
# prices for that company's test rows.
pred_abc, pred_def, pred_ghi = (
    build_model(train_part, test_part)
    for train_part, test_part in (
        (train_abc, test_abc),
        (train_def, test_def),
        (train_ghi, test_ghi),
    )
)

# Stack the three per-company prediction frames into one.
pred = pd.concat(objs=[pred_abc, pred_def, pred_ghi])

In [94]:
# Concatenation grouped the rows by company; order them by 'ID' instead.
pred = pred.sort_values('ID', ascending=True)

In [95]:
# Preview the first predictions after sorting by 'ID'.
pred.head()

Unnamed: 0,ID,Price
0,12001,540.938841
1,12002,393.67819
2,12003,39.600193
3,12004,551.628879
4,12005,397.389902


In [96]:
# Write the predictions to disk; the row index is omitted from the file.
pred.to_csv(path_or_buf='result.csv', index=False)