In [1]:
import numpy as np 
import pandas as pd
import os

from sklearn import preprocessing, cross_validation
from sklearn.linear_model import LinearRegression



### View the data

In [2]:
DATA_PATH = os.path.join('data', 'ShanghaiCompositeIndex.csv')
df_original = pd.read_csv(DATA_PATH, encoding='gbk')[:-1]
# reverse the data
df = df_original.iloc[::-1]
df.head()
# df.tail()

Unnamed: 0,日期,股票代码,名称,收盘价,最高价,最低价,开盘价,前收盘,涨跌额,涨跌幅,成交量,成交金额
6678,1990-12-20,'000001,上证指数,104.39,104.39,99.98,104.3,99.98,4.41,4.4109,197,84000.0
6677,1990-12-21,'000001,上证指数,109.13,109.13,103.73,109.07,104.39,4.74,4.5407,28,16000.0
6676,1990-12-24,'000001,上证指数,114.55,114.55,109.13,113.57,109.13,5.42,4.9666,32,31000.0
6675,1990-12-25,'000001,上证指数,120.25,120.25,114.55,120.09,114.55,5.7,4.976,15,6000.0
6674,1990-12-26,'000001,上证指数,125.27,125.27,120.25,125.27,120.25,5.02,4.1746,100,53000.0


## Data clean  
#### 1. Dropping some unnecessary feaures.
#### 2. There is a little number of datas equal to 'None', which locates in column '成交金额'.  We just set them as zero, since this number is very small.

In [3]:
df = df.drop(['日期'.decode('utf-8'), 
              '股票代码'.decode("utf-8"), 
              '名称'.decode("utf-8"), 
              '涨跌额'.decode("utf-8"), 
              '涨跌幅'.decode("utf-8"), 
              '前收盘'.decode('utf-8')],axis=1)
df = df.replace('None', 0)
df.head()

Unnamed: 0,收盘价,最高价,最低价,开盘价,成交量,成交金额
6678,104.39,104.39,99.98,104.3,197,84000.0
6677,109.13,109.13,103.73,109.07,28,16000.0
6676,114.55,114.55,109.13,113.57,32,31000.0
6675,120.25,120.25,114.55,120.09,15,6000.0
6674,125.27,125.27,120.25,125.27,100,53000.0


### 5 Day Moving Average
https://www.investopedia.com/articles/active-trading/052014/how-use-moving-average-buy-stocks.asp
##### But in there, we supposed each day has different weight, the price of today may has largest weight, if we forecast the price of tomorrow, intuitively. 

In [4]:
N = 5
df_new = df.copy()
for i in range(N):
    df_new = pd.concat([df_new, df.shift(i+1)], axis=1)
df = df_new[N:]
col = list(df.columns)
col[1] = '最高价1'.decode('utf-8')
col[2] = '最低价1'.decode('utf-8')
df.columns = col
df.head()

Unnamed: 0,收盘价,最高价1,最低价1,开盘价,成交量,成交金额,收盘价.1,最高价,最低价,开盘价.1,...,最低价.1,开盘价.2,成交量.1,成交金额.1,收盘价.2,最高价.1,最低价.2,开盘价.3,成交量.2,成交金额.2
6673,125.28,125.28,125.27,125.27,66,104000.0,125.27,125.27,120.25,125.27,...,103.73,109.07,28.0,16000.0,104.39,104.39,99.98,104.3,197.0,84000.0
6672,126.45,126.45,125.28,126.39,108,88000.0,125.28,125.28,125.27,125.27,...,109.13,113.57,32.0,31000.0,109.13,109.13,103.73,109.07,28.0,16000.0
6671,127.61,127.61,126.48,126.56,78,60000.0,126.45,126.45,125.28,126.39,...,114.55,120.09,15.0,6000.0,114.55,114.55,109.13,113.57,32.0,31000.0
6670,128.84,128.84,127.61,127.61,91,59000.0,127.61,127.61,126.48,126.56,...,120.25,125.27,100.0,53000.0,120.25,120.25,114.55,120.09,15.0,6000.0
6669,130.14,130.14,128.84,128.84,141,93000.0,128.84,128.84,127.61,127.61,...,125.27,125.27,66.0,104000.0,125.27,125.27,120.25,125.27,100.0,53000.0


### Because we want to forecast the low and high prices(indices) in a week, so set $$forecast\_out = 5$$

In [5]:
forecast_out = 5

In [6]:
def forecast(forecast_col, forecast_out):
    df['label'] = df[forecast_col].shift(-forecast_out)
    X = np.array(df.drop(['label'], axis=1)).astype(np.float32)
    # print X
    X = preprocessing.scale(X)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]
    y = np.array(df['label'])[:-5].astype(np.float32)
    print 'X shape and y shape: ', X.shape, y.shape
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
    print 'X_train shape and y_train shape: ', X_train.shape, y_train.shape
    print 'X_test shape and y_test shape: ', X_test.shape, y_test.shape
    clf = LinearRegression(n_jobs=-1)
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print 'confidence in test set: ', confidence
    print 'Forecasting the high price: \n'
    forecast_set = clf.predict(X_lately)
    print forecast_set
    y_groundtruth = np.asarray(df_original['最高价'.decode('utf-8')][:5][::-1]).astype(np.float32)
    print '\nThe ground truth is :\n\n', list(y_groundtruth)
    MSE = 0.2 * np.sqrt(np.sum((np.asarray(forecast_set) - y_groundtruth)**2))
    print '\nThe average of gap for each day is: ', MSE
    return forecast_set

# Forecasting the high price

In [7]:
forecast_col_high = '最高价1'.decode('utf-8')
forecast_high = forecast(forecast_col_high, forecast_out)

X shape and y shape:  (6669, 36) (6669,)
X_train shape and y_train shape:  (5335, 36) (5335,)
X_test shape and y_test shape:  (1334, 36) (1334,)
confidence in test set:  0.9922056232584354
Forecasting the high price: 

[3154.1753 3224.3228 3234.1099 3202.524  3183.0044]

The ground truth is :

[3146.0925, 3190.6492, 3220.8452, 3205.2522, 3197.896]

The average of gap for each day is:  8.010980987548828


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Forecasting the low price

In [8]:
forecast_col_low = '最低价1'.decode('utf-8')
forecast_low = forecast(forecast_col_low, forecast_out)

X shape and y shape:  (6669, 36) (6669,)
X_train shape and y_train shape:  (5335, 36) (5335,)
X_test shape and y_test shape:  (1334, 36) (1334,)
confidence in test set:  0.9924429891432935
Forecasting the high price: 

[3111.5835 3160.5635 3183.5469 3150.9846 3130.7825]

The ground truth is :

[3146.0925, 3190.6492, 3220.8452, 3205.2522, 3197.896]

The average of gap for each day is:  20.915432739257813


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Checking whether high prices are larger than low prices

In [9]:
print 'forecast_high - forecast_low: \n', [forecast_high[i]-forecast_low[i] for i in range(5)]

forecast_high - forecast_low: 
[42.591797, 63.759277, 50.56299, 51.539307, 52.221924]


In [61]:
print '最低价\n', forecast_low
print '最高价\n', forecast_high

最低价
[3106.521  3167.253  3184.285  3145.8984 3128.2612]
最高价
[3159.8875 3224.8262 3229.0786 3202.6182 3187.1047]
