In [128]:
import numpy as np 
import pandas as pd
import os

from sklearn import preprocessing, cross_validation
from sklearn.linear_model import LinearRegression

### View the data

In [129]:
DATA_PATH = os.path.join('data', 'ShanghaiCompositeIndex.csv')
df_original = pd.read_csv(DATA_PATH, encoding='gbk')[:-1]
# reverse the data
df = df_original.iloc[::-1]
df.head()
df.tail()

Unnamed: 0,日期,股票代码,名称,收盘价,最高价,最低价,开盘价,前收盘,涨跌额,涨跌幅,成交量,成交金额
4,2018-05-28,'000001,上证指数,3135.0821,3149.6646,3115.9585,3136.8092,3141.3032,-6.2211,-0.198,128755918,168191258410.0
3,2018-05-29,'000001,上证指数,3120.4605,3143.2076,3112.1531,3129.621,3135.0821,-14.6216,-0.4664,135717800,177826106449.0
2,2018-05-30,'000001,上证指数,3041.4434,3085.397,3041.0002,3081.1418,3120.4605,-79.0171,-2.5322,155115126,191995196670.0
1,2018-05-31,'000001,上证指数,3095.4737,3098.0764,3054.2685,3061.8291,3041.4434,54.0303,1.7765,140475532,185199037044.0
0,2018-06-01,'000001,上证指数,3075.1372,3102.088,3059.7857,3084.7536,3095.4737,-20.3365,-0.657,129872199,166548881781.0


## Data clean  
#### 1. Dropping some unnecessary feaures.
#### 2. There is a little number of datas equal to 'None', which locates in column '成交金额'.  We just set them as zero, since this number is very small.

In [130]:
df = df.drop(['日期'.decode('utf-8'), 
              '股票代码'.decode("utf-8"), 
              '名称'.decode("utf-8"), 
              '涨跌额'.decode("utf-8"), 
              '涨跌幅'.decode("utf-8"), 
              '前收盘'.decode('utf-8')],axis=1)
df = df.replace('None', 0)
df.head()

Unnamed: 0,收盘价,最高价,最低价,开盘价,成交量,成交金额
6711,104.39,104.39,99.98,104.3,197,84000.0
6710,109.13,109.13,103.73,109.07,28,16000.0
6709,114.55,114.55,109.13,113.57,32,31000.0
6708,120.25,120.25,114.55,120.09,15,6000.0
6707,125.27,125.27,120.25,125.27,100,53000.0


### 5 Day Moving Average
https://www.investopedia.com/articles/active-trading/052014/how-use-moving-average-buy-stocks.asp
##### But in there, we supposed each day has different weight, the price of today may has largest weight, if we forecast the price of tomorrow, intuitively. 

In [131]:
N = 4
df_new = df.copy()
for i in range(N):
    df_new = pd.concat([df_new, df.shift(-i-1)], axis=1)
df = df_new[:-N]
col = list(df.columns)
col[-5] = '最高价1'.decode('utf-8')
col[-4] = '最低价1'.decode('utf-8')
df.columns = col
df.head()

Unnamed: 0,收盘价,最高价,最低价,开盘价,成交量,成交金额,收盘价.1,最高价.1,最低价.1,开盘价.1,...,最低价.2,开盘价.2,成交量.1,成交金额.1,收盘价.2,最高价1,最低价1,开盘价.3,成交量.2,成交金额.2
6711,104.39,104.39,99.98,104.3,197,84000.0,109.13,109.13,103.73,109.07,...,114.55,120.09,15.0,6000.0,125.27,125.27,120.25,125.27,100.0,53000.0
6710,109.13,109.13,103.73,109.07,28,16000.0,114.55,114.55,109.13,113.57,...,120.25,125.27,100.0,53000.0,125.28,125.28,125.27,125.27,66.0,104000.0
6709,114.55,114.55,109.13,113.57,32,31000.0,120.25,120.25,114.55,120.09,...,125.27,125.27,66.0,104000.0,126.45,126.45,125.28,126.39,108.0,88000.0
6708,120.25,120.25,114.55,120.09,15,6000.0,125.27,125.27,120.25,125.27,...,125.28,126.39,108.0,88000.0,127.61,127.61,126.48,126.56,78.0,60000.0
6707,125.27,125.27,120.25,125.27,100,53000.0,125.28,125.28,125.27,125.27,...,126.48,126.56,78.0,60000.0,128.84,128.84,127.61,127.61,91.0,59000.0


In [132]:
df.tail()

Unnamed: 0,收盘价,最高价,最低价,开盘价,成交量,成交金额,收盘价.1,最高价.1,最低价.1,开盘价.1,...,最低价.2,开盘价.2,成交量.1,成交金额.1,收盘价.2,最高价1,最低价1,开盘价.3,成交量.2,成交金额.2
8,3214.3497,3214.5888,3192.2277,3211.247,144292684,185721667752.0,3168.9642,3205.437,3168.9642,3205.437,...,3131.0675,3148.41,128610843.0,166554049935.0,3135.0821,3149.6646,3115.9585,3136.8092,128755918.0,168191258410.0
7,3168.9642,3205.437,3168.9642,3205.437,157807648,199358101015.0,3154.6506,3173.5309,3152.0692,3167.9391,...,3115.9585,3136.8092,128755918.0,168191258410.0,3120.4605,3143.2076,3112.1531,3129.621,135717800.0,177826106449.0
6,3154.6506,3173.5309,3152.0692,3167.9391,124085800,160658185502.0,3141.3032,3156.7261,3131.0675,3148.41,...,3112.1531,3129.621,135717800.0,177826106449.0,3041.4434,3085.397,3041.0002,3081.1418,155115126.0,191995196670.0
5,3141.3032,3156.7261,3131.0675,3148.41,128610843,166554049935.0,3135.0821,3149.6646,3115.9585,3136.8092,...,3041.0002,3081.1418,155115126.0,191995196670.0,3095.4737,3098.0764,3054.2685,3061.8291,140475532.0,185199037044.0
4,3135.0821,3149.6646,3115.9585,3136.8092,128755918,168191258410.0,3120.4605,3143.2076,3112.1531,3129.621,...,3054.2685,3061.8291,140475532.0,185199037044.0,3075.1372,3102.088,3059.7857,3084.7536,129872199.0,166548881781.0


### Because we want to forecast the low and high prices(indices) in a week, so set $$forecast\_out = 5$$

In [133]:
forecast_out = 5

In [134]:
def forecast(forecast_col, forecast_out):
    df['label'] = df[forecast_col].shift(-forecast_out)
    X = np.array(df.drop(['label'], axis=1)).astype(np.float32)
    # print X
    X = preprocessing.scale(X)
    X_val = X[-2*forecast_out:-forecast_out]
    X_5 = X[-forecast_out:]
    
    X = X[:-2*forecast_out]
    y = np.array(df['label'])[:-2*forecast_out].astype(np.float32)
    y_val = np.array(df['label'])[-2*forecast_out:-forecast_out].astype(np.float32)
    print 'X shape and y shape: ', X.shape, y.shape
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
    print 'X_train shape and y_train shape: ', X_train.shape, y_train.shape
    print 'X_test shape and y_test shape: ', X_test.shape, y_test.shape
    clf = LinearRegression(n_jobs=-1)
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print 'confidence in test set: ', confidence
    print 'y_val_pred the price: \n'
    y_val_pred = clf.predict(X_val)
    print y_val_pred
    print '\nThe y_val is :\n\n', list(y_val)
    MSE = 0.2 * np.sqrt(np.sum((np.asarray(y_val_pred) - y_val)**2))
    print '\nThe average of gap for each day is: ', MSE
    print '\n The forecast the next five days:\n'
    forecast_set = clf.predict(X_5)
    print list(forecast_set)
    return forecast_set

# Forecasting the high price

In [135]:
forecast_col_high = '最高价1'.decode('utf-8')
forecast_high = forecast(forecast_col_high, forecast_out)

X shape and y shape:  (6698, 30) (6698,)
X_train shape and y_train shape:  (5358, 30) (5358,)
X_test shape and y_test shape:  (1340, 30) (1340,)
confidence in test set:  0.9934125742348064
y_val_pred the price: 

[3228.209  3213.3618 3169.1458 3157.9163 3149.2256]

The y_val is :

[3149.6646, 3143.2075, 3085.397, 3098.0764, 3102.088]

The average of gap for each day is:  30.924087524414062

 The forecast the next five days:

[3140.6753, 3131.1768, 3035.664, 3110.1226, 3078.4883]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Forecasting the low price

In [136]:
forecast_col_low = '最低价1'.decode('utf-8')
forecast_low = forecast(forecast_col_low, forecast_out)

X shape and y shape:  (6698, 30) (6698,)
X_train shape and y_train shape:  (5358, 30) (5358,)
X_test shape and y_test shape:  (1340, 30) (1340,)
confidence in test set:  0.9921201066859824
y_val_pred the price: 

[3181.7468 3171.7676 3128.4106 3120.7886 3107.4355]

The y_val is :

[3115.9585, 3112.153, 3041.0002, 3054.2686, 3059.7856]

The average of gap for each day is:  29.811404418945315

 The forecast the next five days:

[3096.836, 3087.8623, 2996.0542, 3067.379, 3032.6873]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Checking whether high prices are larger than low prices

In [137]:
print 'forecast_high - forecast_low: \n', [forecast_high[i]-forecast_low[i] for i in range(5)]

forecast_high - forecast_low: 
[43.839355, 43.314453, 39.609863, 42.743652, 45.801025]


In [138]:
print '最低价\n', forecast_low
print '最高价\n', forecast_high

最低价
[3096.836  3087.8623 2996.0542 3067.379  3032.6873]
最高价
[3140.6753 3131.1768 3035.664  3110.1226 3078.4883]
