# CS 282.01: Introduction to Machine Learning
## Spring 2018 Final Project Part 2: SSE Composite Index Case Study
## Author: Haofei Kuang, Yijun Yuan

In [58]:
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys

from sklearn.preprocessing import scale
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score

### Section 1: Data Pre-Processing
In this section, we have several tasks for processing the data, including:
- reading the original data
- splitting the original data into features and labels
- splitting the original data into training data and test data
- correlation analysis and feature selection (optional)

#### Reading Original Data

In [59]:
# Load the raw CSV; its first column is parsed as dates and used as the row index.
orignal_data = pd.read_csv('dataset/sse.csv', encoding='gbk', parse_dates=[0], index_col=0)
# Order the rows chronologically (oldest first). `axis` is passed by keyword because
# pandas 2.x made the arguments of sort_index keyword-only, and the result is
# re-assigned instead of sorting in place so the cell is safe to re-run.
orignal_data = orignal_data.sort_index(axis=0, ascending=True)
orignal_data

Unnamed: 0_level_0,股票代码,名称,收盘价,最高价,最低价,开盘价,前收盘,涨跌额,涨跌幅,成交量,成交金额
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1990-12-19,'000001,上证指数,99.9800,99.9800,95.7900,96.0500,,,,1260,494000.0
1990-12-20,'000001,上证指数,104.3900,104.3900,99.9800,104.3000,99.98,4.41,4.4109,197,84000.0
1990-12-21,'000001,上证指数,109.1300,109.1300,103.7300,109.0700,104.39,4.74,4.5407,28,16000.0
1990-12-24,'000001,上证指数,114.5500,114.5500,109.1300,113.5700,109.13,5.42,4.9666,32,31000.0
1990-12-25,'000001,上证指数,120.2500,120.2500,114.5500,120.0900,114.55,5.7,4.976,15,6000.0
1990-12-26,'000001,上证指数,125.2700,125.2700,120.2500,125.2700,120.25,5.02,4.1746,100,53000.0
1990-12-27,'000001,上证指数,125.2800,125.2800,125.2700,125.2700,125.27,0.01,0.008,66,104000.0
1990-12-28,'000001,上证指数,126.4500,126.4500,125.2800,126.3900,125.28,1.17,0.9339,108,88000.0
1990-12-31,'000001,上证指数,127.6100,127.6100,126.4800,126.5600,126.45,1.16,0.9174,78,60000.0
1991-01-02,'000001,上证指数,128.8400,128.8400,127.6100,127.6100,127.61,1.23,0.9639,91,59000.0


#### Split data into features and labels

In [60]:
data = orignal_data.copy()

# Remove every row that has an unknown value in any column.
samples = data.index.values.size
print('There are total %d samples' % samples)

# An unknown value is either the literal string 'None' (how the CSV marks it) or a
# real missing value: newer pandas versions may already parse 'None' as NaN when
# reading the file, in which case a string comparison alone would find nothing.
# A single vectorised mask replaces the former row-by-row in-place drop loop,
# which was slow and raised KeyError when an index label occurred more than once.
has_unknown = (data.isin(['None']) | data.isna()).any(axis=1)
data = data.loc[~has_unknown]

print('There are %d samples are removed' % (samples - data.index.values.size))
samples = data.index.values.size

There are total 6712 samples
There are 11 samples are removed


In [61]:
# Split the data into features and labels with a sliding window:
# the features of sample i are the selected columns of rows i .. i+features_days-1
# (flattened day by day), and its labels are the high / low price of row i+features_days.
features_days = 5
feature_columns = [u'收盘价', u'最高价', u'最低价', u'开盘价', u'前收盘', u'涨跌额', u'涨跌幅', u'成交量', u'成交金额']
# Derived from the column list so the two can never drift apart.
num_of_items = len(feature_columns)
num_of_features = features_days * num_of_items

# Convert the selected columns to one float matrix up front instead of slicing the
# DataFrame once per sample.
values = data[feature_columns].astype(float).to_numpy()
num_windows = samples - features_days

# Block d holds "day d of every window"; stacking the blocks side by side yields the
# same row layout as reshaping each (features_days, num_of_items) window to one row.
features = np.hstack([values[day:day + num_windows] for day in range(features_days)])

label_rows = values[features_days:features_days + num_windows]
labels_max = label_rows[:, feature_columns.index(u'最高价')].copy()
labels_min = label_rows[:, feature_columns.index(u'最低价')].copy()

In [62]:
# data normalization: standardise the feature matrix and both label vectors,
# keeping one fitted scaler per quantity so predictions can be mapped back later
from sklearn.preprocessing import StandardScaler

scaler_features = StandardScaler()
scaler_max_labels = StandardScaler()
scaler_min_labels = StandardScaler()

# NOTE(review): the scalers are fitted on the full data set, i.e. before the
# train/test split performed further down, so test-set statistics leak into training.
features = scaler_features.fit_transform(features)

# The scalers expect 2-D input, so each label vector is viewed as one column for
# scaling and flattened back to 1-D afterwards.
labels_max = scaler_max_labels.fit_transform(labels_max.reshape(-1, 1)).ravel()
labels_min = scaler_min_labels.fit_transform(labels_min.reshape(-1, 1)).ravel()

In [63]:
# feature selection: for each target keep the 23 features that score best
# under the univariate f_regression test
from sklearn.feature_selection import SelectKBest, f_regression

selector_max = SelectKBest(f_regression, k=23)
selector_min = SelectKBest(f_regression, k=23)

features_max = selector_max.fit_transform(features, labels_max)
features_min = selector_min.fit_transform(features, labels_min)

print(features.shape)
# Boolean masks over the original feature columns: True marks a kept feature.
for selector in (selector_max, selector_min):
    print(selector.get_support())

(6696, 45)
[ True  True  True False False False False False False  True  True  True
  True  True False False False False  True  True  True  True  True False
 False False False  True  True  True  True  True False False False False
  True  True  True  True  True False False False False]
[ True  True  True False False False False False False  True  True  True
  True  True False False False False  True  True  True  True  True False
 False False False  True  True  True  True  True False False False False
  True  True  True  True  True False False False False]


In [99]:
def _selected_feature_names(support_mask):
    """Translate a selector support mask into '<day> <column name>' labels.

    The flat feature position p corresponds to day ``p // num_of_items + 1`` of the
    window and to column ``p % num_of_items + 2`` of ``data``; the offset of 2 skips
    the leading columns of ``data`` that are not used as features (presumably the
    stock code and name columns -- verify against the loaded frame).
    """
    names = []
    for position, selected in enumerate(support_mask):
        if selected:
            day_index = position // num_of_items + 1
            names.append(str(day_index) + ' ' + data.columns[position % num_of_items + 2])
    return names


max_index = selector_max.get_support()
min_index = selector_min.get_support()

features_name_max = _selected_feature_names(max_index)
features_name_min = _selected_feature_names(min_index)

print('max values features: ', features_name_max)
print('--------------------------------------------------------------------')
print('min values features: ', features_name_min)

max values features:  ['1 收盘价', '1 最高价', '1 最低价', '2 收盘价', '2 最高价', '2 最低价', '2 开盘价', '2 前收盘', '3 收盘价', '3 最高价', '3 最低价', '3 开盘价', '3 前收盘', '4 收盘价', '4 最高价', '4 最低价', '4 开盘价', '4 前收盘', '5 收盘价', '5 最高价', '5 最低价', '5 开盘价', '5 前收盘']
--------------------------------------------------------------------
min values features:  ['1 收盘价', '1 最高价', '1 最低价', '2 收盘价', '2 最高价', '2 最低价', '2 开盘价', '2 前收盘', '3 收盘价', '3 最高价', '3 最低价', '3 开盘价', '3 前收盘', '4 收盘价', '4 最高价', '4 最低价', '4 开盘价', '4 前收盘', '5 收盘价', '5 最高价', '5 最低价', '5 开盘价', '5 前收盘']


In [64]:
# Split features and labels into a training set and a test set.
# The data is a time series, so an ordinary shuffled K-fold would let the model
# train on the future; TimeSeriesSplit is used instead.
tscv = TimeSeriesSplit(n_splits=10)

# Only the last of the generated folds is kept: the starred unpacking
# discards all earlier (train, test) index pairs.

# dataset for predicting max values
*_, (train_index, test_index) = tscv.split(features_max)
X_max_train, X_max_test = features_max[train_index], features_max[test_index]
y_max_train, y_max_test = labels_max[train_index], labels_max[test_index]

# dataset for predicting min values
*_, (train_index, test_index) = tscv.split(features_min)
X_min_train, X_min_test = features_min[train_index], features_min[test_index]
y_min_train, y_min_test = labels_min[train_index], labels_min[test_index]

print(X_max_test.shape)
print(X_min_test.shape)

(608, 23)
(608, 23)


In [65]:
# Extract the most recent `features_days` rows and flatten them into the single
# feature vector used to predict the next day's values.
predict_columns = [u'收盘价', u'最高价', u'最低价', u'开盘价', u'前收盘', u'涨跌额', u'涨跌幅', u'成交量', u'成交金额']
last_window = data.iloc[-features_days:][predict_columns]
features_predict = last_window.to_numpy().reshape((1, num_of_features))

# Apply the scaling fitted on the training features, then each target's
# feature selection.
features_predict = scaler_features.transform(features_predict)
x_max = selector_max.transform(features_predict)
x_min = selector_min.transform(features_predict)



### Section 2: Training Model
Select some regression algorithms and train a model with each of them, using the training data
- Linear Models
    - Linear Regression 
    - Ridge Regression
- Decision Tree
    - Random Forest Regression
    - Extra Trees
- SVM
    - SVR

In [66]:
# define a function to display the accuracy of a model
def display_result(model, X_test, y_test):
    """Print test-set error metrics for a fitted regressor.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_test : test features passed to ``model.predict``
    y_test : true targets the predictions are compared against

    Prints the mean squared error, then the value of ``r2_score`` (reported
    under the label "Variance score"; 1 is a perfect prediction). Returns None.
    """
    predicted = model.predict(X_test)

    # Each metric is computed and printed in turn, in the listed order.
    reports = (
        ("Mean squared error: %f", mean_squared_error),
        ('Variance score: %f', r2_score),
    )
    for template, metric in reports:
        print(template % metric(y_test, predicted))

#### Training with linear models

In [67]:
# Linear Regression training: one model per target (daily high / daily low)
from sklearn.linear_model import LinearRegression

reg_lr_max = LinearRegression()
reg_lr_min = LinearRegression()

# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
start = time.perf_counter()
print('Traininig......')

reg_lr_max.fit(X_max_train, y_max_train)
reg_lr_min.fit(X_min_train, y_min_train)

elapsed = (time.perf_counter() - start)
print("Time used:",elapsed)

Traininig......
Time used: 0.015264000000001943


In [68]:
# display result of Linear Regression
print('The result of Linear Regression about predicting maximal price: ')
display_result(reg_lr_max, X_max_test, y_max_test)

print('----------------------------------------------------------------')

print('The result of Linear Regression about predicting minimal price: ')
display_result(reg_lr_min, X_min_test, y_min_test)

print('----------------------------------------------------------------')

# predict 2018.6.1
# predict() returns a 1-D array while the label scalers were fitted on a single
# 2-D column, so the prediction is reshaped to (n, 1) before undoing the scaling.
# .item() then extracts the one value as a plain float for the '%0.2f' format
# (formatting a size-1 ndarray relies on a conversion that NumPy has deprecated).
y_max = scaler_max_labels.inverse_transform(reg_lr_max.predict(x_max).reshape(-1, 1)).item()
y_min = scaler_min_labels.inverse_transform(reg_lr_min.predict(x_min).reshape(-1, 1)).item()

print('2018.6.1 最高价: %0.2f RMB' % y_max)
print('2018.6.1 最低价: %0.2f RMB' % y_min)

The result of Linear Regression about predicting maximal price: 
Mean squared error: 0.000391
Variance score: 0.986237
----------------------------------------------------------------
The result of Linear Regression about predicting minimal price: 
Mean squared error: 0.000711
Variance score: 0.977276
----------------------------------------------------------------
2018.6.1 最高价: 3121.75 RMB
2018.6.1 最低价: 3077.26 RMB


In [69]:
# Ridge Regression training: one model per target (daily high / daily low)
from sklearn.linear_model import Ridge

reg_ridge_max = Ridge(alpha=0.1)
reg_ridge_min = Ridge(alpha=0.1)

# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
start = time.perf_counter()
print('Traininig......')

reg_ridge_max.fit(X_max_train, y_max_train)
reg_ridge_min.fit(X_min_train, y_min_train)

elapsed = (time.perf_counter() - start)
print("Time used:",elapsed)

Traininig......
Time used: 0.011926000000002546


In [70]:
# display result of Ridge Regression
print('The result of Ridge Regression about predicting maximal price: ')
display_result(reg_ridge_max, X_max_test, y_max_test)

print('----------------------------------------------------------------')

print('The result of Ridge Regression about predicting minimal price: ')
display_result(reg_ridge_min, X_min_test, y_min_test)

print('----------------------------------------------------------------')

# predict 2018.6.1
# predict() returns a 1-D array while the label scalers were fitted on a single
# 2-D column, so the prediction is reshaped to (n, 1) before undoing the scaling.
# .item() then extracts the one value as a plain float for the '%0.2f' format
# (formatting a size-1 ndarray relies on a conversion that NumPy has deprecated).
y_max = scaler_max_labels.inverse_transform(reg_ridge_max.predict(x_max).reshape(-1, 1)).item()
y_min = scaler_min_labels.inverse_transform(reg_ridge_min.predict(x_min).reshape(-1, 1)).item()

print('2018.6.1 最高价: %0.2f RMB' % y_max)
print('2018.6.1 最低价: %0.2f RMB' % y_min)

The result of Ridge Regression about predicting maximal price: 
Mean squared error: 0.000389
Variance score: 0.986318
----------------------------------------------------------------
The result of Ridge Regression about predicting minimal price: 
Mean squared error: 0.000708
Variance score: 0.977376
----------------------------------------------------------------
2018.6.1 最高价: 3121.72 RMB
2018.6.1 最低价: 3077.22 RMB


#### Decision Tree Models

In [71]:
# Random Forest Regression training: one model per target (daily high / daily low)
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the randomised forest, and therefore the reported
# scores and predictions, reproducible across kernel restarts.
reg_rf_max = RandomForestRegressor(n_estimators=300, random_state=0)
reg_rf_min = RandomForestRegressor(n_estimators=300, random_state=0)

# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
start = time.perf_counter()

print('Training....')

reg_rf_max.fit(X_max_train, y_max_train)
reg_rf_min.fit(X_min_train, y_min_train)

elapsed = (time.perf_counter() - start)
print("Time used:",elapsed)

Training....
Time used: 34.223062


In [72]:
# display result of Random Forest Regression
print('The result of Random Forest Regression about predicting maximal price: ')
display_result(reg_rf_max, X_max_test, y_max_test)

print('----------------------------------------------------------------')

print('The result of Random Forest Regression about predicting minimal price: ')
display_result(reg_rf_min, X_min_test, y_min_test)

print('----------------------------------------------------------------')

# predict 2018.6.1
# predict() returns a 1-D array while the label scalers were fitted on a single
# 2-D column, so the prediction is reshaped to (n, 1) before undoing the scaling.
# .item() then extracts the one value as a plain float for the '%0.2f' format
# (formatting a size-1 ndarray relies on a conversion that NumPy has deprecated).
y_max = scaler_max_labels.inverse_transform(reg_rf_max.predict(x_max).reshape(-1, 1)).item()
y_min = scaler_min_labels.inverse_transform(reg_rf_min.predict(x_min).reshape(-1, 1)).item()

print('2018.6.1 最高价: %0.2f RMB' % y_max)
print('2018.6.1 最低价: %0.2f RMB' % y_min)

The result of Random Forest Regression about predicting maximal price: 
Mean squared error: 0.000628
Variance score: 0.977872
----------------------------------------------------------------
The result of Random Forest Regression about predicting minimal price: 
Mean squared error: 0.001198
Variance score: 0.961745
----------------------------------------------------------------
2018.6.1 最高价: 3125.51 RMB
2018.6.1 最低价: 3067.52 RMB


In [73]:
# ExtraTreesRegressor training: one model per target (daily high / daily low)
from sklearn.ensemble import ExtraTreesRegressor

# A fixed random_state makes the randomised trees, and therefore the reported
# scores and predictions, reproducible across kernel restarts.
reg_et_max = ExtraTreesRegressor(n_estimators=300, min_samples_split=10, random_state=0)
reg_et_min = ExtraTreesRegressor(n_estimators=300, min_samples_split=10, random_state=0)

# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
start = time.perf_counter()

print('Training....')

reg_et_max.fit(X_max_train, y_max_train)
reg_et_min.fit(X_min_train, y_min_train)

elapsed = (time.perf_counter() - start)
print("Time used:",elapsed)

Training....
Time used: 6.193400999999994


In [74]:
# display result of ExtraTree Regression
print('The result of ExtraTree Regression about predicting maximal price: ')
display_result(reg_et_max, X_max_test, y_max_test)

print('----------------------------------------------------------------')

print('The result of ExtraTree Regression about predicting minimal price: ')
display_result(reg_et_min, X_min_test, y_min_test)

print('----------------------------------------------------------------')

# predict 2018.6.1
# predict() returns a 1-D array while the label scalers were fitted on a single
# 2-D column, so the prediction is reshaped to (n, 1) before undoing the scaling.
# .item() then extracts the one value as a plain float for the '%0.2f' format
# (formatting a size-1 ndarray relies on a conversion that NumPy has deprecated).
y_max = scaler_max_labels.inverse_transform(reg_et_max.predict(x_max).reshape(-1, 1)).item()
y_min = scaler_min_labels.inverse_transform(reg_et_min.predict(x_min).reshape(-1, 1)).item()

print('2018.6.1 最高价: %0.2f RMB' % y_max)
print('2018.6.1 最低价: %0.2f RMB' % y_min)

The result of ExtraTree Regression about predicting maximal price: 
Mean squared error: 0.000648
Variance score: 0.977181
----------------------------------------------------------------
The result of ExtraTree Regression about predicting minimal price: 
Mean squared error: 0.001062
Variance score: 0.966087
----------------------------------------------------------------
2018.6.1 最高价: 3124.20 RMB
2018.6.1 最低价: 3065.71 RMB


#### SVM

In [75]:
# SVR training: one linear-kernel model per target (daily high / daily low),
# each with its own C / epsilon setting
from sklearn.svm import SVR

reg_svr_max = SVR(C=1, epsilon=0.2, kernel='linear')
reg_svr_min = SVR(C=0.01, epsilon=0.5, kernel='linear')

# time.clock() was removed in Python 3.8; perf_counter() is the documented
# replacement for measuring elapsed time.
start = time.perf_counter()

print('Training....')

reg_svr_max.fit(X_max_train, y_max_train)
reg_svr_min.fit(X_min_train, y_min_train)

elapsed = (time.perf_counter() - start)
print("Time used:",elapsed)

Training....
Time used: 0.08084900000000061


In [76]:
# display result of SVR
print('The result of SVR Regression about predicting maximal price: ')
display_result(reg_svr_max, X_max_test, y_max_test)

print('----------------------------------------------------------------')

print('The result of SVR Regression about predicting minimal price: ')
display_result(reg_svr_min, X_min_test, y_min_test)

print('----------------------------------------------------------------')

# predict 2018.6.1
# predict() returns a 1-D array while the label scalers were fitted on a single
# 2-D column, so the prediction is reshaped to (n, 1) before undoing the scaling.
# .item() then extracts the one value as a plain float for the '%0.2f' format
# (formatting a size-1 ndarray relies on a conversion that NumPy has deprecated).
y_max = scaler_max_labels.inverse_transform(reg_svr_max.predict(x_max).reshape(-1, 1)).item()
y_min = scaler_min_labels.inverse_transform(reg_svr_min.predict(x_min).reshape(-1, 1)).item()

print('2018.6.1 最高价: %0.2f RMB' % y_max)
print('2018.6.1 最低价: %0.2f RMB' % y_min)

The result of SVR Regression about predicting maximal price: 
Mean squared error: 0.004593
Variance score: 0.838300
----------------------------------------------------------------
The result of SVR Regression about predicting minimal price: 
Mean squared error: 0.004537
Variance score: 0.855062
----------------------------------------------------------------
2018.6.1 最高价: 3166.94 RMB
2018.6.1 最低价: 3047.08 RMB
