# CS 282.01: Introduction to Machine Learning
## Spring 2018 Final Project Part 2: SSE Composite Index Case Study
## Author: Haofei Kuang, Yijun Yuan
## Task 1 Predict 4th day

In [1]:
# import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import sys

from sklearn.preprocessing import scale
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score

### Section 1: Data Pre-Processing
In this section, we have several tasks for processing the data, including:
- reading the original data
- splitting the original data into features and labels
- data standardization
- splitting the original data into training data and test data
- correlation analysis and feature selection (optional)

#### Reading Original Data

In [2]:
# Load the raw quotes; the first CSV column is parsed as dates and becomes the index.
# Rows are then sorted chronologically (oldest first). `axis` is passed by keyword
# because pandas 2.x accepts the arguments of sort_index as keywords only, and the
# result is assigned instead of sorting in place so the cell is a single expression.
orignal_data = pd.read_csv(
    'dataset/01.csv', encoding='gbk', parse_dates=[0], index_col=0
).sort_index(axis=0, ascending=True)

#### Split data into features and labels

In [3]:
data = orignal_data.copy()

# Remove samples with unknown values: a row is discarded when any of its
# columns holds the literal string 'None'.
samples = data.index.values.size
print('There are total %d samples' % samples)

# A single boolean mask over the whole frame replaces the nested loops that
# dropped rows one label at a time (slow, and a KeyError if a label repeats).
has_unknown = data.isin(['None']).any(axis=1)
data = data.loc[~has_unknown].copy()

print('There are %d samples are removed' % (samples - data.index.values.size))
# From here on `samples` is the number of rows that survived the filter.
samples = data.index.values.size

There are total 6713 samples
There are 11 samples are removed


In [4]:
# Split the data into features and labels.
features_days = 5   # number of consecutive rows that form one feature window
num_of_items = 9    # number of quote columns taken from each row
num_of_features = features_days * num_of_items

# Forecast horizon, counted from 0: 0, 1, 2, 3, 4 select the 1st ... 5th row
# after the end of the feature window.
after_day = 3

# Columns copied into every feature window, in this order.
feature_columns = [u'收盘价', u'最高价', u'最低价', u'开盘价', u'前收盘',
                   u'涨跌额', u'涨跌幅', u'成交量', u'成交金额']

num_of_windows = samples - features_days - after_day
label_offset = features_days + after_day

# Convert to numpy once, outside the loop: the per-iteration DataFrame slicing
# and .iloc lookups were loop-invariant work repeated for every window.
quotes = np.array(data[feature_columns]).astype(float)

features = np.zeros((num_of_windows, num_of_features))
for i in range(num_of_windows):
    # Row-major flattening: all columns of row i, then of row i + 1, and so on.
    features[i, 0:num_of_features] = quotes[i:i + features_days].reshape(num_of_features)

# Label of window i is the high / low of row i + features_days + after_day.
labels_max = np.array(data[u'最高价'])[label_offset:label_offset + num_of_windows].astype(float)
labels_min = np.array(data[u'最低价'])[label_offset:label_offset + num_of_windows].astype(float)

In [5]:
# Standardise the features and both label vectors with separate scalers, so the
# label scalers can later map predictions back to prices.
# NOTE(review): each scaler is fitted on all samples, including the rows that the
# later TimeSeriesSplit cell puts in the test set - confirm this is intended.
from sklearn.preprocessing import StandardScaler

scaler_features = StandardScaler()
scaler_max_labels = StandardScaler()
scaler_min_labels = StandardScaler()

features = scaler_features.fit_transform(features)

# The scalers work on 2-D input: reshape to one column, scale, flatten back to 1-D.
labels_max = scaler_max_labels.fit_transform(labels_max.reshape(-1, 1)).ravel()
labels_min = scaler_min_labels.fit_transform(labels_min.reshape(-1, 1)).ravel()

In [6]:
# Feature selection: keep the 23 columns ranked best by f_regression,
# chosen independently for the max-price and the min-price targets.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

selector_max = SelectKBest(f_regression, k=23)
features_max = selector_max.fit_transform(features, labels_max)

selector_min = SelectKBest(f_regression, k=23)
features_min = selector_min.fit_transform(features, labels_min)

# Show the full feature matrix shape and the boolean mask of columns each selector kept.
print(features.shape)
for selector in (selector_max, selector_min):
    print(selector.get_support())

(6694, 45)
[ True  True  True False False False False False False  True  True  True
  True  True False False False False  True  True  True  True  True False
 False False False  True  True  True  True  True False False False False
  True  True  True  True  True False False False False]
[ True  True  True False False False False False False  True  True  True
  True  True False False False False  True  True  True  True  True False
 False False False  True  True  True  True  True False False False False
  True  True  True  True  True False False False False]


In [7]:
# The samples form a time series, so a general cross-validation split is not
# appropriate; a TimeSeriesSplit with 10 splits is used instead.
tscv = TimeSeriesSplit(n_splits=10)

# Only the final split produced by the splitter is kept (earlier splits are discarded).

# Dataset for predicting max values.
*_, (train_index, test_index) = tscv.split(features_max)
X_max_train, y_max_train = features_max[train_index], labels_max[train_index]
X_max_test, y_max_test = features_max[test_index], labels_max[test_index]

# Dataset for predicting min values.
*_, (train_index, test_index) = tscv.split(features_min)
X_min_train, y_min_train = features_min[train_index], labels_min[train_index]
X_min_test, y_min_test = features_min[test_index], labels_min[test_index]

for held_out in (X_max_test, X_min_test):
    print(held_out.shape)

(608, 23)
(608, 23)


In [8]:
# Build the model input for the forecast from the last `features_days` rows of the data.
latest_window = data.iloc[-features_days:][
    [u'收盘价', u'最高价', u'最低价', u'开盘价', u'前收盘', u'涨跌额', u'涨跌幅', u'成交量', u'成交金额']]
features_predict = np.array(latest_window).reshape((1, num_of_features))

# Apply the same scaling and the same column selection that the training data received.
features_predict = scaler_features.transform(features_predict)
x_max, x_min = (selector.transform(features_predict)
                for selector in (selector_max, selector_min))



### Section 2: Training Model
According to our experiments, we found that ExtraTree Regression exhibits better performance on this problem
- ExtraTree Regression

In [9]:
# Helper that reports how well a fitted model does on held-out data.
def display_result(model, X_test, y_test):
    """Print the mean squared error and the R^2 score of `model` on a test set.

    Args:
        model: fitted estimator exposing `predict`.
        X_test: feature matrix passed to `model.predict`.
        y_test: true target values compared against the predictions.
    """
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    # Printed under the label 'Variance score'; the value comes from r2_score.
    r2 = r2_score(y_test, predictions)

    print("Mean squared error: %f" % mse)
    print('Variance score: %f' % r2)

#### Training with ExtraTreesRegressor

In [10]:
# ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesRegressor

# A fixed random_state makes the randomised ensemble reproducible between runs.
reg_et_max = ExtraTreesRegressor(n_estimators=300, min_samples_split=10, random_state=0)
reg_et_min = ExtraTreesRegressor(n_estimators=300, min_samples_split=10, random_state=0)

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start = time.perf_counter()

print('Training....')

reg_et_max.fit(X_max_train, y_max_train)
reg_et_min.fit(X_min_train, y_min_train)

elapsed = time.perf_counter() - start
print("Time used:", elapsed)

Training....
Time used: 7.406156999999999


In [11]:
# Display the test-set metrics of the ExtraTrees models.
print('The result of ExtraTree Regression about predicting maximal price: ')
display_result(reg_et_max, X_max_test, y_max_test)

print('----------------------------------------------------------------')

print('The result of ExtraTree Regression about predicting minimal price: ')
display_result(reg_et_min, X_min_test, y_min_test)

print('----------------------------------------------------------------')

# Forecast for the most recent feature window, reported below as 2018.6.6.
# NOTE(review): the Ridge cell labels the forecast from the same inputs as
# 2018.6.7 - confirm which date is correct.
# predict() returns a 1-D array, but the label scalers operate on 2-D input, so
# the prediction is reshaped to one column and the single value is extracted as
# a scalar before it is formatted with %0.2f.
y_max = scaler_max_labels.inverse_transform(reg_et_max.predict(x_max).reshape(-1, 1))[0, 0]
y_min = scaler_min_labels.inverse_transform(reg_et_min.predict(x_min).reshape(-1, 1))[0, 0]

print('2018.6.6 最高价: %0.2f RMB' % y_max)
print('2018.6.6 最低价: %0.2f RMB' % y_min)

The result of ExtraTree Regression about predicting maximal price: 
Mean squared error: 0.004146
Variance score: 0.852951
----------------------------------------------------------------
The result of ExtraTree Regression about predicting minimal price: 
Mean squared error: 0.005587
Variance score: 0.820521
----------------------------------------------------------------
2018.6.6 最高价: 3163.52 RMB
2018.6.6 最低价: 3114.01 RMB


In [13]:
# Ridge Regression
from sklearn.linear_model import Ridge

reg_ridge_max = Ridge(alpha=0.1)
reg_ridge_min = Ridge(alpha=0.1)

# time.clock() was removed in Python 3.8; perf_counter() is the supported
# high-resolution timer for measuring elapsed time.
start = time.perf_counter()
print('Traininig......')

reg_ridge_max.fit(X_max_train, y_max_train)
reg_ridge_min.fit(X_min_train, y_min_train)

elapsed = time.perf_counter() - start
print("Time used:", elapsed)

Traininig......
Time used: 0.011752999999998792


In [14]:
# Display the test-set metrics of the Ridge models.
print('The result of Ridge Regression about predicting maximal price: ')
display_result(reg_ridge_max, X_max_test, y_max_test)

print('----------------------------------------------------------------')

print('The result of Ridge Regression about predicting minimal price: ')
display_result(reg_ridge_min, X_min_test, y_min_test)

print('----------------------------------------------------------------')

# Forecast for the most recent feature window, reported below as 2018.6.7.
# NOTE(review): the ExtraTrees cell labels the forecast from the same inputs as
# 2018.6.6 - confirm which date is correct.
# predict() returns a 1-D array, but the label scalers operate on 2-D input, so
# the prediction is reshaped to one column and the single value is extracted as
# a scalar before it is formatted with %0.2f.
y_max = scaler_max_labels.inverse_transform(reg_ridge_max.predict(x_max).reshape(-1, 1))[0, 0]
y_min = scaler_min_labels.inverse_transform(reg_ridge_min.predict(x_min).reshape(-1, 1))[0, 0]

print('2018.6.7 最高价: %0.2f RMB' % y_max)
print('2018.6.7 最低价: %0.2f RMB' % y_min)

The result of Ridge Regression about predicting maximal price: 
Mean squared error: 0.003248
Variance score: 0.884803
----------------------------------------------------------------
The result of Ridge Regression about predicting minimal price: 
Mean squared error: 0.004376
Variance score: 0.859421
----------------------------------------------------------------
2018.6.7 最高价: 3092.95 RMB
2018.6.7 最低价: 3043.01 RMB
