## 1.0 介绍, 数据

In [None]:
import pandas as pd
import Quandl

df = Quandl.get('WIKI/GOOGL')

df.head()

In [None]:
df.shape

In [None]:
df.tail(3)

In [None]:
df = df[['Adj. Open',  'Adj. High',  'Adj. Low',  'Adj. Close', 'Adj. Volume']]

In [None]:
df.shape

## 2.数据特征和标签

In [None]:
# high minus low percent
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
# the percent change volatility
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

In [None]:
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]

In [None]:
df.head()

In [None]:
df.tail(3)

In [None]:
import math

# 预测未来股价
forecast_col = 'Adj. Close'
df.fillna(value=-99999, inplace=True)
# 预测的天数为过去总天数的1%
forecast_out = int(math.ceil(0.01 * len(df)))
# 标签是未来的股价
df['label'] = df[forecast_col].shift(-forecast_out)

In [None]:
df.tail()

In [None]:
df.dropna(inplace=True)

In [None]:
df.tail()

In [None]:
df.head()

In [None]:
a = {'one': [1., 2., 3., 4, 5.], 
     'two': [5., 4., 3., 2., 1.]}

a = pd.DataFrame(a)

In [None]:
a

In [None]:
forecast_col = 'one'
a.fillna(-999, inplace=True)
forecast_out = 1
a['label'] = a[forecast_col].shift(-forecast_out)

In [None]:
a

## 3.训练和测试

In [None]:
dateset = df.values

In [None]:
dateset.shape

In [None]:
from sklearn.preprocessing import scale
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

import numpy as np

# 准备数据
X = dateset[:, 0:4].astype(float)
y = dateset[:, 4]
# 训练样本特征标准化
X = scale(X)

# 训练集和测试集划分
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SVM分类器
clf = SVR()
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)
print("Accuracy: {0:.2f}%".format(confidence*100))

In [None]:
# 线性回归分类
clf = LinearRegression()
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)
print("Accuracy: {0:.2f}%".format(confidence*100))

In [None]:
X.shape

In [None]:
# svm 不同的核函数
for k in ['linear', 'poly', 'rbf', 'sigmoid']:
    clf = SVR(kernel=k)
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print("Accuracy of {0}: {1:.2f}%".format(k, confidence*100))

## 4.预测

In [1]:
import Quandl, math
import numpy as np
import pandas as pd

from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression

df = Quandl.get('WIKI/GOOGL')
df = df[['Adj. Open',  'Adj. High',  'Adj. Low',  'Adj. Close', 'Adj. Volume']]
# high minus low percent
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
# the percent change volatility
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
# 预测未来股价
forecast_col = 'Adj. Close'
df.fillna(value=-99999, inplace=True)
# 预测的天数为过去总天数的1%
forecast_out = int(math.ceil(0.01 * len(df)))
# 标签是未来的股价
df['label'] = df[forecast_col].shift(-forecast_out)



In [2]:
df.shape

(3274, 5)

In [3]:
dataset = df.values

In [4]:
dataset.shape

(3274L, 5L)

In [5]:
df.shape

(3274, 5)

In [6]:
# 提取数据
X = dataset[:, 0:4].astype(float)
X.shape

(3274L, 4L)

In [7]:
# 处理所有数据
X = preprocessing.scale(X)
# 未知价格的数据特征
X_lately = X[-forecast_out:]
# 已知数据样本集(特征)
X = X[:-forecast_out]
X.shape

(3241L, 4L)

In [8]:
X_lately.shape

(33L, 4L)

In [9]:
forecast_out

33

In [10]:
# 已知数据样本集标签
y = dataset[:, 4]
y.shape

(3274L,)

In [11]:
y = y[:-forecast_out]
y.shape

(3241L,)

In [12]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
                                X, y, test_size=0.2)

# 分类器
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)
print("Accuracy: {:.2f}%".format(confidence*100))

Accuracy: 97.48%


In [13]:
forecast_set = clf.predict(X_lately)

In [14]:
print(forecast_set)
print("Accuracy: {:.2f}%".format(confidence*100))
print("预测样本数目: {}".format(forecast_out))

[  944.26776496   939.85190309   953.38348392   964.14954499   966.80126128
   980.58743969   981.42108695   990.67295057   988.83026439   999.27056619
  1006.29554334  1004.95409192  1007.08547972  1010.38241723   980.41202897
   977.75831007   961.87014973   970.69036635   957.03258855   959.88725645
   959.77188879   953.46535171   959.38389397   959.53872232   957.3800837
   953.32527037   935.59891047   942.97082734   952.66624921   951.85017127
   957.54644243   939.76578801   939.68442485]
Accuracy: 97.48%
预测样本数目: 33


In [15]:
df.shape

(3274, 5)

In [16]:
df.iloc[-1]

Adj. Close     9.261800e+02
HL_PCT         8.173357e-01
PCT_change    -8.630175e-02
Adj. Volume    1.327288e+06
label                   NaN
Name: 2017-08-18 00:00:00, dtype: float64

In [18]:
# plot
import datetime
import matplotlib.pyplot as plt
from matplotlib import style

style.use('ggplot')

df.dropna(inplace=True)

df['Forecast'] = np.nan

last_date = df.iloc[-1].name
last_unix = last_date.timestamp()
# 一天 3600 * 24 = 86400
one_day = 86400 
next_unix = last_unix + one_day

for i in forecast_set:
    next_date = datetime.datetime.fromtimestamp(next_unit)
    next_unix += 86400
    df.loc[next_date] = [np.nan for _ in range(len(df.columns)-1)]+[i]    

IndexError: single positional indexer is out-of-bounds

python 2.x 的 datetime 无 timestamp
而 python 3.x 的就有.