In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import score
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingRegressor
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [10]:
df = pd.read_csv('etc.csv',parse_dates=True, index_col=('date'))
df.dropna(inplace=True)
df.rename(columns={'date':'Date','PriceUSD':'Close'}, inplace=True)
# df = df[['FeeMedUSD','TxTfrValMedUSD','Close','IssTotUSD','VtyDayRet30d','TxTfrValMeanNtv']]
df = df[['FeeMedUSD','TxTfrValMedUSD','Close']]

futures = df.tail()
futures

Unnamed: 0_level_0,FeeMedUSD,TxTfrValMedUSD,Close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-11-03,0.000103,4.879751,4.885147
2019-11-04,0.000104,4.957563,4.959801
2019-11-05,0.000105,5.018602,5.021905
2019-11-06,0.000107,5.07521,5.077399
2019-11-07,0.00011,5.224026,5.225557


In [11]:
X=df[:-5].copy()
# X=X[['FeeMedUSD','TxTfrValMedUSD','Close','IssTotUSD','VtyDayRet30d','TxTfrValMeanNtv']]

In [12]:
y = df["Close"][5:].values.reshape(-1, 1)
y[:5]

array([[13.77123374],
       [13.7885407 ],
       [14.56510615],
       [14.46664599],
       [15.2493882 ]])

In [13]:
X.shape

(831, 3)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [15]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Choose learning rate
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    model = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=3,
        random_state=0)
    model.fit(X_train_scaled,y_train.ravel())
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        model.score(
            X_train_scaled,
            y_train.ravel())))
    print("Accuracy score (validation): {0:.3f}".format(
        model.score(
            X_test_scaled,
            y_test.ravel())))
    print()

Learning rate:  0.05
Accuracy score (training): 0.960
Accuracy score (validation): 0.915

Learning rate:  0.1
Accuracy score (training): 0.971
Accuracy score (validation): 0.914

Learning rate:  0.25
Accuracy score (training): 0.990
Accuracy score (validation): 0.897

Learning rate:  0.5
Accuracy score (training): 0.997
Accuracy score (validation): 0.894

Learning rate:  0.75
Accuracy score (training): 0.999
Accuracy score (validation): 0.878

Learning rate:  1
Accuracy score (training): 0.999
Accuracy score (validation): 0.885



In [9]:
# Create GradientBoostingClassifier model
model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=.25,
    max_features=5,
    max_depth=3,
    random_state=0)

# Fit the model
model.fit(X_train_scaled,y_train.ravel())

# Score the model
print("Accuracy score (training): {0:.3f}".format(
    model.score(
        X_train_scaled,
        y_train)))
print("Accuracy score (validation): {0:.3f}".format(
    model.score(
        X_test_scaled,
        y_test)))

ValueError: max_features must be in (0, n_features]

In [None]:
# Make predictions
predictions = model.predict(X_test_scaled)
y_test = y_test.ravel()
actual = pd.Series(y_test)


In [None]:
pd.DataFrame(predictions, actual).head()

In [None]:
X.tail()

In [None]:
X_future = X_scaler.transform(X.tail(5))
predictions = model.predict(X_future)

In [None]:
actual=futures['Close'].ravel()
dates=futures.reset_index()
dates=dates['date'].tail(5).ravel()
predictions.shape

In [None]:
pd.DataFrame({'Predicted':predictions,'Actual':actual, 'Date':dates,'Accuracy':(predictions/actual)})

In [None]:
predict_me=df.tail().drop('Close',axis=1)
real_future = X_scaler.transform(predict_me)
predictions = model.predict(real_future)

In [None]:
predictions

Based on my predictions today, (Nov 7th)- I should sell my stock tomorrow (Nov 8th)


In [None]:

datelist = pd.date_range(pd.datetime.today(), periods=5).tolist()
future_dates=[]
for date in datelist:
    future_dates.append(date.strftime("%m-%d-%Y"))

In [None]:
pd.DataFrame({'Date':future_dates,'Predictions':predictions})

In [None]:
fold_importance_df=pd.DataFrame()
fold_importance_df['Feature']=predict_me.columns
feature_importance_df=pd.DataFrame()
fold_importance_df["importance"]=model.feature_importances_[:len(df.tail(5).columns)]
feature_importance_df=pd.concat([feature_importance_df, fold_importance_df], axis=0)
cols = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:200].index)
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]

plt.figure(figsize=(10,10))
sns.barplot(x="importance", y="Feature", data=best_features.sort_values(by="importance",ascending=False))
plt.title('LightGBM Features (averaged over folds)')
plt.tight_layout()

plt.savefig('lgbm_importances.png')