In [1]:
# Imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import tensorflow as  tf
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error

  from ._conv import register_converters as _register_converters


In [2]:
financial_data = pd.read_csv("company_statistics.csv")
financial_data.head()

Unnamed: 0,Ticker,Name,Sector,Industry,IPO Year,Price,Market Cap,Trailing P/E,Forward P/E,PEG Ratio(5yr Expected),...,Operating Cash Flow(TTM),Levered Free Cash Flow(TTM),Beta(3Y Monthly),Shares Outstanding,Forward Annual Dividend Rate,Forward Annual Dividend Yield,Trailing Annual Dividend Rate,Trailing Annual Dividend Yield,5 Year Average Dividend Yield,Payout Ratio
0,PIH,"1347 Property Insurance Holdings, Inc.",Finance,Property-Casualty Insurers,2014.0,5.0,30304000.0,,4.38,,...,24790000.0,26710000.0,0.97,6010000.0,,,,,,0.0
1,TURN,180 Degree Capital Corp.,Finance,Finance/Investors Services,,1.95,61310000.0,61.56,,,...,4350000.0,-2310000.0,0.76,31120000.0,,,,,,0.0
2,FLWS,"1-800 FLOWERS.COM, Inc.",Consumer Services,Other Specialty Stores,1999.0,20.24,1311000000.0,38.69,34.63,4.1,...,90770000.0,42390000.0,1.12,35620000.0,,,,,,0.0
3,FCCY,1st Constitution Bancorp (NJ),Finance,Savings Institutions,,18.82,163947000.0,13.1,12.1,1.51,...,,,0.2,8630000.0,0.3,1.6,0.28,1.51,,18.21
4,SRCE,1st Source Corporation,Finance,Major Banks,,48.05,1233000000.0,14.64,12.81,1.34,...,142330000.0,,1.5,25670000.0,1.08,2.31,1.04,2.14,1.95,30.7


Lets form our X matrix, which will consist of company statistics

In [3]:
to_remove = ['Ticker', 'Name', 'Sector', 'Industry', 'IPO Year', 'Price']
cols = list(financial_data.columns)
for item in to_remove:
    cols.remove(item)
X = financial_data[cols].values
print(X)

[[ 3.03040e+07          nan  4.38000e+00 ...          nan          nan
   0.00000e+00]
 [ 6.13100e+07  6.15600e+01          nan ...          nan          nan
   0.00000e+00]
 [ 1.31100e+09  3.86900e+01  3.46300e+01 ...          nan          nan
   0.00000e+00]
 ...
 [ 2.31987e+08          nan          nan ...          nan          nan
   0.00000e+00]
 [ 1.18280e+10  8.39000e+00  1.80700e+01 ...          nan          nan
   0.00000e+00]
 [ 1.86427e+08          nan -2.11400e+01 ...          nan          nan
   0.00000e+00]]


Now lets form our Y matrix, which will consist of the company prices that the model is going to learn. 

In [4]:
Y = financial_data['Price'].values
print(Y)

[ 5.    1.95 20.24 ...  3.5  36.69  4.55]


Now lets train an XGBRegressor Model on the data

In [7]:
data_dmatrix = xgb.DMatrix(data=X,label=Y)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=123)

In [9]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [10]:
xg_reg.fit(X_train,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

Now lets see how we did on the test set

In [11]:
preds = xg_reg.predict(X_test)

In [12]:
rmse = np.sqrt(mean_squared_error(y_test, preds))

In [13]:
print("RMSE: %f" % (rmse))

RMSE: 47.579664


## Using XGBoost to Predict stock market prices given a companies summary statistics

In [17]:
from market import *

In [33]:
def predict_price(ticker):
    attributes = ['Market Cap (intraday)','Trailing P/E','Forward P/E','PEG Ratio (5 yr expected)','Price/Sales','Price/Book',
                  'Enterprise Value/Revenue','Enterprise Value/EBITDA','Profit Margin','Operating Margin',
                  'Return on Assets','Return on Equity','Revenue','Revenue Per Share',
                  'Quarterly Revenue Growth','Gross Profit','EBITDA','Diluted EPS',
                  'Quarterly Earnings Growth','Total Cash','Total Cash Per Share','Total Debt',
                  'Total Debt/Equity','Current Ratio','Book Value Per Share','Operating Cash Flow',
                  'Levered Free Cash Flow','Beta (3Y Monthly)','Shares Outstanding','Forward Annual Dividend Rate',
                  'Forward Annual Dividend Yield','Trailing Annual Dividend Rate','Trailing Annual Dividend Yield',
                  '5 Year Average Dividend Yield','Payout Ratio']
    stats = get_summary_statistics(ticker)
    x = []
    for a in attributes:
        x.append(str_to_num(stats[a]))
    price = xg_reg.predict([x])
    return price[0]

Lets predict the price for Sony, which is outside of our training set.

In [38]:
predict_price("SNE")

30.128168

## Using a Neural Net to Predict Prices 

In [None]:
(x_train, y_train, x_test, y_test) = train_test_split(financial_data, prices)
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(20, activation=tf.nn.relu),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(1)
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test)