## The purpose of this notebook is to evaluate whether DNNRegression is a suitable approach for understanding and predicting the market caps of corporations, based on a large number of features and a dataset of corporation financial information

## NOTE: All the values represented are in millions

In [None]:
# Standard Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from math import sqrt
from statistics import mean
%matplotlib inline
# Location of the dataset file that pd.read_csv loads in the next cell.
path = './Private/Data/MasterDataset.csv'
# One million. Not referenced anywhere else in the visible notebook --
# presumably tied to the "values are in millions" note above; TODO confirm.
conversion_factor = 1000000

In [None]:
# Read the tab-separated dataset, discard the 'Unnamed: 0' column, then
# remove every row that still contains a missing value.
master_frame = (
    pd.read_csv(path, sep='\t')
    .drop(['Unnamed: 0'], axis=1)
    .dropna()
)

In [None]:
# Set the regression target aside before it is removed from the feature frame.
y_values = master_frame['current_market_cap_usd']

# Remove the target together with the columns that are not used as inputs.
master_frame = master_frame.drop(
    [
        'current_ebit',
        'current_market_cap_usd',
        'quote_symbol',
        'sedol',
        'country',
        'exchange',
        'primary_sic_code',
        'current_price_close',
    ],
    axis=1,
)

In [None]:
def standardise(frame):
    """Return a new DataFrame holding ``frame`` rescaled by ``StandardScaler``.

    A fresh scaler is fitted on ``frame.values`` and the transformed array is
    wrapped in a DataFrame that reuses the index and column labels of
    ``frame``. The input frame itself is not modified.
    """
    scaler = StandardScaler()
    return pd.DataFrame(
        scaler.fit_transform(frame.values),
        index=frame.index,
        columns=frame.columns,
    )

In [None]:
#master_frame = standardise(master_frame) #Commenting out the standardise function

## Feature List post dropping irrelevant columns:

In [None]:
# Hold out 30% of the rows for testing (70-30 split); random_state is fixed
# so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    master_frame,
    y_values,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Print the name of every column still present in the frame, one per line.
for feature_name in master_frame.columns:
    print(feature_name)

In [None]:
# One numeric feature column per model input, in the order the model receives them.
features = [
    tf.feature_column.numeric_column(key)
    for key in (
        "current_sales",
        "current_ebitda",
        "current_net_income",
        "current_total_assets",
        "current_total_liabilities",
        "current_pe_ratio",
        "actual_eps",
        "current_price_/_cash",
        "current_price_/_sales",
        "dividend_yield",
    )
]

# Individual handles to the same column objects, matching the order above.
(sales, ebitda, net_income, assets, liabilities,
 pe, eps, price_cash, price_sales, dividend) = features

In [None]:
# Training length in steps. The input function below cycles through the data
# without an epoch limit (num_epochs=None), so this value is what ends training.
n_steps = 100000

# Shuffled mini-batches of 10 rows drawn from the training split.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=x_train,
    y=y_train,
    batch_size=10,
    num_epochs=None,
    shuffle=True,
)

# Regressor with two hidden layers: 6 units followed by 10 units.
model_reg = tf.estimator.DNNRegressor(
    feature_columns=features,
    hidden_units=[6, 10],
)

model_reg.train(input_fn=input_func, steps=n_steps)

In [None]:
# Score with exact training values: feed the whole training split through the
# model in a single unshuffled batch so predictions stay in row order.
prediction_func = tf.estimator.inputs.pandas_input_fn(
    x=x_train,
    batch_size=len(x_train),
    shuffle=False,
)
true_predictions = list(model_reg.predict(input_fn=prediction_func))

# Each result is a dict; keep the first entry of its 'predictions' value.
prediction_list = [output['predictions'][0] for output in true_predictions]

## Using a modified RMSE function with a percentage-based acceptable bound (±10% of the true value), in order to better understand our outputs

In [None]:
# Default half-width of the acceptable band, as a fraction of the true value.
bound = 0.1


def calc_bounded_error(y_pred, y_real, tolerance=None):
    """Squared error of ``y_pred`` relative to a tolerance band around ``y_real``.

    The band runs from ``y_real * (1 - tolerance)`` to
    ``y_real * (1 + tolerance)``. A prediction that falls inside the band
    (edges included) is treated as acceptable and scores 0; a prediction
    outside it scores the squared distance to the nearest band edge.

    Parameters
    ----------
    y_pred : number
        Predicted value.
    y_real : number
        True value the band is centred on.
    tolerance : number, optional
        Half-width of the band as a fraction of ``y_real``. When omitted, the
        module-level ``bound`` is read at call time.

    Returns
    -------
    number
        0 when the prediction is inside the band, otherwise the squared
        distance from the prediction to the nearest edge.
    """
    if tolerance is None:
        tolerance = bound

    edge_a = y_real * (1 - tolerance)
    edge_b = y_real * (1 + tolerance)
    # Order the edges explicitly: for a negative y_real the two products swap.
    lower, upper = min(edge_a, edge_b), max(edge_a, edge_b)

    if y_pred < lower:
        return (lower - y_pred) ** 2
    if y_pred > upper:
        return (y_pred - upper) ** 2
    # Inside the acceptable band: no error is charged.
    return 0

In [None]:
def modified_rmse(y_predictions, y_tests):
    """Root of the mean bounded squared error over paired predictions and targets.

    Each prediction is paired with the target at the same position and scored
    with ``calc_bounded_error``; the result is the square root of the mean of
    those scores.

    Pairing is positional (via ``zip``), so ``y_tests`` may be a list, a NumPy
    array, or a pandas Series whose index is not 0..n-1; no label lookup is
    performed.

    Parameters
    ----------
    y_predictions : sequence of numbers
        Model outputs.
    y_tests : sequence of numbers
        True values, in the same order as ``y_predictions``.

    Returns
    -------
    float
        The modified RMSE.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    statistics.StatisticsError
        If the inputs are empty (raised by ``statistics.mean``).
    """
    if len(y_predictions) != len(y_tests):
        raise ValueError(
            "y_predictions and y_tests must have the same length "
            "(got %d and %d)" % (len(y_predictions), len(y_tests))
        )

    squared_errors = [
        calc_bounded_error(predicted, actual)
        for predicted, actual in zip(y_predictions, y_tests)
    ]
    return sqrt(mean(squared_errors))

## Scoring the training data

In [None]:
# Convert the training targets to a plain NumPy array so they can be indexed
# by position. np.asarray also accepts an array, so re-running this cell is safe.
y_train = np.asarray(y_train)

In [None]:
# Modified RMSE of the training-set predictions against the training targets.
modified_rmse(y_predictions=prediction_list, y_tests=y_train)

## Scoring the testing data

In [None]:
# Score with test values: feed the whole test split through the model in a
# single unshuffled batch so predictions stay in row order.
prediction_func = tf.estimator.inputs.pandas_input_fn(
    x=x_test,
    batch_size=len(x_test),
    shuffle=False,
)
true_predictions = list(model_reg.predict(input_fn=prediction_func))

# Each result is a dict; keep the first entry of its 'predictions' value.
prediction_list = [output['predictions'][0] for output in true_predictions]

In [None]:
# Convert the test targets to a plain NumPy array so they can be indexed by
# position. np.asarray also accepts an array, so re-running this cell is safe.
y_test = np.asarray(y_test)

In [None]:
# Modified RMSE of the test-set predictions against the test targets.
modified_rmse(y_predictions=prediction_list, y_tests=y_test)

## Conclusion: Using a DNNRegressor does not create a marked difference in RMSE value
## Standardising features actually results in a drop in accuracy