## The purpose of this notebook is to evalaate whether LinearRegression is a suitable approach to understand and predict market caps of corporations, based on a large number of features and a dataset of corporation financial information

## NOTE: All the values represented are in millions

In [56]:
# Standard Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from statistics import mean
%matplotlib inline
path = './Private/Data/MasterDataset.csv'
conversion_factor = 1000000

In [31]:
master_frame = pd.read_csv(path, sep='\t')
master_frame.drop(['Unnamed: 0'],axis=1,inplace=True)
master_frame.dropna(inplace=True)

In [32]:
y_values = master_frame['current_market_cap_usd']
master_frame.drop(['current_ebit'],axis=1, inplace=True)
master_frame.drop(['current_market_cap_usd'],axis=1, inplace=True)
master_frame.drop(['quote_symbol'],axis=1, inplace=True)
master_frame.drop(['sedol'],axis=1, inplace=True)
master_frame.drop(['country'],axis=1, inplace=True)
master_frame.drop(['exchange'],axis=1, inplace=True)
master_frame.drop(['primary_sic_code'],axis=1, inplace=True)
master_frame.drop(['current_price_close'],axis=1, inplace=True)

## Feature List post dropping irrelevant columns:

In [33]:
for column in master_frame.columns:
    print(column)

current_sales
current_ebitda
current_net_income
current_total_assets
current_total_liabilities
current_pe_ratio
actual_eps
current_price_/_cash
current_price_/_sales
dividend_yield


In [34]:
# Performing a 60-30 train test split
x_train, x_test, y_train, y_test = train_test_split(master_frame,y_values,test_size=0.3,random_state=101)

In [35]:
reg = LinearRegression().fit(x_train, y_train)

In [36]:
#Score with exact training values
reg.score(x_train,y_train)

0.791159401554393

## Experimenting with some existing metrics to score the current model

In [37]:
y_predictions = reg.predict(x_test)
y_test = y_test.values

In [38]:
# Absolute scoring
abs_percentage = reg.score(x_test,y_test)
print("Raw Accuracy = {}%".format(abs_percentage*100))

Raw Accuracy = 56.64264353001825%


In [39]:
#Scoring using a root mean squared error
rmse = sqrt(mean_squared_error(y_test,y_predictions))
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 34591.0932950143


## Trying to narrow the features down to a more concise set

In [40]:
master_frame['ebitda_to_sales'] = master_frame['current_ebitda']/master_frame['current_sales']
master_frame.drop(['current_ebitda'],axis=1, inplace=True)
master_frame.drop(['current_sales'],axis=1, inplace=True)

master_frame.drop(['current_net_income'],axis=1, inplace=True)
master_frame['net_assets'] = master_frame['current_total_assets'] - master_frame['current_total_liabilities']
#master_frame.drop(['current_total_assets'],axis=1, inplace=True)
#master_frame.drop(['current_total_liabilities'],axis=1, inplace=True)
#master_frame.drop(['current_price_/_sales'],axis=1, inplace=True)

In [41]:
x_train, x_test, y_train, y_test = train_test_split(master_frame,y_values,test_size=0.3,random_state=101)

In [42]:
reg = LinearRegression().fit(x_train, y_train)
reg.score(x_train,y_train)

0.5613198689496157

In [43]:
## Experimenting with some existing metrics to score the current model

y_predictions = reg.predict(x_test)
y_test = y_test.values

# Absolute scoring
abs_percentage = reg.score(x_test,y_test)
print("Raw Accuracy = {}%".format(abs_percentage*100))

#Scoring using a root mean squared error
rmse = sqrt(mean_squared_error(y_test,y_predictions))
print("Root Mean Squared Error: {}".format(rmse))

Raw Accuracy = 43.604136823362104%
Root Mean Squared Error: 39450.863657906295


## Narrowing down features doesn't make a substantial impact to the model

## Resetting the dataframe

In [44]:
master_frame = pd.read_csv(path, sep='\t')
master_frame.drop(['Unnamed: 0'],axis=1,inplace=True)
master_frame.dropna(inplace=True)

y_values = master_frame['current_market_cap_usd']
master_frame.drop(['current_ebit'],axis=1, inplace=True)
master_frame.drop(['current_market_cap_usd'],axis=1, inplace=True)
master_frame.drop(['quote_symbol'],axis=1, inplace=True)
master_frame.drop(['sedol'],axis=1, inplace=True)
master_frame.drop(['country'],axis=1, inplace=True)
master_frame.drop(['exchange'],axis=1, inplace=True)
master_frame.drop(['primary_sic_code'],axis=1, inplace=True)
master_frame.drop(['current_price_close'],axis=1, inplace=True)

In [45]:
# Performing a 60-30 train test split
x_train, x_test, y_train, y_test = train_test_split(master_frame,y_values,test_size=0.3,random_state=101)

In [46]:
reg = LinearRegression().fit(x_train, y_train)

#Score with exact training values
reg.score(x_train,y_train)

0.791159401554393

In [47]:
y_predictions = reg.predict(x_test)
y_test = y_test.values

## Using a modified RMSE function with a percentile acceptable bound, in order to better understand our outputs

In [49]:
bound = 0.1
def calc_bounded_error(y_pred, y_real):
    lower = y_real*(1-bound)
    upper = y_real*(1+bound)
    difference_low = (y_pred-lower)**2
    difference_up = (y_pred-upper)**2
    if difference_up > difference_low:
        return difference_low
    return difference_up

In [59]:
def modified_rmse(y_predictions, y_tests):
    my_list = []
    for i in range(0,len(y_predictions)):
        my_list.append(calc_bounded_error(y_predictions[i],y_tests[i]))
    mean_val = mean(my_list)
    modified_rmse = sqrt(mean_val)
    return modified_rmse

In [60]:
modified_rmse(y_predictions, y_test)

29643.770782869367

## TODO:
### Extract more data, in order to improve the regression model
### Explore the usage of a DNNRegressor in order to check whether that produces any better results