In [1]:
# Multiple Linear Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression

In [2]:
# Automatic backwards elimination function (from Udemy course)

def backwardElimination(indep_vars, sl):
    regressor = LinearRegression()
    numVars = len(indep_vars[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, indep_vars).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                    indep_vars = np.delete(indep_vars, j, 1)
    return regressor_OLS, indep_vars

In [3]:
# Defining dataset, significance level, and relevant lists

dataset = pd.read_pickle('master_df.pkl')
sig_level = 0.05
list_of_metrics = list(dataset.columns[-5:])
sector_list = list(set(dataset.sector))

In [4]:
# Assumption: ESG scores can be used to predict financial performance


'''

Independent Variables:

    1) Total ESG Score
    2) Environment Score
    3) Social Score
    4) Governance Score

'''

# Set empty list for data to be appended to
list_of_regressions_1 = []

# For each sector and metric, run a MLR to determine the statistically significant predictors
for i in sector_list:
    sector_1 = i
    for w in list_of_metrics:
        metric_1 = w
        num_rows = len(dataset[dataset['sector'] == i])
        ones = np.ones((num_rows,))
        relevant_i_vars = dataset[dataset['sector'] == i].loc[:,['totalESG','environmentScore','socialScore','governanceScore']]
        X = pd.DataFrame(ones, index = dataset[dataset['sector'] == i].index)
        X = pd.concat([X,relevant_i_vars], axis = 1)
        X = np.array(X)
        y = dataset[dataset['sector'] == i][w].values
        regressor_data, most_sig_indep_vars = backwardElimination(X,sig_level)
        adj_r_squared = regressor_data.rsquared_adj
        p_values = np.array(regressor_data.pvalues).round(3)
        number_of_vars = most_sig_indep_vars.shape[1]
        list_of_indep_vars = []
        for c in range(number_of_vars):
            a = most_sig_indep_vars[:,c]
            for z in dataset[dataset['sector'] == i].loc[:,['totalESG','environmentScore','socialScore','governanceScore']]:
                b = np.array(dataset[dataset['sector'] == i].loc[:,z])
                if (a==b).all():
                    list_of_indep_vars.append(str(z))
                else:
                    continue
                    
# Create a one-time list to be appended to the overall list
        result_data_1 = [sector_1,metric_1,list_of_indep_vars,p_values,adj_r_squared]
        list_of_regressions_1.append(result_data_1)



In [5]:
# Convert all regression output into a data frame
df_regression_1 = pd.DataFrame(list_of_regressions_1)

# Format and save the data frame for the first regression
df_regression_1.columns = ['sector', 'metric','sig_indep_vars','p_values','adj_r_squared']
pd.set_option('display.float_format', '{:.2%}'.format)
pd.set_option('display.max_colwidth', -1)
df_regression_1 = df_regression_1.sort_values(by=['sector','metric'], ascending=True)
# df_regression_1 = df_regression_1.sort_values(by= 'adj_r_squared', ascending=False)
for i in range(len(df_regression_1['p_values'])):
    if len(df_regression_1['p_values'][i]) > len(df_regression_1['sig_indep_vars'][i]):
        df_regression_1['p_values'][i] = df_regression_1['p_values'][i][1:]
for i in range(len(df_regression_1['sig_indep_vars'])):
    if len(df_regression_1['sig_indep_vars'][i]) == 0:
        df_regression_1['sig_indep_vars'][i] = 'constant_value'
#         df_regression_1['p_values'][i] = np.nan
df_regression_1['p_values'] = df_regression_1['p_values'].apply(np.asarray)
df_regression_1.to_pickle('df_regression_1.pkl')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [6]:
# Assumption: Relative ESG ranking vs. industry peers can be used to predict the relative financial performance

'''

Independent Variables:

    % Difference in firm's (vs. industry peers):
        1) Environment Score
        2) Social Score
        3) Governance Score

'''

# Set empty list for data to be appended to

list_of_regressions_2 = []

# For each sector and metric, run a MLR to determine the statistically significant predictors
for i in sector_list:
    sector_2 = i
    for w in list_of_metrics:
        metric_2 = w
        sector_avg = dataset[dataset['sector'] == i][w].mean()
        e_diff = dataset[dataset['sector'] == i]['environmentScore'] / dataset[dataset['sector'] == i]['peerEnvironment'] - 1
        s_diff = dataset[dataset['sector'] == i]['socialScore'] / dataset[dataset['sector'] == i]['peerSocial'] - 1
        g_diff = dataset[dataset['sector'] == i]['governanceScore'] / dataset[dataset['sector'] == i]['peerGovernance'] - 1
        num_rows = len(dataset[dataset['sector'] == i])
        ones = np.ones((num_rows,))
        X = pd.DataFrame(ones, index = dataset[dataset['sector'] == i].index)
        X = pd.concat([X,e_diff,s_diff,g_diff], axis = 1)
        X.columns = range(4)
        X = np.array(X)
        y = np.array((dataset[dataset['sector'] == i][w] / np.full((num_rows,),sector_avg)) - 1)
        regressor_data, most_sig_indep_vars = backwardElimination(X,sig_level)
        adj_r_squared = regressor_data.rsquared_adj
        p_values = np.array(regressor_data.pvalues).round(3)
        number_of_vars = most_sig_indep_vars.shape[1]
        list_of_indep_vars = []
        for c in range(number_of_vars):
            a = most_sig_indep_vars[:,c]
            for z in [e_diff,s_diff,g_diff]:
                b = np.array(z)
                if (a==b).all():
                    if(b==np.array(e_diff)).all():      
                        list_of_indep_vars.append('e_diff')
                    elif (b==np.array(s_diff)).all(): 
                        list_of_indep_vars.append('s_diff')
                    elif (b==np.array(g_diff)).all():
                        list_of_indep_vars.append('g_diff')

# Create a one-time list to be appended to the overall list
        result_data_2 = [sector_2,metric_2,list_of_indep_vars,p_values,adj_r_squared]
        list_of_regressions_2.append(result_data_2)



In [7]:
# Convert all regression output into a data frame
df_regression_2 = pd.DataFrame(list_of_regressions_2)

# Format and save the data frame for the second regression
df_regression_2.columns = ['sector', 'metric','sig_indep_vars','p_values','adj_r_squared']
pd.set_option('display.float_format', '{:.2%}'.format)
pd.set_option('display.max_colwidth', -1)
df_regression_2 = df_regression_2.sort_values(by=['sector','metric'], ascending=True)
# df_regression_2 = df_regression_2.sort_values(by= 'adj_r_squared', ascending=False)
for i in range(len(df_regression_2['p_values'])):
    if len(df_regression_2['p_values'][i]) > len(df_regression_2['sig_indep_vars'][i]):
        df_regression_2['p_values'][i] = df_regression_2['p_values'][i][1:]
for i in range(len(df_regression_2['sig_indep_vars'])):
    if len(df_regression_2['sig_indep_vars'][i]) == 0:
        df_regression_2['sig_indep_vars'][i] = 'constant_value'
#         df_regression_2['p_values'][i] = np.nan
df_regression_2['p_values'] = df_regression_2['p_values'].apply(np.asarray)
df_regression_2.to_pickle('df_regression_2.pkl')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
