# Modeling

In [9]:
import pandas as pd
import numpy as np
import pypfopt
#
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [10]:
#df_monthly = pd.read_csv('../../data/df_monthly.csv', index_col=0)
df = pd.read_csv('../../data/df_overview.csv', index_col=0) 
df

Unnamed: 0,company_name,company_esg_score,company_esg_score_group,stock_exchange,stock_ticker_symbol,market_capital,market_capital_euro,trailing_pe,beta,return_on_equity,...,industry_Semiconductors,industry_Software & Services,industry_Technology Hardware,industry_Telecommunication Services,industry_Textiles & Apparel,industry_Traders & Distributors,industry_Transportation,industry_Transportation Infrastructure,industry_Utilities,stock_ticker_label
0,RS Group Plc,4.5,Negligible ESG Risk,LON,RS1.L,3.668791e+09,4.365861e+09,19.858974,0.863,0.13226,...,0,0,1,0,0,0,0,0,0,1377
1,"Kimball Electronics, Inc.",4.5,Negligible ESG Risk,NAS,KE,4.281351e+08,3.938843e+08,21.370369,1.258,0.03854,...,0,0,1,0,0,0,0,0,0,1021
2,TAG Immobilien AG,4.6,Negligible ESG Risk,ETR,TEG.DE,2.765612e+09,2.765612e+09,,1.037,-0.03766,...,0,0,0,0,0,0,0,0,0,1493
3,LEG Immobilien SE,5.1,Negligible ESG Risk,ETR,LEG.DE,6.927171e+09,6.927171e+09,,0.957,-0.08126,...,0,0,0,0,0,0,0,0,0,1060
4,"Steelcase, Inc.",5.3,Negligible ESG Risk,NYS,SCS,1.492860e+09,1.373431e+09,17.210526,1.314,0.10633,...,0,0,0,0,0,0,0,0,0,1406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1657,"KVH Industries, Inc. (Delaware)",20.0,Low ESG Risk,NAS,KVHI,9.127629e+07,9.127629e+07,,0.594,-0.14536,...,0,0,1,0,0,0,0,0,0,1044
1658,Moonpig Group Plc,20.0,Low ESG Risk,LON,MOON.L,7.018797e+08,7.018797e+08,2035.000000,1.238,,...,0,0,0,0,0,0,0,0,0,1135
1659,"NeoGenomics, Inc.",20.0,Low ESG Risk,NAS,NEO,1.940312e+09,1.940312e+09,,1.191,-0.08391,...,0,0,0,0,0,0,0,0,0,1170
1660,"Japan Material Co., Ltd.",20.0,Low ESG Risk,TKS,6055.T,1.704423e+11,1.704423e+11,27.933996,0.585,0.13585,...,1,0,0,0,0,0,0,0,0,141


### Define Source and Target

Columns:
1. Static fields:
    * Industry
    * Market capital
    * Trailing P/E ratio
    * Beta
    * Return on equity
2. Time series data:
    * Market returns

In [11]:
df_columns = []
# Stock ticket label
df_columns.append('stock_ticket_label')
# Industry
df_columns = [s for s in df.columns.to_list() if "industry_" in s]
#
df_columns.append('company_esg_score')
#
df_columns.append('market_capital_euro')
df_columns.append('trailing_pe')
df_columns.append('beta')
df_columns.append('return_on_equity')

In [12]:
df[df_columns]

Unnamed: 0,industry_Aerospace & Defense,industry_Auto Components,industry_Automobiles,industry_Banks,industry_Building Products,industry_Chemicals,industry_Commercial Services,industry_Construction Materials,industry_Consumer Durables,industry_Consumer Services,...,industry_Textiles & Apparel,industry_Traders & Distributors,industry_Transportation,industry_Transportation Infrastructure,industry_Utilities,company_esg_score,market_capital_euro,trailing_pe,beta,return_on_equity
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4.5,4.365861e+09,19.858974,0.863,0.13226
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4.5,3.938843e+08,21.370369,1.258,0.03854
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4.6,2.765612e+09,,1.037,-0.03766
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,5.1,6.927171e+09,,0.957,-0.08126
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,5.3,1.373431e+09,17.210526,1.314,0.10633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1657,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,20.0,9.127629e+07,,0.594,-0.14536
1658,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,20.0,7.018797e+08,2035.000000,1.238,
1659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,20.0,1.940312e+09,,1.191,-0.08391
1660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,20.0,1.704423e+11,27.933996,0.585,0.13585


In [13]:

X = df[df_columns]
y = df[['return_rate_5y_avg']]

### Split the data into training and testing sets

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train.head(5)

Unnamed: 0,industry_Aerospace & Defense,industry_Auto Components,industry_Automobiles,industry_Banks,industry_Building Products,industry_Chemicals,industry_Commercial Services,industry_Construction Materials,industry_Consumer Durables,industry_Consumer Services,...,industry_Textiles & Apparel,industry_Traders & Distributors,industry_Transportation,industry_Transportation Infrastructure,industry_Utilities,company_esg_score,market_capital_euro,trailing_pe,beta,return_on_equity
566,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,14.6,4014910000.0,17.044916,1.224,0.69402
266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,12.0,6330577000.0,19.405405,0.676,0.07415
148,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,10.2,315876200.0,,2.052,
1595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,19.7,368864900.0,,0.781,
1612,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,19.7,3200679000.0,2353.7312,1.427,0.28742


In [16]:
X_train

Unnamed: 0,industry_Aerospace & Defense,industry_Auto Components,industry_Automobiles,industry_Banks,industry_Building Products,industry_Chemicals,industry_Commercial Services,industry_Construction Materials,industry_Consumer Durables,industry_Consumer Services,...,industry_Textiles & Apparel,industry_Traders & Distributors,industry_Transportation,industry_Transportation Infrastructure,industry_Utilities,company_esg_score,market_capital_euro,trailing_pe,beta,return_on_equity
566,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,14.6,4.014910e+09,17.044916,1.224,0.69402
266,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,12.0,6.330577e+09,19.405405,0.676,0.07415
148,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,10.2,3.158762e+08,,2.052,
1595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,19.7,3.688649e+08,,0.781,
1612,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,19.7,3.200679e+09,2353.731200,1.427,0.28742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,17.6,2.425311e+08,,0.487,-0.22292
1294,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,18.3,1.361177e+10,13.233306,1.403,0.19860
860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,16.2,1.045330e+09,10.801234,1.882,0.23582
1459,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,19.0,2.339876e+09,7.588889,0.979,0.11874


In [17]:
# Initialize and train the DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = regressor.predict(X_test)

# Evaluate the model performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE) on Test Set: {mse:.2f}")

ValueError: Input y contains NaN.

### Plotting

In [None]:
plt.scatter(X_test, y_test, color="blue", label="Actual")
plt.scatter(X_test, y_pred, color="red", label="Predicted")
plt.title("DecisionTreeRegressor: Actual vs Predicted")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()
plt.show()