In [1]:
# 데이터 핸들링을 위한 패키지
import numpy as np
import pandas as pd

# 통계 학습을 위한 패키지
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

# 기계 학습을 위한 패키지
import sklearn.linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 시각화를 위한 패키지
from matplotlib import pyplot as plt
import seaborn as sns

# 그래프를 실제로 그리기 위한 설정
%matplotlib inline

# Suppress all warning messages for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries imported above — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the regression input data.
# NOTE(review): absolute, machine-specific path — the notebook only runs on a
# machine where this folder exists; consider a configurable data directory.
path = '/Users/parkjunhyeong/Desktop/박준형/02. 대내 및 대외활동/01. 대내활동/03. Biz&AI 랩/02. 소스/01. 데이터/06. 회귀분석 데이터'

# Read the composed dataset (first CSV column becomes the index), discard
# every row that has a missing value, and renumber the remaining rows from 0.
news_df = (
    pd.read_csv(f'{path}/Composed_Data_ver1.4.csv', index_col=0)
    .dropna()
    .reset_index(drop=True)
)

# Display the available column names.
news_df.columns

Index(['URL', 'Publish Date', 'Title', 'Main Text', 'Ticker', 'Description',
       'News Type', 'Sentiment', 'Investor Reaction', 'GICS Sectors',
       'Firm Age', 'Twitter Followers', 'Total Asset', 'ROA',
       'Positive Consumer Perception', 'Negative Consumer Perception',
       'clean_text', 'truncated_text', 'ESG Bert Score', 'ESG Bert'],
      dtype='object')

In [3]:
# Sector whose dummy column is dropped later on, i.e. the baseline category.
sector = "Industrials"

# Columns kept for the regressions, in the order they appear in the frame.
analysis_columns = [
    'News Type', 'ESG Bert', 'Sentiment', 'GICS Sectors',
    'Firm Age', 'Twitter Followers', 'Total Asset', 'ROA',
    'Investor Reaction', 'Positive Consumer Perception', 'Negative Consumer Perception',
]

# Keep only those columns, discard rows whose Sentiment is "Neutral",
# and renumber the remaining rows from 0.
is_not_neutral = news_df['Sentiment'] != "Neutral"
news_df = news_df.loc[is_not_neutral, analysis_columns].reset_index(drop=True)

In [4]:
# Print the row count, per-column non-null counts and dtypes of news_df.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3451 entries, 0 to 3450
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   News Type                     3451 non-null   object 
 1   ESG Bert                      3451 non-null   object 
 2   Sentiment                     3451 non-null   object 
 3   GICS Sectors                  3451 non-null   object 
 4   Firm Age                      3451 non-null   int64  
 5   Twitter Followers             3451 non-null   int64  
 6   Total Asset                   3451 non-null   float64
 7   ROA                           3451 non-null   float64
 8   Investor Reaction             3451 non-null   float64
 9   Positive Consumer Perception  3451 non-null   float64
 10  Negative Consumer Perception  3451 non-null   float64
dtypes: float64(5), int64(2), object(4)
memory usage: 296.7+ KB


## 1. 변수들 표준화하기

In [5]:
def _standardize_column(values):
    """Standardize a single column with its own ``StandardScaler``.

    Parameters
    ----------
    values : array-like
        One-dimensional data, e.g. a DataFrame column.

    Returns
    -------
    tuple
        ``(column_vector, scaler, scaled)``: a copy of the input reshaped to
        ``(-1, 1)``, the ``StandardScaler`` fitted on that copy, and the
        transformed array (same shape as ``column_vector``).
    """
    column_vector = np.array(values).reshape(-1, 1)
    scaler = StandardScaler()
    # fit_transform is equivalent to fit() followed by transform() on the same data.
    scaled = scaler.fit_transform(column_vector)
    return column_vector, scaler, scaled


# The raw values (*_array) and the fitted scalers (scaler1..scaler4) keep the
# names they had before; each DataFrame column is overwritten with its
# standardized values.

# Standardize ROA
ROA_array, scaler1, news_df['ROA'] = _standardize_column(news_df['ROA'])

# Standardize Total Asset
TA_array, scaler2, news_df['Total Asset'] = _standardize_column(news_df['Total Asset'])

# Standardize Twitter Followers
Twitter_array, scaler3, news_df['Twitter Followers'] = _standardize_column(news_df['Twitter Followers'])

# Standardize Firm Age
FA_array, scaler4, news_df['Firm Age'] = _standardize_column(news_df['Firm Age'])

## 2. 필요한 칼럼들 세팅하기

In [6]:
# Four dummy-coded variants were considered; only news_df4 is active.
#   news_df1: news sentiment positive, consumer perception positive
#   news_df2: news sentiment positive, consumer perception negative
#   news_df3: news sentiment negative, consumer perception positive
#   news_df4: news sentiment negative, consumer perception negative
# news_df1 = pd.get_dummies(news_df, columns = ['Sentiment', 'GICS Sectors'])
# news_df1.drop(['Sentiment_Negative', 'Negative Consumer Perception',
#               'GICS Sectors_' + sector], axis = 1, inplace = True)

# news_df2 = pd.get_dummies(news_df, columns = ['Sentiment', 'GICS Sectors'])
# news_df2.drop(['Sentiment_Negative', 'Positive Consumer Perception',
#               'GICS Sectors_' + sector], axis = 1, inplace = True)

# news_df3 = pd.get_dummies(news_df, columns = ['Sentiment', 'GICS Sectors'])
# news_df3.drop(['Sentiment_Positive', 'Negative Consumer Perception',
#               'GICS Sectors_' + sector], axis = 1, inplace = True)

# One-hot encode Sentiment and GICS Sectors. dtype=float keeps the dummy
# columns numeric: recent pandas versions default to bool dummies, and a frame
# mixing bool and float columns converts to an object array, which
# statsmodels' OLS rejects.
news_df4 = pd.get_dummies(news_df, columns=['Sentiment', 'GICS Sectors'], dtype=float)

# Drop the positive-sentiment dummy, the positive-perception outcome and the
# baseline sector dummy so that Sentiment_Negative, 'Negative Consumer
# Perception' and the non-baseline sector dummies remain.
news_df4 = news_df4.drop(columns=['Sentiment_Positive',
                                  'Positive Consumer Perception',
                                  'GICS Sectors_' + sector])

## 3. 뉴스 타입별 회귀분석 실시하기

## 1) GICS Sector들 중에서 Industrials를 baseline으로 
## 2) 뉴스 Sentiment와 Consumer Perception 모두 부정을 기준으로 계산한 경우 중
## 주목해봐야 할 경우들의 회귀 분석 결과

In [7]:
# List the distinct values of the 'ESG Bert' column.
news_df['ESG Bert'].unique()

array(['Data_Security', 'Employee_Engagement_Inclusion_And_Diversity',
       'Product_Design_And_Lifecycle_Management',
       'Systemic_Risk_Management', 'Business_Ethics',
       'Management_Of_Legal_And_Regulatory_Framework', 'Labor_Practices',
       'Employee_Health_And_Safety', 'Competitive_Behavior',
       'Customer_Privacy', 'Critical_Incident_Risk_Management',
       'Physical_Impacts_Of_Climate_Change', 'GHG_Emissions',
       'Business_Model_Resilience', 'Director_Removal',
       'Energy_Management', 'Air_Quality', 'Customer_Welfare',
       'Product_Quality_And_Safety', 'Access_And_Affordability',
       'Waste_And_Hazardous_Materials_Management',
       'Selling_Practices_And_Product_Labeling',
       'Water_And_Wastewater_Management',
       'Human_Rights_And_Community_Relations', 'Supply_Chain_Management',
       'Ecological_Impacts'], dtype=object)

In [8]:
# 'ESG Bert' labels grouped by the five dimensions used for subsamples 8-12.
ENVIRONMENT_TOPICS = [
    'GHG_Emissions',
    'Air_Quality',
    'Energy_Management',
    'Ecological_Impacts',
    'Water_And_Wastewater_Management',
    'Waste_And_Hazardous_Materials_Management',
]
SOCIAL_CAPITAL_TOPICS = [
    'Customer_Privacy',
    'Data_Security',
    'Access_And_Affordability',
    'Product_Quality_And_Safety',
    'Customer_Welfare',
    'Selling_Practices_And_Product_Labeling',
    'Human_Rights_And_Community_Relations',
]
HUMAN_CAPITAL_TOPICS = [
    'Labor_Practices',
    'Employee_Health_And_Safety',
    'Employee_Engagement_Inclusion_And_Diversity',
]
LEADERSHIP_GOVERNANCE_TOPICS = [
    'Business_Ethics',
    'Competitive_Behavior',
    'Systemic_Risk_Management',
    'Critical_Incident_Risk_Management',
    'Management_Of_Legal_And_Regulatory_Framework',
]
BUSINESS_MODEL_INNOVATION_TOPICS = [
    'Supply_Chain_Management',
    'Business_Model_Resilience',
    'Physical_Impacts_Of_Climate_Change',
    'Director_Removal',
    'Product_Design_And_Lifecycle_Management',
]


def _esg_news_with_topics(frame, topics):
    """Return the rows of ``frame`` that are ESG news with a label in ``topics``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'News Type' and 'ESG Bert'.
    topics : list of str
        'ESG Bert' labels to keep.

    Returns
    -------
    pandas.DataFrame
        Rows where 'News Type' equals 'ESG' and 'ESG Bert' is one of
        ``topics``, in their original order.
    """
    return frame[(frame['News Type'] == 'ESG') & frame['ESG Bert'].isin(topics)]


news = news_df4

# 1. All news (same object as `news`, not a copy)
df_reg1 = news
# 2. ESG news
df_reg2 = news[news['News Type'] == 'ESG']
# 3. Non-ESG news (also includes 'Financial Performance')
df_reg3 = news[news['News Type'].isin(['Non-ESG', 'Financial Performance'])]
# 4. Financial news
df_reg4 = news[news['News Type'] == 'Financial Performance']

# 5. E news
df_reg5 = _esg_news_with_topics(news, ENVIRONMENT_TOPICS)
# 6. S news = Social Capital + Human Capital topics
df_reg6 = _esg_news_with_topics(news, SOCIAL_CAPITAL_TOPICS + HUMAN_CAPITAL_TOPICS)
# 7. G news = Leadership & Governance + Business Model & Innovation topics
df_reg7 = _esg_news_with_topics(
    news, LEADERSHIP_GOVERNANCE_TOPICS + BUSINESS_MODEL_INNOVATION_TOPICS
)

# 8. Environment news (same selection as 5)
df_reg8 = _esg_news_with_topics(news, ENVIRONMENT_TOPICS)
# 9. Social Capital news
df_reg9 = _esg_news_with_topics(news, SOCIAL_CAPITAL_TOPICS)
# 10. Human Capital news
df_reg10 = _esg_news_with_topics(news, HUMAN_CAPITAL_TOPICS)
# 11. Leadership & Governance news
df_reg11 = _esg_news_with_topics(news, LEADERSHIP_GOVERNANCE_TOPICS)
# 12. Business Model & Innovation news
df_reg12 = _esg_news_with_topics(news, BUSINESS_MODEL_INNOVATION_TOPICS)

# df_reg8 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'GHG_Emissions')]
# df_reg9 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Air_Quality')]
# df_reg10 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Energy_Management')]
# df_reg11 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Ecological_Impacts')]
# df_reg12 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Water_And_Wastewater_Management')]
# df_reg13 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Waste_And_Hazardous_Materials_Management')]
# df_reg14 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Customer_Privacy')]
# df_reg15 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Data_Security')]
# df_reg16 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Access_And_Affordability')]
# df_reg17 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Product_Quality_And_Safety')]
# df_reg18 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Customer_Welfare')]
# df_reg19 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Selling_Practices_And_Product_Labeling')]
# df_reg20 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Human_Rights_And_Community_Relations')]
# df_reg21 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Labor_Practices')]
# df_reg22 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Employee_Health_And_Safety')]
# df_reg23 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Employee_Engagement_Inclusion_And_Diversity')]
# df_reg24 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Business_Ethics')]
# df_reg25 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Competitive_Behavior')]
# df_reg26 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Systemic_Risk_Management')]
# df_reg27 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Critical_Incident_Risk_Management')]
# df_reg28 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Management_Of_Legal_And_Regulatory_Framework')]
# df_reg29 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Supply_Chain_Management')]
# df_reg30 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Business_Model_Resilience')]
# df_reg31 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Physical_Impacts_Of_Climate_Change')]
# df_reg32 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Director_Removal')]
# df_reg33 = news[(news['News Type'] == 'ESG') & (news['esg_bert'] == 'Product_Design_And_Lifecycle_Management')]

## 3-1. Investor Reaction을 종속변수로 하는 경우 중 Human Capital News (Adj R-squared 값이 유독 높게 나옴)

In [9]:
# Dependent variable: Investor Reaction, fitted on the Human Capital subsample (df_reg10).
# NOTE(review): the recorded summary reports Cond. No. 1.94e+17 and a ~1e-16
# coefficient for one sector dummy, which suggests the design matrix is
# rank-deficient for this subsample (possibly an all-zero dummy) — verify.
df_reg = df_reg10

# Add an intercept column; has_constant='add' adds it even if a constant column already exists.
df_kc_reg = sm.add_constant(df_reg, has_constant='add')

# Regressors are all columns except the grouping labels and the two outcome
# variables; Index.difference returns them sorted by name.
non_feature_columns = ['News Type', 'ESG Bert', 'Investor Reaction', 'Negative Consumer Perception']
feature_columns = df_kc_reg.columns.difference(non_feature_columns).tolist()

X = df_kc_reg.loc[:, feature_columns]
y = df_kc_reg.loc[:, 'Investor Reaction']

# Ordinary least squares fit; the summary table is the cell output.
multi_linear_model = sm.OLS(y, X)
result_model_1 = multi_linear_model.fit()
result_model_1.summary()

0,1,2,3
Dep. Variable:,Investor Reaction,R-squared:,0.348
Model:,OLS,Adj. R-squared:,0.334
Method:,Least Squares,F-statistic:,24.35
Date:,"Mon, 22 May 2023",Prob (F-statistic):,3.8800000000000003e-47
Time:,11:30:36,Log-Likelihood:,1444.4
No. Observations:,606,AIC:,-2861.0
Df Residuals:,592,BIC:,-2799.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Firm Age,-0.0025,0.001,-2.301,0.022,-0.005,-0.000
GICS Sectors_Communication Services,-0.0014,0.004,-0.369,0.712,-0.009,0.006
GICS Sectors_Consumer Discretionary,0.0004,0.003,0.113,0.910,-0.006,0.007
GICS Sectors_Consumer Staples,0.0031,0.005,0.627,0.531,-0.007,0.013
GICS Sectors_Energy,0.0178,0.016,1.089,0.277,-0.014,0.050
GICS Sectors_Financials,0.0054,0.006,0.842,0.400,-0.007,0.018
GICS Sectors_Health Care,0.0066,0.007,0.932,0.352,-0.007,0.021
GICS Sectors_Information Technology,0.0040,0.006,0.697,0.486,-0.007,0.015
GICS Sectors_Materials,-6.282e-16,4.48e-17,-14.019,0.000,-7.16e-16,-5.4e-16

0,1,2,3
Omnibus:,198.863,Durbin-Watson:,2.028
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5917.108
Skew:,-0.801,Prob(JB):,0.0
Kurtosis:,18.224,Cond. No.,1.94e+17


## 3-2. Consumer Perception을 종속변수로 하는 경우 중 Environment News (Adj R-squared 값이 유독 높게 나옴)

In [10]:
# Dependent variable: Negative Consumer Perception, fitted on the Environment subsample (df_reg8).
# NOTE(review): the recorded summary reports Cond. No. 6.76e+17 and a ~1e-16
# coefficient for one sector dummy, which suggests the design matrix is
# rank-deficient for this subsample (possibly an all-zero dummy) — verify.
df_reg = df_reg8

# Add an intercept column; has_constant='add' adds it even if a constant column already exists.
df_kc_reg = sm.add_constant(df_reg, has_constant='add')

# Regressors are all columns except the grouping labels and the two outcome
# variables ('Investor Reaction' is therefore not used as a regressor here);
# Index.difference returns them sorted by name.
non_feature_columns = ['News Type', 'ESG Bert', 'Investor Reaction', 'Negative Consumer Perception']
feature_columns = df_kc_reg.columns.difference(non_feature_columns).tolist()

X = df_kc_reg.loc[:, feature_columns]
y = df_kc_reg.loc[:, 'Negative Consumer Perception']

# Ordinary least squares fit; the summary table is the cell output.
# NOTE(review): result_model_1 is the same name used for the Investor Reaction
# fit, so that earlier result is overwritten once this cell runs.
multi_linear_model = sm.OLS(y, X)
result_model_1 = multi_linear_model.fit()
result_model_1.summary()

0,1,2,3
Dep. Variable:,Negative Consumer Perception,R-squared:,0.612
Model:,OLS,Adj. R-squared:,0.502
Method:,Least Squares,F-statistic:,5.571
Date:,"Mon, 22 May 2023",Prob (F-statistic):,6.13e-06
Time:,11:32:15,Log-Likelihood:,65.328
No. Observations:,60,AIC:,-102.7
Df Residuals:,46,BIC:,-73.34
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Firm Age,-0.0285,0.024,-1.198,0.237,-0.076,0.019
GICS Sectors_Communication Services,-0.2559,0.073,-3.507,0.001,-0.403,-0.109
GICS Sectors_Consumer Discretionary,-0.1774,0.040,-4.427,0.000,-0.258,-0.097
GICS Sectors_Consumer Staples,-0.4002,0.080,-5.017,0.000,-0.561,-0.240
GICS Sectors_Energy,-0.2799,0.062,-4.512,0.000,-0.405,-0.155
GICS Sectors_Financials,-2.115e-16,4.04e-17,-5.233,0.000,-2.93e-16,-1.3e-16
GICS Sectors_Health Care,-0.2734,0.064,-4.248,0.000,-0.403,-0.144
GICS Sectors_Information Technology,-0.2407,0.079,-3.041,0.004,-0.400,-0.081
GICS Sectors_Materials,-0.0762,0.119,-0.642,0.524,-0.315,0.163

0,1,2,3
Omnibus:,7.215,Durbin-Watson:,1.785
Prob(Omnibus):,0.027,Jarque-Bera (JB):,6.556
Skew:,-0.638,Prob(JB):,0.0377
Kurtosis:,3.998,Cond. No.,6.76e+17
