# 라이브러리 import

In [1]:
import pandas as pd
import numpy as np

import scipy.stats as stats
from scipy.stats import probplot
from statsmodels.formula.api import ols

# 데이터 import

In [2]:
# col 생략 없이 출력
pd.set_option('display.max_columns', None)

In [3]:
# 개별 데이터 테이블 import
track=pd.read_csv('./../../data/track.csv')
track_curve=pd.read_csv('./../../data/track_curve.csv')
track_obstacle=pd.read_csv('./../../data/track_obstacle.csv')
track_road=pd.read_csv('./../../data/track_road.csv')
track_shortcut=pd.read_csv('./../../data/track_shortcut.csv')
track_straight=pd.read_csv('./../../data/track_straight.csv')
track_trigger=pd.read_csv('./../../data/track_trigger.csv')
match_indicator=pd.read_csv('./../../api/match-indicator-extraction.csv', encoding='cp949')
track_curve['all_section']=track_curve['hairpin']+track_curve['acute']+track_curve['obtuse']

In [4]:
# 이름 설정
track_straight=pd.concat([track_straight.iloc[:, 0:1], track_straight.iloc[:, 1:].add_prefix('straight_')], axis=1)
track_curve=pd.concat([track_curve.iloc[:, 0:1], track_curve.iloc[:, 1:].add_prefix('curve_')], axis=1)
track_obstacle=pd.concat([track_obstacle.iloc[:, 0:1], track_obstacle.iloc[:, 1:].add_prefix('obstacle_')], axis=1)
track_trigger=pd.concat([track_trigger.iloc[:, 0:1], track_trigger.iloc[:, 1:].add_prefix('trigger_')], axis=1)
track_shortcut=pd.concat([track_shortcut.iloc[:, 0:1], track_shortcut.iloc[:, 1:].add_prefix('shortcut_')], axis=1)
track.rename(columns={'id':'track_id', 'name':'track_name'},inplace=True)
track['date']=pd.to_datetime(track['release_date'], errors='coerce')
track['year']=track['date'].dt.year.astype(int, errors = 'ignore')
track['month']=track['date'].dt.month.astype(int, errors = 'ignore')
track.drop(['date', 'release_date'], axis=1, inplace=True)
match_indicator=match_indicator.iloc[:, 1:]

In [5]:
# 데이터 merge
df_track=pd.merge(track, match_indicator, how='left', on='track_id')
df_track=pd.merge(df_track, track_straight, how='left', on='track_id')
df_track=pd.merge(df_track, track_trigger, how='left', on='track_id')
df_track=pd.merge(df_track, track_curve, how='left', on='track_id')
df_track=pd.merge(df_track, track_shortcut, how='left', on='track_id')
df_track=pd.merge(df_track, track_obstacle, how='left', on='track_id')
df_track['sum_straight_curve']=df_track['straight_all_section']+df_track['curve_all_section']
tmp1=df_track.sort_values(by='cnt_match', ascending=False)[:44]
tmp2=df_track.sort_values(by='cnt_match', ascending=False)[44:]
tmp1['track_upper']=1
tmp2['track_upper']=0
df_track=pd.concat([tmp1, tmp2])
df_track.sort_index(inplace=True)
df_track=pd.merge(df_track, track_road, how='left', on='track_id')
df_track.drop('track_id', axis=1, inplace=True)

# 파생변수 추가

In [6]:
# 1. 오르막 내리막

df_track['updownhill'] = df_track['straight_uphill'] + df_track['straight_downhill']

In [7]:
# 2. 직선구간 비율 (직선구간 / 전체구간)

df_track['ratio_straight'] = df_track['straight_all_section'] / df_track['sum_straight_curve']

In [8]:
# 3. 곡선구간 비율 (곡선구간 / 전체구간)

df_track['ratio_curve'] = df_track['curve_all_section'] / df_track['sum_straight_curve']

In [9]:
# 4. 헤어핀 비율

df_track['ratio_hairpin'] = df_track['curve_hairpin'] / df_track['curve_all_section']

In [10]:
# 5. 예각 비율

df_track['ratio_acute'] = df_track['curve_acute'] / df_track['curve_all_section']

In [11]:
# 6. 둔각 비율

df_track['ratio_obtuse'] = df_track['curve_obtuse'] / df_track['curve_all_section']

In [12]:
# 7. 직각 지름길 비율

df_track['ratio_shortcut_right'] =  df_track['shortcut_right'] / (df_track['shortcut_right'] + df_track['shortcut_acute'] + df_track['shortcut_obtuse']+ df_track['shortcut_special'])

In [13]:
# 8. 예각 지름길 비율

df_track['ratio_shortcut_acute'] =  df_track['shortcut_acute'] /  (df_track['shortcut_right'] + df_track['shortcut_acute'] + df_track['shortcut_obtuse']+ df_track['shortcut_special'])

In [14]:
# 9. 둔각 지름길 비율

df_track['ratio_shortcut_obtuse'] =  df_track['shortcut_obtuse'] /  (df_track['shortcut_right'] + df_track['shortcut_acute'] + df_track['shortcut_obtuse']+ df_track['shortcut_special'])

In [15]:
# 10. 특수 지름길 비율

df_track['ratio_shortcut_special'] =  df_track['shortcut_special'] /  (df_track['shortcut_right'] + df_track['shortcut_acute'] + df_track['shortcut_obtuse']+ df_track['shortcut_special'])

In [16]:
# 11. 장애물

df_track['obstacle'] = df_track['obstacle_fixed'] + df_track['obstacle_moved']

In [17]:
# 12. 평균 속력

df_track['speed'] = df_track['AVG_record'] / df_track['length']

In [18]:
# 13. 지름길 총 개수

df_track['shortcut_total'] = df_track['shortcut_right'] + df_track['shortcut_acute'] + df_track['shortcut_obtuse'] + df_track['shortcut_special']

In [19]:
# 14. 주행보조요소 총 개수

df_track['trigger_total'] = df_track['trigger_decel'] + df_track['trigger_accel'] + df_track['trigger_jump'] + df_track['trigger_warp']

In [48]:
# 15. 내리막길 비율

df_track['ratio_downhill'] = df_track['straight_downhill'] / df_track['straight_downhill'] + df_track['straight_uphill']

In [21]:
# 16. 직선 구간 비율

df_track['ratio_straight'] = df_track['straight_all_section'] / df_track['sum_straight_curve']

In [57]:
df_track = df_track.fillna(0)

# 왜도 확인 및 로그 변환

In [23]:
df_track.skew()

  df_track.skew()


difficulty                 0.970351
lap                       -0.415763
length                     1.587628
year                       0.055575
month                     -0.098579
cnt_match                  4.389513
percent_retire             0.337072
AVG_record                 2.567460
straight_jump              2.410467
straight_uphill            1.394785
straight_downhill          1.159320
straight_all_section       0.225244
trigger_accel              4.418296
trigger_warp               2.867280
trigger_decel              1.210916
trigger_jump               2.714076
curve_hairpin              1.826973
curve_acute                0.900958
curve_obtuse               1.720740
curve_continuous           2.017556
curve_continuous_acute     1.949774
curve_continuous_obtuse    2.543348
curve_all_section          1.967247
shortcut_right             3.459384
shortcut_acute             2.492672
shortcut_obtuse            6.883657
shortcut_special           2.254017
obstacle_fixed             1

In [24]:
# 왜도가 2 이상인 컬럼들 리스트에 추가
skew_list = ['straight_jump', 'trigger_accel', 'trigger_warp', 'curve_continuous', 'curve_continuous_obtuse',
             'shortcut_right', 'shortcut_acute', 'shortcut_obtuse', 'shortcut_special', 'obstacle_moved',
             'lap', 'AVG_record', 'ratio_shortcut_right', 'ratio_shortcut_obtuse', 'ratio_shortcut_special',
             'speed', 'trigger_total']

# skew_list 에 담긴 컬럼들을 전부 로그 변환
for i in range(7):
    skew_col = skew_list[i]
    df_track[skew_col] = np.log1p(df_track[skew_col])

In [25]:
# 로그 변환 후 왜도 재확인
df_track.skew()

  df_track.skew()


difficulty                 0.970351
lap                       -0.415763
length                     1.587628
year                       0.055575
month                     -0.098579
cnt_match                  4.389513
percent_retire             0.337072
AVG_record                 2.567460
straight_jump              2.145526
straight_uphill            1.394785
straight_downhill          1.159320
straight_all_section       0.225244
trigger_accel              1.875047
trigger_warp               1.482410
trigger_decel              1.210916
trigger_jump               2.714076
curve_hairpin              1.826973
curve_acute                0.900958
curve_obtuse               1.720740
curve_continuous          -0.041103
curve_continuous_acute     1.949774
curve_continuous_obtuse    0.853129
curve_all_section          1.967247
shortcut_right             1.922402
shortcut_acute             1.620681
shortcut_obtuse            6.883657
shortcut_special           2.254017
obstacle_fixed             1

# 다중 회귀 분석 - 후진 선택

### 모든 지표

In [41]:
# 모든 지표 : 수정 R제곱 0.811 AIC 2020 BIC 2055

ols('AVG_record ~ updownhill + curve_all_section + ratio_curve + ratio_hairpin + ratio_acute + ratio_obtuse + ratio_shortcut_right + ratio_shortcut_acute + ratio_shortcut_obtuse + ratio_shortcut_special + obstacle + shortcut_total + difficulty + straight_jump', df_track).fit().summary()


0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.839
Model:,OLS,Adj. R-squared:,0.811
Method:,Least Squares,F-statistic:,29.67
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,4.0599999999999995e-24
Time:,02:28:27,Log-Likelihood:,-996.03
No. Observations:,88,AIC:,2020.0
Df Residuals:,74,BIC:,2055.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.261e+04,6053.438,7.039,0.000,3.05e+04,5.47e+04
updownhill,1321.3782,322.138,4.102,0.000,679.504,1963.252
curve_all_section,2347.4825,293.029,8.011,0.000,1763.609,2931.356
ratio_curve,-1.429e+05,2.96e+04,-4.834,0.000,-2.02e+05,-8.4e+04
ratio_hairpin,4.107e+04,1.16e+04,3.544,0.001,1.8e+04,6.42e+04
ratio_acute,1113.6577,7573.127,0.147,0.883,-1.4e+04,1.62e+04
ratio_obtuse,429.2156,9647.919,0.044,0.965,-1.88e+04,1.97e+04
ratio_shortcut_right,-2.186e+04,1.21e+04,-1.810,0.074,-4.59e+04,2206.260
ratio_shortcut_acute,-5584.2737,9271.958,-0.602,0.549,-2.41e+04,1.29e+04

0,1,2,3
Omnibus:,30.942,Durbin-Watson:,1.962
Prob(Omnibus):,0.0,Jarque-Bera (JB):,76.285
Skew:,1.206,Prob(JB):,2.72e-17
Kurtosis:,6.871,Cond. No.,6.29e+17


### p-value 0.05 이상인 것들 제외해보기

In [42]:
# 오르막내리막, 전체 곡선 개수, 곡선 비율, 헤어핀 비율, 난이도, 점프 구간
# 수정 R제곱 0.814 AIC 2013 BIC 2030

ols('AVG_record ~ updownhill + curve_all_section + ratio_curve + ratio_hairpin + difficulty + straight_jump', df_track).fit().summary()


0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.827
Model:,OLS,Adj. R-squared:,0.814
Method:,Least Squares,F-statistic:,64.33
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,9.35e-29
Time:,02:28:43,Log-Likelihood:,-999.31
No. Observations:,88,AIC:,2013.0
Df Residuals:,81,BIC:,2030.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.689e+04,6551.696,7.157,0.000,3.39e+04,5.99e+04
updownhill,1281.6457,304.485,4.209,0.000,675.815,1887.476
curve_all_section,2287.0107,245.101,9.331,0.000,1799.337,2774.684
ratio_curve,-1.464e+05,2.39e+04,-6.129,0.000,-1.94e+05,-9.88e+04
ratio_hairpin,3.302e+04,1.35e+04,2.449,0.016,6195.808,5.98e+04
difficulty,1.119e+04,3606.145,3.102,0.003,4010.284,1.84e+04
straight_jump,1.09e+04,5169.074,2.108,0.038,611.092,2.12e+04

0,1,2,3
Omnibus:,27.832,Durbin-Watson:,1.91
Prob(Omnibus):,0.0,Jarque-Bera (JB):,64.446
Skew:,1.101,Prob(JB):,1.01e-14
Kurtosis:,6.568,Cond. No.,395.0


#### 다중공선성 체크
10을 기준으로 봄

In [43]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

x = df_track[['updownhill', 'curve_all_section', 'ratio_curve', 'ratio_hairpin', 'difficulty', 'straight_jump']]

def feature_vif(x):
  vif = pd.DataFrame()
  vif['vif_factor'] = [variance_inflation_factor(x.values, i)
                      for i in range(x.shape[1])]
  vif['feature'] = x.columns
  return vif
vif = feature_vif(x)
print(vif)

# 곡선 개수와 곡선 비율, 난이도 다중공선성 높음

   vif_factor            feature
0    4.163368         updownhill
1   13.787361  curve_all_section
2   11.649244        ratio_curve
3    2.250814      ratio_hairpin
4   18.020661         difficulty
5    1.305434      straight_jump


### 다중공선성 높은 곡선 비율, 난이도 제외

In [30]:
# 오르막내리막, 전체 곡선 개수, 헤어핀 비율, 점프 구간
# 수정 R제곱 0.725 AIC 2045 BIC 2057

ols('AVG_record ~ updownhill + curve_all_section + ratio_hairpin + straight_jump', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.738
Model:,OLS,Adj. R-squared:,0.725
Method:,Least Squares,F-statistic:,58.36
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,2.39e-23
Time:,02:27:57,Log-Likelihood:,-1017.5
No. Observations:,88,AIC:,2045.0
Df Residuals:,83,BIC:,2057.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.686e+04,6834.384,5.394,0.000,2.33e+04,5.05e+04
updownhill,1330.7852,356.554,3.732,0.000,621.613,2039.957
curve_all_section,1965.1503,189.535,10.368,0.000,1588.173,2342.128
ratio_hairpin,3.58e+04,1.49e+04,2.401,0.019,6145.143,6.54e+04
straight_jump,1.295e+04,6266.745,2.067,0.042,488.065,2.54e+04

0,1,2,3
Omnibus:,19.083,Durbin-Watson:,1.787
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.664
Skew:,0.927,Prob(JB):,5.97e-07
Kurtosis:,5.094,Cond. No.,206.0


In [31]:
# 다중공선성 체크 - 문제 없음

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

x = df_track[['updownhill', 'curve_all_section', 'ratio_hairpin', 'straight_jump']]

def feature_vif(x):
  vif = pd.DataFrame()
  vif['vif_factor'] = [variance_inflation_factor(x.values, i)
                      for i in range(x.shape[1])]
  vif['feature'] = x.columns
  return vif
vif = feature_vif(x)
print(vif)

   vif_factor            feature
0    3.458256         updownhill
1    4.402870  curve_all_section
2    1.512849      ratio_hairpin
3    1.300164      straight_jump


### 변수 하나씩 추가해보기

In [32]:
# 오르막내리막, 전체 곡선 개수, 헤어핀 비율, 점프 구간 + 전체 직선 개수
# 수정 R제곱 0.767 AIC 2032 BIC 2046

ols('AVG_record ~ updownhill + curve_all_section + ratio_hairpin + straight_jump + straight_all_section', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.78
Model:,OLS,Adj. R-squared:,0.767
Method:,Least Squares,F-statistic:,58.14
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,1.58e-25
Time:,02:27:57,Log-Likelihood:,-1009.8
No. Observations:,88,AIC:,2032.0
Df Residuals:,82,BIC:,2046.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5840.4709,1e+04,0.582,0.562,-1.41e+04,2.58e+04
updownhill,1525.8390,332.197,4.593,0.000,864.994,2186.684
curve_all_section,1975.5226,174.664,11.310,0.000,1628.060,2322.985
ratio_hairpin,3.794e+04,1.37e+04,2.760,0.007,1.06e+04,6.53e+04
straight_jump,1.176e+04,5782.244,2.033,0.045,255.401,2.33e+04
straight_all_section,424.6191,106.971,3.969,0.000,211.819,637.419

0,1,2,3
Omnibus:,25.976,Durbin-Watson:,1.959
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48.42
Skew:,1.132,Prob(JB):,3.06e-11
Kurtosis:,5.843,Cond. No.,433.0


In [33]:
# ❌오르막내리막, 전체 곡선 개수, 헤어핀 비율, 점프 구간, 전체 직선 개수 + 감속 트리거
# 수정 R제곱 0.774 AIC 2030 BIC 2047  - 감속트리거 p-value 높음

ols('AVG_record ~ updownhill + curve_all_section + ratio_hairpin + straight_jump + trigger_decel + straight_all_section', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.789
Model:,OLS,Adj. R-squared:,0.774
Method:,Least Squares,F-statistic:,50.59
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,2.23e-25
Time:,02:27:57,Log-Likelihood:,-1007.9
No. Observations:,88,AIC:,2030.0
Df Residuals:,81,BIC:,2047.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6358.6104,9884.934,0.643,0.522,-1.33e+04,2.6e+04
updownhill,1605.5108,329.729,4.869,0.000,949.454,2261.568
curve_all_section,1886.2645,178.270,10.581,0.000,1531.563,2240.966
ratio_hairpin,3.453e+04,1.37e+04,2.530,0.013,7369.680,6.17e+04
straight_jump,1.151e+04,5694.069,2.022,0.046,183.049,2.28e+04
trigger_decel,1570.1881,827.218,1.898,0.061,-75.716,3216.092
straight_all_section,388.8071,106.989,3.634,0.000,175.932,601.682

0,1,2,3
Omnibus:,31.287,Durbin-Watson:,1.884
Prob(Omnibus):,0.0,Jarque-Bera (JB):,66.639
Skew:,1.302,Prob(JB):,3.39e-15
Kurtosis:,6.376,Cond. No.,436.0


In [34]:
# ❌오르막내리막, 전체 곡선 개수, 헤어핀 비율, 점프 구간, 전체 직선 개수 + 도로폭 난이도
# 수정 R제곱 0.770 AIC 2031 BIC 2048  - R제곱 하락, AIC BIC 상승, 도로폭 난이도 P-value 높음

ols('AVG_record ~ updownhill + curve_all_section + ratio_hairpin + straight_jump + straight_all_section + mean_road_level', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.786
Model:,OLS,Adj. R-squared:,0.77
Method:,Least Squares,F-statistic:,49.65
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,4.02e-25
Time:,02:27:57,Log-Likelihood:,-1008.5
No. Observations:,88,AIC:,2031.0
Df Residuals:,81,BIC:,2048.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6023.7186,1.26e+04,-0.478,0.634,-3.11e+04,1.9e+04
updownhill,1641.7602,337.976,4.858,0.000,969.293,2314.227
curve_all_section,1976.4138,173.229,11.409,0.000,1631.743,2321.085
ratio_hairpin,3.812e+04,1.36e+04,2.796,0.006,1.1e+04,6.52e+04
straight_jump,1.141e+04,5739.169,1.988,0.050,-9.464,2.28e+04
straight_all_section,422.7352,106.099,3.984,0.000,211.632,633.838
mean_road_level,7760.5992,5045.913,1.538,0.128,-2279.185,1.78e+04

0,1,2,3
Omnibus:,22.743,Durbin-Watson:,1.932
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39.912
Skew:,1.014,Prob(JB):,2.15e-09
Kurtosis:,5.602,Cond. No.,451.0


In [44]:
# 오르막내리막, 전체 곡선 개수, 헤어핀 비율, 점프 구간, 전체 직선 개수, 감속트리거, 트랙 이동
# 수정 R제곱 0.780 AIC 2028 BIC 2048

ols('AVG_record ~ updownhill + curve_all_section + ratio_hairpin + straight_jump + trigger_decel + straight_all_section + trigger_warp', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.798
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,45.15
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,3.19e-25
Time:,02:30:08,Log-Likelihood:,-1006.0
No. Observations:,88,AIC:,2028.0
Df Residuals:,80,BIC:,2048.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2101.2582,1e+04,0.210,0.834,-1.78e+04,2.2e+04
updownhill,1609.1060,324.903,4.953,0.000,962.528,2255.684
curve_all_section,1933.6601,177.514,10.893,0.000,1580.397,2286.924
ratio_hairpin,3.483e+04,1.35e+04,2.589,0.011,8058.077,6.16e+04
straight_jump,1.23e+04,5626.580,2.185,0.032,1098.868,2.35e+04
trigger_decel,1579.8902,815.114,1.938,0.056,-42.237,3202.018
straight_all_section,385.9419,105.433,3.661,0.000,176.124,595.760
trigger_warp,8553.7792,4620.618,1.851,0.068,-641.544,1.77e+04

0,1,2,3
Omnibus:,38.675,Durbin-Watson:,1.929
Prob(Omnibus):,0.0,Jarque-Bera (JB):,104.775
Skew:,1.502,Prob(JB):,1.7699999999999998e-23
Kurtosis:,7.421,Cond. No.,438.0


### 파생지표를 개별 지표로 바꿔보기

In [45]:
#  ❌오르막길 개수, 내리막길 개수, 예각 커브 개수, 헤어핀 개수, 점프 구간, 전체 직선 개수, 도로폭 난이도
# 수정 R제곱 0.785 AIC 2026 BIC 2046  - 점프 구간 p-value 높음 (0.071), 도로폭 난이도 p-value 높음 (0.069)

ols('AVG_record ~ straight_uphill + straight_downhill + curve_all_section + curve_hairpin + straight_jump + straight_all_section + mean_road_level', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.803
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,46.51
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,1.2599999999999999e-25
Time:,02:30:40,Log-Likelihood:,-1005.0
No. Observations:,88,AIC:,2026.0
Df Residuals:,80,BIC:,2046.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4264.9659,1.22e+04,0.350,0.728,-2e+04,2.85e+04
straight_uphill,1704.1991,517.430,3.294,0.001,674.481,2733.917
straight_downhill,1654.4826,515.472,3.210,0.002,628.661,2680.304
curve_all_section,1631.3394,193.671,8.423,0.000,1245.922,2016.757
curve_hairpin,1936.3414,504.535,3.838,0.000,932.285,2940.398
straight_jump,1.016e+04,5559.768,1.827,0.071,-907.603,2.12e+04
straight_all_section,344.2766,109.797,3.136,0.002,125.774,562.779
mean_road_level,9024.3114,4892.348,1.845,0.069,-711.771,1.88e+04

0,1,2,3
Omnibus:,20.07,Durbin-Watson:,1.903
Prob(Omnibus):,0.0,Jarque-Bera (JB):,32.787
Skew:,0.927,Prob(JB):,7.59e-08
Kurtosis:,5.346,Cond. No.,394.0


In [46]:
# 오르막길 개수, 내리막길 개수, 예각 커브 개수, 헤어핀 개수, 둔각 커브 개수, 전체 직선 개수, 도로폭 난이도
# 수정 R제곱 0.791 AIC 2024 BIC 2043  - 끊어진 구간 p-value 상승 (0.061)

ols('AVG_record ~ straight_uphill + straight_downhill + curve_acute + curve_hairpin + curve_obtuse + straight_all_section + mean_road_level', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.808
Model:,OLS,Adj. R-squared:,0.791
Method:,Least Squares,F-statistic:,48.03
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,4.5300000000000004e-26
Time:,02:31:47,Log-Likelihood:,-1003.8
No. Observations:,88,AIC:,2024.0
Df Residuals:,80,BIC:,2043.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-3636.5526,1.24e+04,-0.294,0.769,-2.82e+04,2.1e+04
straight_uphill,1832.2592,513.859,3.566,0.001,809.648,2854.870
straight_downhill,1522.2545,507.267,3.001,0.004,512.762,2531.747
curve_acute,2322.5438,317.457,7.316,0.000,1690.784,2954.304
curve_hairpin,3772.1517,430.604,8.760,0.000,2915.223,4629.081
curve_obtuse,1187.8597,291.824,4.070,0.000,607.111,1768.608
straight_all_section,342.8603,108.388,3.163,0.002,127.162,558.558
mean_road_level,1.057e+04,4851.425,2.179,0.032,914.518,2.02e+04

0,1,2,3
Omnibus:,24.189,Durbin-Watson:,1.905
Prob(Omnibus):,0.0,Jarque-Bera (JB):,43.703
Skew:,1.066,Prob(JB):,3.24e-10
Kurtosis:,5.716,Cond. No.,383.0


In [38]:
# 오르막길 개수, 내리막길 개수, 예각 커브 개수, 헤어핀 개수, 둔각 커브 개수, 둔각 지름길, 펜스 없는 구간 유무, 고정 장애물 개수
# 수정 R제곱 0.815 AIC 2015 BIC 2040

ols('AVG_record ~ straight_uphill + straight_downhill + curve_acute + curve_hairpin + curve_obtuse + straight_all_section + shortcut_obtuse + C(fence_exist) + obstacle_fixed', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.834
Model:,OLS,Adj. R-squared:,0.815
Method:,Least Squares,F-statistic:,43.51
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,8.200000000000001e-27
Time:,02:27:58,Log-Likelihood:,-997.41
No. Observations:,88,AIC:,2015.0
Df Residuals:,78,BIC:,2040.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.047e+04,9880.548,1.060,0.293,-9200.405,3.01e+04
C(fence_exist)[T.1],1.053e+04,5116.157,2.057,0.043,339.627,2.07e+04
straight_uphill,1986.0396,485.898,4.087,0.000,1018.691,2953.388
straight_downhill,1048.1720,481.449,2.177,0.032,89.680,2006.664
curve_acute,1875.6836,316.966,5.918,0.000,1244.653,2506.714
curve_hairpin,3773.3127,413.584,9.123,0.000,2949.930,4596.696
curve_obtuse,961.8683,289.773,3.319,0.001,384.974,1538.763
straight_all_section,351.8868,104.591,3.364,0.001,143.663,560.111
shortcut_obtuse,6773.9447,3110.407,2.178,0.032,581.599,1.3e+04

0,1,2,3
Omnibus:,24.795,Durbin-Watson:,1.837
Prob(Omnibus):,0.0,Jarque-Bera (JB):,43.852
Skew:,1.107,Prob(JB):,3e-10
Kurtosis:,5.657,Cond. No.,318.0


In [39]:
# 오르막길 개수, 내리막길 개수, 예각 커브 개수, 헤어핀 개수, 둔각 커브 개수, 둔각 지름길, 펜스 없는 구간 유무, 전체 직선 개수
# 수정 R제곱 0.811 AIC 2016 BIC 2041 - 하락

ols('AVG_record ~ straight_uphill + straight_downhill + curve_acute + curve_hairpin + curve_obtuse + straight_all_section + mean_road_level + shortcut_obtuse + C(fence_exist)', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.831
Model:,OLS,Adj. R-squared:,0.811
Method:,Least Squares,F-statistic:,42.6
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,1.61e-26
Time:,02:27:58,Log-Likelihood:,-998.18
No. Observations:,88,AIC:,2016.0
Df Residuals:,78,BIC:,2041.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,896.5668,1.19e+04,0.075,0.940,-2.29e+04,2.47e+04
C(fence_exist)[T.1],1.06e+04,5166.769,2.052,0.044,314.089,2.09e+04
straight_uphill,2063.7579,494.006,4.178,0.000,1080.267,3047.248
straight_downhill,1190.8619,492.780,2.417,0.018,209.811,2171.912
curve_acute,2017.1061,319.227,6.319,0.000,1381.574,2652.638
curve_hairpin,3669.9870,413.528,8.875,0.000,2846.717,4493.257
curve_obtuse,969.5240,292.475,3.315,0.001,387.250,1551.798
straight_all_section,319.0087,105.209,3.032,0.003,109.553,528.465
mean_road_level,1.027e+04,4658.546,2.205,0.030,995.429,1.95e+04

0,1,2,3
Omnibus:,16.599,Durbin-Watson:,1.955
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.673
Skew:,0.832,Prob(JB):,7.24e-06
Kurtosis:,4.92,Cond. No.,389.0


### 오르막길/내리막길 개수 -> 내리막길 비율, 직선 개수 -> 직선 비율로 변경해보기

In [50]:
# ❌오르막길 개수, 내리막길 개수 -> 내리막길 비율로 변경
# 수정 R제곱 0.810 AIC 1702 BIC 1720 - AIC, BIC가 대폭 하락했으나 obstacle_fixed p-value 상승

ols('AVG_record ~ ratio_downhill + curve_acute + curve_hairpin + curve_obtuse + straight_all_section + C(fence_exist) + obstacle_fixed', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.828
Model:,OLS,Adj. R-squared:,0.81
Method:,Least Squares,F-statistic:,45.42
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,7.879999999999999e-23
Time:,02:40:34,Log-Likelihood:,-842.87
No. Observations:,74,AIC:,1702.0
Df Residuals:,66,BIC:,1720.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9172.0850,9886.271,0.928,0.357,-1.06e+04,2.89e+04
C(fence_exist)[T.1],1.417e+04,5670.412,2.500,0.015,2852.063,2.55e+04
ratio_downhill,1868.7835,520.034,3.594,0.001,830.502,2907.065
curve_acute,2007.9878,355.214,5.653,0.000,1298.780,2717.196
curve_hairpin,4578.9751,505.332,9.061,0.000,3570.048,5587.902
curve_obtuse,1057.4778,303.613,3.483,0.001,451.295,1663.661
straight_all_section,388.2829,114.716,3.385,0.001,159.245,617.321
obstacle_fixed,913.6164,509.106,1.795,0.077,-102.847,1930.080

0,1,2,3
Omnibus:,17.012,Durbin-Watson:,1.695
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.695
Skew:,0.979,Prob(JB):,1.18e-05
Kurtosis:,4.878,Cond. No.,269.0


In [51]:
# 직선 구간 개수 -> 직선 구간 비율로 변경
# 수정 R제곱 0.829 AIC 2007 BIC 2029

ols('AVG_record ~ straight_uphill + straight_downhill + curve_acute + curve_hairpin + curve_obtuse + ratio_straight + C(fence_exist) + obstacle_fixed', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.845
Model:,OLS,Adj. R-squared:,0.829
Method:,Least Squares,F-statistic:,53.9
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,7.280000000000001e-29
Time:,02:41:04,Log-Likelihood:,-994.31
No. Observations:,88,AIC:,2007.0
Df Residuals:,79,BIC:,2029.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-7.057e+04,2.08e+04,-3.386,0.001,-1.12e+05,-2.91e+04
C(fence_exist)[T.1],1.069e+04,4896.151,2.182,0.032,940.182,2.04e+04
straight_uphill,1721.3607,462.876,3.719,0.000,800.029,2642.693
straight_downhill,1253.3889,450.751,2.781,0.007,356.191,2150.587
curve_acute,2615.1868,328.796,7.954,0.000,1960.734,3269.639
curve_hairpin,4713.1529,401.641,11.735,0.000,3913.707,5512.599
curve_obtuse,1777.9009,298.158,5.963,0.000,1184.433,2371.369
ratio_straight,1.2e+05,2.32e+04,5.175,0.000,7.38e+04,1.66e+05
obstacle_fixed,1078.7716,412.054,2.618,0.011,258.600,1898.944

0,1,2,3
Omnibus:,24.347,Durbin-Watson:,1.738
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44.714
Skew:,1.064,Prob(JB):,1.95e-10
Kurtosis:,5.768,Cond. No.,344.0


# 최종 모델

In [49]:
# 오르막길 개수, 내리막길 개수 -> 내리막길 비율 / 직선 구간 개수 -> 직선 구간 비율
# 수정 R제곱 0.832 AIC 1692 BIC 1711

ols('AVG_record ~ ratio_downhill + curve_acute + curve_hairpin + curve_obtuse + ratio_straight + C(fence_exist) + obstacle_fixed', df_track).fit().summary()

0,1,2,3
Dep. Variable:,AVG_record,R-squared:,0.848
Model:,OLS,Adj. R-squared:,0.832
Method:,Least Squares,F-statistic:,52.76
Date:,"Fri, 25 Nov 2022",Prob (F-statistic):,1.32e-24
Time:,02:37:56,Log-Likelihood:,-838.22
No. Observations:,74,AIC:,1692.0
Df Residuals:,66,BIC:,1711.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6.196e+04,2.14e+04,-2.890,0.005,-1.05e+05,-1.92e+04
C(fence_exist)[T.1],1.347e+04,5304.008,2.539,0.013,2875.999,2.41e+04
ratio_downhill,1761.5126,489.548,3.598,0.001,784.097,2738.928
curve_acute,2603.9692,353.988,7.356,0.000,1897.210,3310.728
curve_hairpin,5346.4972,498.902,10.717,0.000,4350.408,6342.586
curve_obtuse,1654.3477,320.896,5.155,0.000,1013.658,2295.038
ratio_straight,1.164e+05,2.49e+04,4.671,0.000,6.66e+04,1.66e+05
obstacle_fixed,941.0335,478.026,1.969,0.053,-13.375,1895.442

0,1,2,3
Omnibus:,15.437,Durbin-Watson:,1.755
Prob(Omnibus):,0.0,Jarque-Bera (JB):,20.758
Skew:,0.879,Prob(JB):,3.11e-05
Kurtosis:,4.908,Cond. No.,326.0


### 다중 공선성 체크
최종 모델 다중 공선성 문제 없음

In [58]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

x = df_track[['ratio_downhill', 'curve_acute', 'curve_hairpin', 'curve_obtuse',
             'ratio_straight', 'fence_exist', 'obstacle_fixed']]

def feature_vif(x):
  vif = pd.DataFrame()
  vif['vif_factor'] = [variance_inflation_factor(x.values, i)
                      for i in range(x.shape[1])]
  vif['feature'] = x.columns
  return vif
vif = feature_vif(x)
print(vif)

   vif_factor         feature
0    2.779617  ratio_downhill
1    5.365317     curve_acute
2    1.834568   curve_hairpin
3    2.619883    curve_obtuse
4    3.445757  ratio_straight
5    1.992630     fence_exist
6    1.989083  obstacle_fixed
