In [1]:
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from itertools import combinations

In [2]:
# Load the five source tables used by the correlation / VIF checks below.
# evdata2.csv is read with pandas' default encoding; the other four files
# are decoded explicitly as cp949.
range_battery = pd.read_csv('evdata2.csv')
oil = pd.read_csv('oil.csv', encoding='cp949')
energy = pd.read_csv('energy.csv', encoding='cp949')
ev_certification = pd.read_csv('ev_range.csv', encoding='cp949')
charging = pd.read_csv('charging.csv', encoding='cp949')

In [3]:
def pair_pearson(df: pd.DataFrame, cols=None):
    """Compute the Pearson correlation for every pair of columns.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the variables to compare.
    cols : iterable of column labels, optional
        Columns to pair up. When ``None``, every numeric column of ``df``
        is used.

    Returns
    -------
    pd.DataFrame
        One row per column pair with the columns ``'Variable 1'``,
        ``'Variable 2'``, ``'Pearson r'`` and ``'p-value'``. The last two are
        strings formatted to four decimal places. The columns are present
        even when there are no pairs to report.

    Notes
    -----
    Missing values are removed pairwise: a row is used for a given pair only
    if both columns are non-null in that row. Dropping NaNs from each column
    independently would either make the two samples different lengths (which
    ``pearsonr`` rejects) or, worse, silently line up values that come from
    different rows.
    """
    if cols is None:
        cols = df.select_dtypes(include='number').columns.tolist()

    results = []
    for x, y in combinations(cols, 2):
        # Keep only rows where BOTH variables are observed so that the i-th
        # value of x and the i-th value of y belong to the same record.
        paired = df[[x, y]].dropna()
        if len(paired) < 2:
            # pearsonr needs at least two observations; report the pair as
            # undefined instead of aborting the whole table.
            r = p = float('nan')
        else:
            r, p = pearsonr(paired[x], paired[y])
        results.append({
            'Variable 1': x,
            'Variable 2': y,
            'Pearson r': format(r, ".4f"),
            'p-value': format(p, ".4f"),
        })

    # Passing the column list keeps the schema stable when `results` is empty.
    return pd.DataFrame(
        results,
        columns=['Variable 1', 'Variable 2', 'Pearson r', 'p-value'],
    )

def calculate_vif(v):
    """Return the variance inflation factor (VIF) of each column of ``v``.

    An intercept column is added with ``sm.add_constant`` before the VIFs are
    computed, and the intercept is reported as its own row alongside the
    input columns.

    Rows that contain a missing value in any column are dropped first, so the
    auxiliary regressions behind each VIF are fitted on complete cases only
    (this mirrors the NaN handling in ``pair_pearson``).

    Parameters
    ----------
    v : pd.DataFrame
        Explanatory variables, one per column.

    Returns
    -------
    pd.DataFrame
        Columns ``'variable'`` and ``'VIF'``, one row per column of the
        design matrix (intercept included).
    """
    # Complete-case filtering: NaNs would otherwise flow into the regressions.
    design = sm.add_constant(v.dropna())
    matrix = design.values
    return pd.DataFrame({
        'variable': design.columns,
        'VIF': [variance_inflation_factor(matrix, i)
                for i in range(matrix.shape[1])],
    })

### 배터리, 주행거리, 연비
---------------------------------

In [4]:
# Pearson r / p-value for every pair of numeric columns in range_battery
# (cols is left as None, so all numeric columns are paired).
print(pair_pearson(range_battery))

          Variable 1         Variable 2 Pearson r p-value
0          Range(km)  Efficiency(Wh/km)    0.0052  0.9625
1          Range(km)       Battery(kWh)    0.8088  0.0000
2  Efficiency(Wh/km)       Battery(kWh)    0.5865  0.0000


In [5]:
# VIF with all three variables in the design matrix at once.
# calculate_vif adds an intercept via sm.add_constant, hence the extra 'const' row.
print(calculate_vif(range_battery[['Range(km)', 'Efficiency(Wh/km)', 'Battery(kWh)']]))

            variable          VIF
0              const  2995.530933
1          Range(km)    97.096102
2  Efficiency(Wh/km)    51.175791
3       Battery(kWh)   147.995872


In [6]:
# VIF for Range(km) and Efficiency(Wh/km) only, i.e. with Battery(kWh) left out.
calculate_vif(range_battery[['Range(km)', 'Efficiency(Wh/km)']])

Unnamed: 0,variable,VIF
0,const,90.832829
1,Range(km),1.000027
2,Efficiency(Wh/km),1.000027


In [7]:
# VIF for Range(km) and Battery(kWh) only, i.e. with Efficiency(Wh/km) left out.
calculate_vif(range_battery[['Range(km)', 'Battery(kWh)']])

Unnamed: 0,variable,VIF
0,const,28.681194
1,Range(km),2.89199
2,Battery(kWh),2.89199


In [8]:
# VIF for Efficiency(Wh/km) and Battery(kWh) only, i.e. with Range(km) left out.
calculate_vif(range_battery[['Efficiency(Wh/km)', 'Battery(kWh)']])

Unnamed: 0,variable,VIF
0,const,63.740109
1,Efficiency(Wh/km),1.524262
2,Battery(kWh),1.524262


### 휘발유, 경유
---------------

In [9]:
# Pearson correlation between the '보통휘발유' and '자동차용경유' columns of oil.
pair_pearson(oil[['보통휘발유','자동차용경유']])

Unnamed: 0,Variable 1,Variable 2,Pearson r,p-value
0,보통휘발유,자동차용경유,0.9195,0.0


In [10]:
# VIF for the same two oil columns (an intercept row 'const' is included).
calculate_vif(oil[['보통휘발유','자동차용경유']])

Unnamed: 0,variable,VIF
0,const,164.963431
1,보통휘발유,6.475106
2,자동차용경유,6.475106


### 연비, 효율 등급
-------------------------------


In [11]:
# Rows of `energy` whose '연료' value is '전기', restricted to the two
# columns compared in the following cells.
electric = energy.loc[energy['연료'].eq('전기'), ['표시연비', '등급']].copy()

# Pull the first run of digits out of each '등급' label; labels without any
# digit yield NaN and are filtered out below.
grade_num = electric['등급'].str.extract(r'(\d+)')[0]
non_null = grade_num.notna()

# Keep only rows with a parsable grade and store it as an integer column.
electric = electric.loc[non_null].copy()
electric['등급'] = grade_num.loc[non_null].astype(int)
electric

Unnamed: 0,표시연비,등급
5,4.1,4
6,6.1,1
9,4.1,4
10,4.1,4
11,4.1,4
...,...,...
3288,5.6,2
3342,5.1,2
3645,5.5,2
3726,3.5,4


In [12]:
# Pearson correlation between the numeric columns of electric ('표시연비' and the integer '등급').
pair_pearson(electric)

Unnamed: 0,Variable 1,Variable 2,Pearson r,p-value
0,표시연비,등급,-0.9542,0.0


In [13]:
# VIF for the two columns of electric (an intercept row 'const' is included).
calculate_vif(electric)

Unnamed: 0,variable,VIF
0,const,763.895262
1,표시연비,11.180516
2,등급,11.180516


### 주행거리 상온, 저온
---------------------

In [14]:
# Select rows with '사용연료' == '전기(EV)' and '인증구분' == '신규인증' whose two
# range columns are not the empty string, keep just those two columns, and
# drop rows where either one is missing.
# NOTE(review): if read_csv parsed these columns as numeric, the `!= ''`
# tests are always true and blanks are removed by dropna() alone — confirm.
ev_range = (
    ev_certification
    .loc[
        ev_certification['사용연료'].eq('전기(EV)')
        & ev_certification['인증구분'].eq('신규인증')
        & ev_certification['1회충전주행거리(km)\n상온 도심'].ne('')
        & ev_certification['1회충전주행거리(km)\n저온 도심'].ne(''),
        ['1회충전주행거리(km)\n상온 도심', '1회충전주행거리(km)\n저온 도심'],
    ]
    .dropna()
)

ev_range

Unnamed: 0,1회충전주행거리(km)\n상온 도심,1회충전주행거리(km)\n저온 도심
4,209.0,157.0
7,53.1,41.0
8,85.4,69.3
9,49.1,44.6
10,438.0,254.0
...,...,...
496,414.0,339.0
497,476.0,363.0
498,453.0,351.0
499,251.0,192.0


In [15]:
# Pearson correlation between the two range columns kept in ev_range.
pair_pearson(ev_range)

Unnamed: 0,Variable 1,Variable 2,Pearson r,p-value
0,1회충전주행거리(km)\n상온 도심,1회충전주행거리(km)\n저온 도심,0.9791,0.0


In [16]:
# VIF for the two range columns of ev_range (an intercept row 'const' is included).
calculate_vif(ev_range)

Unnamed: 0,variable,VIF
0,const,5.525047
1,1회충전주행거리(km)\n상온 도심,24.195647
2,1회충전주행거리(km)\n저온 도심,24.195647


#### 상온, 저온/상온 비율
-------------------

In [17]:
# Re-express the pair as the '상온 도심' range itself plus the ratio of the
# '저온 도심' range to the '상온 도심' range, instead of the two raw ranges.
ev_range2 = pd.DataFrame({
    '상온도심(km)': ev_range['1회충전주행거리(km)\n상온 도심'],
    '저온도심/상온도심 비율': ev_range['1회충전주행거리(km)\n저온 도심'].div(
        ev_range['1회충전주행거리(km)\n상온 도심']
    ),
})

In [18]:
# Pearson correlation between the '상온도심(km)' column and the ratio column of ev_range2.
pair_pearson(ev_range2)

Unnamed: 0,Variable 1,Variable 2,Pearson r,p-value
0,상온도심(km),저온도심/상온도심 비율,-0.4203,0.0


In [19]:
# VIF for the two columns of ev_range2 (an intercept row 'const' is included).
calculate_vif(ev_range2)

Unnamed: 0,variable,VIF
0,const,139.941562
1,상온도심(km),1.214527
2,저온도심/상온도심 비율,1.214527


### 충전소 유형
---------------------------


In [20]:
# Pearson correlation for every pair of numeric columns in charging.
pair_pearson(charging)

Unnamed: 0,Variable 1,Variable 2,Pearson r,p-value
0,완속,급속,0.9923,0.0


In [21]:
# VIF for the columns of charging (an intercept row 'const' is included).
# NOTE(review): the whole frame is passed, unlike pair_pearson, which keeps only numeric columns — presumably every column of charging is numeric; confirm.
calculate_vif(charging)

Unnamed: 0,variable,VIF
0,const,14.46188
1,완속,65.425269
2,급속,65.425269
