In [15]:
import glob
import pandas as pd
from scipy.stats import pearsonr
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from itertools import combinations

In [23]:
# Load the preprocessed input tables.
# evdata2.csv is read with pandas' default encoding.
range_battery = pd.read_csv('preprocess/evdata2.csv')

# The remaining three files are decoded as cp949 (a Korean code page); they are
# read in the order oil -> energy -> range.
oil, energy, range_temp = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in (
        'preprocess/oil.csv',
        'preprocess/energy.csv',
        'preprocess/range.csv',
    )
)

In [24]:
def pair_pearson(df: pd.DataFrame, cols=None):
    """Compute Pearson's r and its p-value for every pair of columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the variables to correlate.
    cols : list, optional
        Column labels to pair up. When omitted, every numeric column of
        ``df`` is used.

    Returns
    -------
    pd.DataFrame
        One row per unordered column pair with the columns 'Variable 1',
        'Variable 2', 'Pearson r' and 'p-value'. Both statistics are
        strings formatted to four decimal places ('nan' when fewer than two
        complete observations exist for the pair).
    """
    if cols is None:
        cols = df.select_dtypes(include='number').columns.tolist()

    results = []
    for x, y in combinations(cols, 2):
        # Drop missing values row-wise on the pair. Dropping them per column
        # would re-pair the survivors by position, correlating values that
        # come from different rows (or failing on unequal lengths).
        paired = df[[x, y]].dropna()
        if len(paired) < 2:
            # pearsonr needs at least two observations; report NaN instead
            # of aborting the whole table for one sparse pair.
            r = p = float('nan')
        else:
            r, p = pearsonr(paired[x], paired[y])
        results.append({
            'Variable 1': x,
            'Variable 2': y,
            'Pearson r': format(r, ".4f"),
            'p-value': format(p, ".4f")
        })

    # Explicit columns keep the schema stable even when there are no pairs.
    return pd.DataFrame(
        results,
        columns=['Variable 1', 'Variable 2', 'Pearson r', 'p-value'],
    )

def calculate_vif(v):
    """Tabulate the variance inflation factor of every design-matrix column.

    ``v`` is passed through ``sm.add_constant`` first, so the returned table
    has one row per column of that augmented matrix (the recorded outputs
    show the added term as ``const``).

    Returns a DataFrame with the columns 'variable' and 'VIF'.
    """
    design = sm.add_constant(v)
    column_count = design.shape[1]

    factors = []
    for position in range(column_count):
        # VIF of the column at `position`, measured against all other columns.
        factors.append(variance_inflation_factor(design.values, position))

    return pd.DataFrame({
        'variable': design.columns,
        'VIF': factors,
    })

In [26]:

# Correlation and multicollinearity summary for the whole EV dataset.
# NOTE(review): the output recorded for this cell stops with
# "TypeError: 'DataFrame' object is not callable" right after the VIF header;
# presumably a name called here was rebound to a DataFrame in the live
# kernel - confirm by restarting the kernel and running all cells.
print("전체 상관 계수\n")
# sep='\n' emits the table, a newline, then the blank-line spacer in one call.
print(pair_pearson(range_battery), '\n\n', sep='\n')

print("전체 vif\n")
print(calculate_vif(
    range_battery.loc[:, ['Range(km)', 'Efficiency(Wh/km)', 'Battery(kWh)']]
))

전체 상관 계수

          Variable 1         Variable 2 Pearson r p-value
0          Range(km)  Efficiency(Wh/km)    0.0052  0.9625
1          Range(km)       Battery(kWh)    0.8088  0.0000
2  Efficiency(Wh/km)       Battery(kWh)    0.5865  0.0000



전체 vif



TypeError: 'DataFrame' object is not callable

In [None]:
# VIF for each two-variable subset of the EV features.
# NOTE(review): this cell originally read from `range_df`, which no visible
# cell defines; `range_battery` is the loaded frame that holds these three
# columns - confirm it is the intended source.
vif_features = ['Range(km)', 'Efficiency(Wh/km)', 'Battery(kWh)']

for pair_index, (first, second) in enumerate(combinations(vif_features, 2)):
    if pair_index:
        # Blank-line spacer between consecutive tables (none after the last).
        print('\n\n')
    print(f"'{first}', '{second}'\n")
    # Print the table once; wrapping it in a second print() would also emit
    # a stray "None", because print() itself returns None.
    print(calculate_vif(range_battery[[first, second]]))

'Range(km)', 'Efficiency(Wh/km)'



TypeError: 'DataFrame' object is not callable

In [None]:
# Correlation and VIF between the two fuel columns of the oil table
# (presumably regular gasoline and automotive diesel - verify against the data).
print(pair_pearson(oil.loc[:, ['보통휘발유', '자동차용경유']]))
print(calculate_vif(oil.loc[:, ['보통휘발유', '자동차용경유']]))

  Variable 1 Variable 2 Pearson r p-value
0      보통휘발유     자동차용경유    0.9195  0.0000
  variable         VIF
0    const  164.963431
1    보통휘발유    6.475106
2   자동차용경유    6.475106


In [None]:
# Keep only the rows whose '연료' value is '전기', restricted to the two
# columns analysed below.
electric = energy.loc[energy['연료'] == '전기', ['표시연비', '등급']].copy()

# Pull the first run of digits out of the '등급' text; rows without any digit
# come back as missing.
grade_num = electric['등급'].str.extract(r'(\d+)')[0]
non_null = grade_num.notna()

# Discard rows with no numeric grade, then store the grade as an integer.
electric = electric.loc[non_null].copy()
electric['등급'] = grade_num.loc[non_null].astype(int)
print(electric)

      표시연비  등급
5      4.1   4
6      6.1   1
9      4.1   4
10     4.1   4
11     4.1   4
...    ...  ..
3288   5.6   2
3342   5.1   2
3645   5.5   2
3726   3.5   4
3742   1.6   5

[434 rows x 2 columns]


In [None]:
print(pair_pearson(electric))
print(calculate_vif(electric))

  Variable 1 Variable 2 Pearson r p-value
0       표시연비         등급   -0.9542  0.0000
  variable         VIF
0    const  763.895262
1     표시연비   11.180516
2       등급   11.180516
