<a href="https://colab.research.google.com/github/LeeMinJun0102/Car_Price_Predict/blob/main/ML_test1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import matplotlib.font_manager as fm
import ast

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, MultiLabelBinarizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import plot_tree, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, cross_validate
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Upload Final.xlsx through the Colab file picker.
# The google.colab package only exists inside Colab; on any other kernel the
# import fails, so fall back to an empty upload result and let the workbook be
# read from the working directory instead of crashing here.
try:
    from google.colab import files
except ImportError:
    uploaded = {}  # nothing uploaded; Final.xlsx must already be on disk
else:
    uploaded = files.upload()

In [None]:
# Load the dataset from the Excel workbook in the working directory.
df = pd.read_excel('Final.xlsx')
# Print the column list with non-null counts and dtypes as a quick sanity check.
df.info()

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Predictor columns fed to every model in this notebook.
feature_columns = [
    'manufacturer_grouping',
    'car_type_grouping',
    'mileage',
    'color_grouping',
    'fuel_grouping',
    'transmission_grouping',
    'purchase_accident',
    'sales_channel_grouping',
    'purchase_channel_1_grouping',
    'new_car_price',
    'usedyear',
]

# Feature matrix and regression target.
X = df[feature_columns]
y = df['residual_value_rate']

X.head()

In [None]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Show the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Shared preprocessing step reused by every model pipeline.
# Columns listed here are one-hot encoded.
categorical_features = [
    'manufacturer_grouping',
    'car_type_grouping',
    'color_grouping',
    'fuel_grouping',
    'transmission_grouping',
    'purchase_accident',
    'sales_channel_grouping',
    'purchase_channel_1_grouping',
]

# drop='first' removes one indicator column per feature;
# handle_unknown='ignore' keeps transform from raising on unseen categories.
one_hot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

# Every column not listed above (the numeric ones) is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, categorical_features)],
    remainder='passthrough',
)

# Random Forest

In [None]:
# Random forest: tune the tree-size limits with 5-fold CV, then score the
# refitted best pipeline on the train and held-out test splits.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# NOTE(review): this cell originally carried a bare "25 5 5" note, presumably
# the best max_depth / min_samples_split / min_samples_leaf from an earlier
# run. Confirm before relying on it.
#
# Hyper-parameter grid.
# min_samples_split is pinned to a single value on purpose: a node can only be
# split when both children keep at least min_samples_leaf samples, i.e. when it
# holds >= 2 * min_samples_leaf samples. Every min_samples_leaf below is >= 5,
# so any min_samples_split <= 10 grows exactly the same trees. Searching 5..10
# multiplied the number of fits by six without changing any score.
param_grid = {
    'model__max_depth': [10, 15, 20, 25, 30],
    'model__min_samples_split': [5],
    'model__min_samples_leaf': [5, 6, 7, 8, 9, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the search result
print("Best Params:", grid_search.best_params_)
print("Best CV RMSE (음수):", grid_search.best_score_)  # scorer is negated RMSE, so this prints a negative number

# Predict with the refitted best pipeline on both splits
best_model = grid_search.best_estimator_
train_pred = best_model.predict(X_train)
test_pred = best_model.predict(X_test)

# RMSE and R² on the train and test splits
train_mse = mean_squared_error(y_train, train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(y_test, test_pred)
test_rmse = np.sqrt(test_mse)
train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)
test_2 = test_r2  # legacy name kept so any cell still reading test_2 keeps working
print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train R² Score:", train_r2)
print("Test R² Score:", test_r2)

In [None]:
# Pull the fitted random forest out of the tuned pipeline.
rf_model = best_model.named_steps['model']

# Number of individual trees in the ensemble
n_estimators = len(rf_model.estimators_)
print(f"트리 개수: {n_estimators}")

# Gather leaf count, total node count and depth of every tree in one pass.
leaf_counts, node_counts, depths = [], [], []
for fitted_tree in rf_model.estimators_:
    leaf_counts.append(fitted_tree.get_n_leaves())
    node_counts.append(fitted_tree.tree_.node_count)
    depths.append(fitted_tree.get_depth())

# For each statistic print the mean (two decimals) followed by the maximum.
for stat_label, stat_values in (
    ("리프 노드 수", leaf_counts),
    ("전체 노드 수", node_counts),
    ("트리 깊이", depths),
):
    print(f"평균 {stat_label}: {np.mean(stat_values):.2f}")
    print(f"최대 {stat_label}: {np.max(stat_values)}")

In [None]:
# Rebuild the column names the forest was trained on: the ColumnTransformer
# emits the one-hot columns first and appends the passthrough columns after them.
ohe = best_model.named_steps['preprocessor'].named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(categorical_features)
num_features = [col for col in X.columns if col not in categorical_features]
final_feature_names = np.concatenate([ohe_feature_names, num_features])

# Fitted forest inside the tuned pipeline
rf_model = best_model.named_steps['model']

# Draw only the top levels of the first tree so the figure stays readable.
# The same constant feeds the title, so the caption cannot drift from the plot
# (it previously claimed max_depth=3 while the plot used 2).
plot_depth = 2

plt.figure(figsize=(20, 10))
plot_tree(
    rf_model.estimators_[0],
    # Pass a plain list: parameter validation in some scikit-learn releases
    # rejects ndarray input for feature_names.
    feature_names=list(final_feature_names),
    # class_names is intentionally not passed: it only applies to classifiers,
    # and this is a regression tree.
    filled=True,
    rounded=True,
    max_depth=plot_depth
)
plt.title(f"Tree #0 from Random Forest (max_depth={plot_depth})")
plt.show()


In [None]:
# Feature names after preprocessing.
# Names and importances are both read from best_model so the two sequences are
# guaranteed to describe the same fitted pipeline (the names used to come from
# grid_search.best_estimator_, a second handle that can point elsewhere if
# cells are re-run out of order).
fitted_preprocessor = best_model.named_steps['preprocessor']
onehot_features = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out()
numeric_features = [col for col in X.columns if col not in categorical_features]
# One-hot columns come first, passthrough numeric columns last.
all_features = list(onehot_features) + numeric_features

# Rank features by importance and show the 20 strongest.
importances = best_model.named_steps['model'].feature_importances_
feat_importance = pd.Series(importances, index=all_features).sort_values(ascending=False)
top20_features = feat_importance.head(20)
print(top20_features)

# Decision Tree

In [None]:
# Decision tree: tune the tree-size limits with 5-fold CV, then score the
# refitted best pipeline on the train and held-out test splits.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42))
])

# Hyper-parameter grid.
# min_samples_split is pinned to a single value on purpose: a node can only be
# split when both children keep at least min_samples_leaf samples, i.e. when it
# holds >= 2 * min_samples_leaf samples. Every min_samples_leaf below is >= 5,
# so any min_samples_split <= 10 grows exactly the same tree. Searching 5..10
# multiplied the number of fits by six without changing any score.
param_grid = {
    'model__max_depth': [10, 15, 20, 25, 30],
    'model__min_samples_split': [5],
    'model__min_samples_leaf': [5, 6, 7, 8, 9, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the search result
print("Best Params:", grid_search.best_params_)
print("Best CV RMSE (음수):", grid_search.best_score_)  # scorer is negated RMSE, so this prints a negative number

# Predict with the refitted best pipeline on both splits
best_model = grid_search.best_estimator_
train_pred = best_model.predict(X_train)
test_pred = best_model.predict(X_test)

# RMSE and R² on the train and test splits
train_mse = mean_squared_error(y_train, train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(y_test, test_pred)
test_rmse = np.sqrt(test_mse)
train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)
test_2 = test_r2  # legacy name kept so any cell still reading test_2 keeps working
print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train R² Score:", train_r2)
print("Test R² Score:", test_r2)

In [None]:
# Column names produced by the fitted preprocessor (one-hot + passthrough).
feature_names_transformed = best_model.named_steps['preprocessor'].get_feature_names_out()

# Visualise the top levels of the tuned decision tree.
# plot_tree is already imported in the notebook's import cell, so it is not
# re-imported here.
plt.figure(figsize=(20, 10))
plot_tree(
    best_model.named_steps['model'],
    filled=True,
    # Pass a plain list: parameter validation in some scikit-learn releases
    # rejects ndarray input for feature_names.
    feature_names=list(feature_names_transformed),
    rounded=True,
    max_depth=2,
    fontsize=8
)
plt.show()

In [None]:
# Pair every transformed column with its importance in the fitted tree.
feature_names_transformed = best_model.named_steps['preprocessor'].get_feature_names_out()
importances = best_model.named_steps['model'].feature_importances_

# Assemble the table, then order it from most to least important.
importance_df = pd.DataFrame(
    {'feature': feature_names_transformed, 'importance': importances}
)
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Inspect the full ranking
print(importance_df)

# Horizontal bar chart of the strongest features; rows are reversed so the
# most important feature ends up at the top of the chart.
top_n = 20
top_rows = importance_df.head(top_n).iloc[::-1]

plt.figure(figsize=(10, 6))
plt.barh(top_rows['feature'], top_rows['importance'])
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Top 20 Feature Importances")
plt.tight_layout()
plt.show()

# XGB

In [None]:
# XGBoost: tune depth, child weight and learning rate with 5-fold CV, then
# score the refitted best pipeline on the train and held-out test splits.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42))

])

# Hyper-parameter grid
param_grid = {
    'model__max_depth': [5, 10, 15, 20, 25, 30],
    'model__min_child_weight': [1, 3, 5],  # example of a parameter commonly tuned for XGB
    'model__learning_rate': [0.01, 0.1, 0.2]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the search result
print("Best Params:", grid_search.best_params_)
print("Best CV RMSE (음수):", grid_search.best_score_)  # scorer is negated RMSE, so this prints a negative number

# Predict with the refitted best pipeline on both splits
best_model = grid_search.best_estimator_
train_pred = best_model.predict(X_train)
test_pred = best_model.predict(X_test)

# RMSE and R² on the train and test splits
train_mse = mean_squared_error(y_train, train_pred)
train_rmse = np.sqrt(train_mse)
test_mse = mean_squared_error(y_test, test_pred)
test_rmse = np.sqrt(test_mse)
train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)
test_2 = test_r2  # legacy name kept so any cell still reading test_2 keeps working
print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train R² Score:", train_r2)
print("Test R² Score:", test_r2)

In [None]:
# Split the tuned pipeline into its two fitted steps.
fitted_steps = best_model.named_steps
xgb_model = fitted_steps['model']

# Final column names as produced by the fitted preprocessor
feature_names = fitted_steps['preprocessor'].get_feature_names_out()

# Build the importance table and order it from most to least important.
importances = xgb_model.feature_importances_
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)