In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [4]:
train_data = pd.read_csv("../data/train.csv")
test_data = pd.read_csv("../data/test.csv")

In [5]:
display(train_data.shape)
display(test_data.shape)

(11504798, 12)

(7669866, 11)

# Data Preprocessing

In [6]:
# 필요 없는 변수 제거
train_data.drop(['id'], axis=1, inplace=True)
test_data.drop(['id'], axis=1, inplace=True)

In [7]:
# 범주형 변수 object로 변환
# 범주형 데이터 int -> object로 변환
train_data['Driving_License'] = train_data['Driving_License'].astype('object')
train_data['Region_Code'] = train_data['Region_Code'].astype('object')
train_data['Previously_Insured'] = train_data['Previously_Insured'].astype('object')
train_data['Policy_Sales_Channel'] = train_data['Policy_Sales_Channel'].astype('object')
#train_data['Response'] = train_data['Response'].astype('object')

test_data['Driving_License'] = test_data['Driving_License'].astype('object')
test_data['Region_Code'] = test_data['Region_Code'].astype('object')
test_data['Previously_Insured'] = test_data['Previously_Insured'].astype('object')
test_data['Policy_Sales_Channel'] = test_data['Policy_Sales_Channel'].astype('object')

In [8]:
# 데이터 분할
x_train = train_data.drop(['Response'], axis=1)
y_train = train_data['Response']
x_test = test_data

# 범주형 데이터 더미처리
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

In [9]:
# 학습 데이터와 테스트 데이터에서 동일한 컬럼을 유지
x_train, x_test = x_train.align(x_test, join='outer', axis=1, fill_value=0)

In [10]:
# 데이터 샘플링 (예: 10% 데이터만 사용)
sample_fraction = 0.01  # 샘플 비율 (1%)
x_sample, y_sample = x_train[:int(sample_fraction * len(x_train))], y_train[:int(sample_fraction * len(y_train))]


In [9]:
# 모델 불러오기
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


# 모델 정의
model = RandomForestClassifier()

# Val DataSet으로 성능 확인
x_tr, x_val, y_tr, y_val = train_test_split(x_sample, y_sample, test_size=0.3, random_state=42)

# 모델 학습
model.fit(x_tr, y_tr)


In [10]:
# 예측
y_pred = model.predict(x_val)

# 성능 확인
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
#display(accuracy_score(y_val, y_pred))
#display(f1_score(y_val, y_pred))
#display(recall_score(y_val, y_pred))
#display(precision_score(y_val, y_pred))
display(roc_auc_score(y_val, y_pred))

0.5579392189063478

In [1]:
from pycaret.classification import *

In [2]:
import mlflow

In [11]:
pycaretData = pd.concat([x_sample, y_sample], axis=1)

In [12]:
pycaretData

Unnamed: 0,Age,Annual_Premium,Driving_License_0,Driving_License_1,Gender_Female,Gender_Male,Policy_Sales_Channel_1.0,Policy_Sales_Channel_10.0,Policy_Sales_Channel_100.0,Policy_Sales_Channel_101.0,...,Region_Code_7.0,Region_Code_8.0,Region_Code_9.0,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_No,Vehicle_Damage_Yes,Vintage,Response
0,21,65101.0,False,True,False,True,False,False,False,False,...,False,False,False,True,False,False,False,True,187,0
1,43,58911.0,False,True,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,288,1
2,25,38043.0,False,True,True,False,False,False,False,False,...,False,False,False,False,True,False,True,False,254,0
3,35,2630.0,False,True,True,False,False,False,False,False,...,False,False,False,True,False,False,False,True,76,0
4,36,31951.0,False,True,True,False,False,False,False,False,...,False,False,False,True,False,False,True,False,294,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115042,46,23178.0,False,True,False,True,False,False,False,False,...,False,False,False,True,False,False,False,True,274,0
115043,50,40512.0,False,True,False,True,False,False,False,False,...,False,False,False,True,False,False,False,True,146,1
115044,23,24953.0,False,True,False,True,False,False,False,False,...,False,False,False,False,True,False,False,True,210,0
115045,25,2630.0,False,True,True,False,False,False,False,False,...,False,False,False,False,True,False,True,False,193,0


In [13]:
# SETUP
exp = setup(pycaretData, target='Response',numeric_features=['Age', 'Annual_Premium'], normalize=True)
#데이터 : data
#타겟 : class 열
#numeric_features : d_ct, ip_ct 열을 숫자형 열로 인식 시키기 위해 설정 (안할시 카테고리 열로 인식함)


Unnamed: 0,Description,Value
0,Session id,4923
1,Target,Response
2,Target type,Binary
3,Original data shape,"(115047, 223)"
4,Transformed data shape,"(115047, 223)"
5,Transformed train set shape,"(80532, 223)"
6,Transformed test set shape,"(34515, 223)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple
