## 라이브러리

In [43]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

## 데이터 불러오기

In [44]:
X_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_train.csv")
y_train = pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_train.csv")
X_test= pd.read_csv("https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/X_test.csv")
y_test= pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/carsprice/y_test.csv')

display(X_train.head())
display(y_train.head())

Unnamed: 0,carID,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,13207,hyundi,Santa Fe,2019,Semi-Auto,4223,Diesel,145.0,39.8,2.2
1,17314,vauxhall,GTC,2015,Manual,47870,Diesel,125.0,60.1,2.0
2,12342,audi,RS4,2019,Automatic,5151,Petrol,145.0,29.1,2.9
3,13426,vw,Scirocco,2016,Automatic,20423,Diesel,30.0,57.6,2.0
4,16004,skoda,Scala,2020,Semi-Auto,3569,Petrol,145.0,47.1,1.0


Unnamed: 0,carID,price
0,13207,31995
1,17314,7700
2,12342,58990
3,13426,12999
4,16004,16990


## 1. 데이터셋 확인, EDA

### shape

In [10]:
print(X_train.shape)
print('-'*80)
print(y_train.shape)
print('-'*80)
print(X_test.shape)
print('-'*80)

(4960, 10)
--------------------------------------------------------------------------------
(4960, 2)
--------------------------------------------------------------------------------
(2672, 10)
--------------------------------------------------------------------------------


### info

In [11]:
print(X_train.info())
print('-'*80)
print(y_train.info())
print('-'*80)
print(X_test.info())
print('-'*80)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carID         4960 non-null   int64  
 1   brand         4960 non-null   object 
 2   model         4960 non-null   object 
 3   year          4960 non-null   int64  
 4   transmission  4960 non-null   object 
 5   mileage       4960 non-null   int64  
 6   fuelType      4960 non-null   object 
 7   tax           4960 non-null   float64
 8   mpg           4960 non-null   float64
 9   engineSize    4960 non-null   float64
dtypes: float64(3), int64(3), object(4)
memory usage: 387.6+ KB
None
--------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4960 entries, 0 to 4959
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   carID   4960 non-null   int64
 1   price   496

### isna

In [12]:
print(X_train.isna().sum().sum())
print('-'*80)
print(y_train.isna().sum().sum())
print('-'*80)
print(X_test.isna().sum().sum())
print('-'*80)

0
--------------------------------------------------------------------------------
0
--------------------------------------------------------------------------------
0
--------------------------------------------------------------------------------


### 불필요 컬럼 제거

In [50]:
X_train = X_train.drop(columns='carID')
X_test = X_test.drop(columns='carID')

In [54]:
X_test

Unnamed: 0,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,merc,GLS Class,2017,Automatic,12046,Diesel,150.0,37.2,3.0
1,vw,Amarok,2017,Automatic,37683,Diesel,260.0,36.2,3.0
2,merc,GLS Class,2019,Automatic,10000,Diesel,145.0,34.0,3.0
3,skoda,Scala,2019,Manual,3257,Petrol,145.0,49.6,1.0
4,audi,RS6,2015,Semi-Auto,20982,Petrol,325.0,29.4,4.0
...,...,...,...,...,...,...,...,...,...
2667,audi,A7,2015,Semi-Auto,21100,Petrol,325.0,29.7,4.0
2668,merc,CLS Class,2015,Automatic,60972,Diesel,160.0,52.3,3.0
2669,ford,Puma,2020,Manual,4111,Petrol,145.0,50.4,1.0
2670,merc,CLA Class,2016,Automatic,25726,Petrol,200.0,41.5,2.0


## 2. Feature Engineering

### 표준화(스케일링)

In [55]:
Scaler = MinMaxScaler()
X_train[['mileage', 'tax','mpg']] = Scaler.fit_transform(X_train[['mileage', 'tax','mpg']])
X_test[['mileage', 'tax','mpg']] = Scaler.fit_transform(X_test[['mileage', 'tax','mpg']])

### 원-핫 인코딩(pd.get_dummies)

In [56]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

### 데이터 분류(train_test_split)

In [57]:
X_tr, X_ts, y_tr, y_ts = train_test_split(X_train, y_train, test_size = 0.4)

## 3. 모델링

### 랜덤포레스트

In [58]:
modelRF = RandomForestRegressor()
modelRF.fit(X_tr, y_tr)
modelRF.score(X_tr, y_tr)

  modelRF.fit(X_tr, y_tr)


0.9920585847033642

## 4. 모델 학습 및 예측, 평가

In [59]:
from sklearn.metrics import r2_score
y_pred = modelRF.predict(X_ts)
print('RF r2_score', r2_score(y_ts, y_pred))

RF r2_score 0.9205716180953991
