In [49]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [50]:
train = pd.read_csv('./train_df_errno.csv')
test = pd.read_csv('./test_df.csv')
sub = pd.read_csv('./sample_submission.csv')
age = pd.read_csv('./age_gender_info.csv')

In [51]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2896 entries, 0 to 2895
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   단지코드        2896 non-null   object 
 1   총세대수        2896 non-null   int64  
 2   임대건물구분      2896 non-null   object 
 3   지역          2896 non-null   object 
 4   공급유형        2896 non-null   object 
 5   전용면적        2896 non-null   float64
 6   전용면적별세대수    2896 non-null   int64  
 7   공가수         2896 non-null   float64
 8   자격유형        2896 non-null   object 
 9   임대보증금       2327 non-null   object 
 10  임대료         2327 non-null   object 
 11  10분내지하철수    2685 non-null   float64
 12  10분내버스정류장수  2892 non-null   float64
 13  단지내주차면수     2896 non-null   float64
 14  등록차량수       2896 non-null   float64
dtypes: float64(6), int64(2), object(7)
memory usage: 339.5+ KB


In [52]:
train.columns, test.columns

(Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
        '자격유형', '임대보증금', '임대료', '10분내지하철수', '10분내버스정류장수', '단지내주차면수', '등록차량수'],
       dtype='object'),
 Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
        '자격유형', '임대보증금', '임대료', '10분내지하철수', '10분내버스정류장수', '단지내주차면수'],
       dtype='object'))

In [53]:
train.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,등록차량수
0,C2515,545,아파트,경상남도,국민임대,33.48,276,17.0,A,9216000,82940,0.0,3.0,624.0,205.0
1,C2515,545,아파트,경상남도,국민임대,39.6,60,17.0,A,12672000,107130,0.0,3.0,624.0,205.0
2,C2515,545,아파트,경상남도,국민임대,39.6,20,17.0,A,12672000,107130,0.0,3.0,624.0,205.0
3,C2515,545,아파트,경상남도,국민임대,46.9,38,17.0,A,18433000,149760,0.0,3.0,624.0,205.0
4,C2515,545,아파트,경상남도,국민임대,46.9,19,17.0,A,18433000,149760,0.0,3.0,624.0,205.0


### 단순선형회귀 모델

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [74]:
sel = ['총세대수', '전용면적', '전용면적별세대수', '공가수','단지내주차면수']
X = train[sel]
y = train['등록차량수']

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 0)

### 모델 만들기

In [75]:
model = LinearRegression()   #모델 생성
model.fit(X_train,y_train)   #모델 훈련
pred = model.predict(X_test) # 새로운 데이터로 예측


### 체크

In [76]:
model.coef_ , model.intercept_

(array([-0.15912372,  0.48962892,  0.0719852 , -6.98404745,  1.09596934]),
 100.64601235358606)

### 모델 평가하기

In [77]:
import numpy as np

### MAE

In [103]:
### MAE , MSE, RMSE
mae_val = np.sum(abs(y_test-pred)) / len(pred)
mae_val

149.1291057839242

### MSE

In [104]:
mas_val = np.sum((y_test-pred)**2)/len(pred)
print(mae_val)
mse_val = np.mean((y_test-pred)**2)
print(mae_val)

149.1291057839242
149.1291057839242


### RMSE

In [105]:
rmse_val = np.sqrt(mse_val)
print(rmse_val)
rmse_val = mse_val **0.5
print(rmse_val)

206.64377137848416
206.64377137848416


### 피처수 늘리기

In [106]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

In [119]:
sel = ['총세대수', '전용면적', '전용면적별세대수', '공가수', '단지내주차면수']
X = train[sel]
y = train['등록차량수']
nor_X = MinMaxScaler().fit_transform(X)  # 입력 데이터 정규화
ex_X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(nor_X)  # 데이터 feature 추가 생성
X_train, X_test, y_train, y_test = train_test_split(ex_X, y, random_state=0)

In [120]:
X_train.shape

(2172, 20)

### LASSO 모델, Ridge 모델 적용하기

In [121]:
from sklearn.linear_model import Lasso, Ridge

In [125]:
model = Lasso(alpha = 0.10)
model.fit(X_train, y_train)
pred = model.predict(X_test)
pred[0:10]


array([1977.68049941,  374.04118914,  800.65358773,  495.32293407,
        119.88243328,  727.04378172,  443.27929945,  296.52617212,
        657.31517429,  432.00344919])

### 평가하기(mae, mse, rmse)

In [126]:
mae_val = np.mean(abs(y_test - pred))
print(mae_val)
mse_val = np.mean((y_test-pred)**2)
print(mse_val)
rmse_val = mse_val**0.5
print(rmse_val)

132.33255491876386
35621.82271663757
188.73744386485043


In [117]:
model = Ridge(alpha = 0.01)
model.fit(X_train, y_train)
pred = model.predict(X_test)
pred[0:10]

array([1716.26351148,  348.61048344,  783.88142005,  538.87547127,
        107.12306215,  717.12036301,  432.67696154,  276.12112914,
        644.72978342,  439.68033215])