In [1]:
%matplotlib inline 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

# Data 

데이터 출처 : https://www.kaggle.com/avikasliwal/used-cars-price-prediction 

< y > 
* **Price** : The price of the used car in INR Lakhs.



< X > 
* Name : The brand and model of the car
* Location : The location in which the car is being sold or is available for purchase.
* Year : The year or edition of the model.
* Kilometers_Driven : The total kilometres driven in the car by the previous owner(s) in KM.
* Fuel_Type : The type of fuel used by the car. (Petrol, Diesel, Electric, CNG, LPG)
* Transmission : The type of transmission used by the car. (Automatic / Manual)
* Owner_Type : Whether the car is being sold by its first owner, second owner, or a later owner.
* Mileage : The standard mileage offered by the car company, in kmpl (km per litre) or km/kg.
* Engine : The displacement volume of the engine in CC.
* Power : The maximum power of the engine in bhp.
* Seats : The number of seats in the car.
* New_Price : The price of a new car of the same model.

In [2]:
# Read the used-car listings, using the first CSV column as the row index.
csv_path = "assignment2_data.csv"
data = pd.read_csv(csv_path, index_col=0)
data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [3]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(6019, 13)

In [4]:
# Per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  New_Price          824 non-null    object 
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(9)
memory usage: 658.3+ KB


In [5]:
# Summary statistics for every column, numeric and non-numeric alike (include='all').
data.describe(include='all')

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
count,6019,6019,6019.0,6019.0,6019,6019,6019,6017,5983,5983,5977.0,824,6019.0
unique,1876,11,,,5,2,4,442,146,372,,540,
top,Mahindra XUV500 W8 2WD,Mumbai,,,Diesel,Manual,First,18.9 kmpl,1197 CC,74 bhp,,95.13 Lakh,
freq,49,790,,,3205,4299,4929,172,606,235,,6,
mean,,,2013.358199,58738.38,,,,,,,5.278735,,9.479468
std,,,3.269742,91268.84,,,,,,,0.80884,,11.187917
min,,,1998.0,171.0,,,,,,,0.0,,0.44
25%,,,2011.0,34000.0,,,,,,,5.0,,3.5
50%,,,2014.0,53000.0,,,,,,,5.0,,5.64
75%,,,2016.0,73000.0,,,,,,,5.0,,9.95


In [6]:
# Count the missing values in each column.
data.isna().sum()

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  42
New_Price            5195
Price                   0
dtype: int64

In [7]:
# New_Price is missing for most rows, so remove the whole column first;
# afterwards discard every row that still contains a missing value.
data = data.drop(columns='New_Price').dropna()

In [8]:
# Remove Location, treated here as unnecessary because it has 10 or more categories.
data = data.drop('Location', axis=1)

In [9]:
# Convert the unit-suffixed text columns to numeric values.
# Each entry looks like "<number> <unit>" (e.g. "26.6 km/kg", "998 CC", "58.16 bhp"),
# so keep only the first whitespace-separated token.
unit_cols = ['Mileage', 'Engine', 'Power']
for col in unit_cols:
    data[col] = data[col].astype(str).str.split().str[0]

# Rows whose Power token is the literal string 'null' cannot be cast to float.
# Select the remaining rows with a boolean mask and take an explicit copy, so the
# column assignment below writes to an independent frame rather than a slice.
data = data.loc[data['Power'] != 'null'].copy()
data[unit_cols] = data[unit_cols].astype('float')

In [10]:
# Reduce each car name to its first word, i.e. the manufacturer.
data = data.assign(Name=data['Name'].str.split().str.get(0))

In [11]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column from the generated dummy columns.
data = pd.get_dummies(
    data,
    columns=['Name', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Seats'],
    drop_first=True,
)
data.head()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Price,Name_Audi,Name_BMW,Name_Bentley,Name_Chevrolet,...,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third,Seats_4.0,Seats_5.0,Seats_6.0,Seats_7.0,Seats_8.0,Seats_9.0,Seats_10.0
0,2010,72000,26.6,998.0,58.16,1.75,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2015,41000,19.67,1582.0,126.2,12.5,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2011,46000,18.2,1199.0,88.7,4.5,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2012,87000,20.77,1248.0,88.76,6.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2013,40670,15.2,1968.0,140.8,17.74,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0


In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Predictors are every column except the target, Price.
X = data.drop(columns='Price')
y = data['Price']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit the linear model on the training split only.
model = LinearRegression().fit(X_train, y_train)

# Predictions for both splits, kept so train and test error can be compared.
pred, pred_test = model.predict(X_train), model.predict(X_test)

In [13]:
# Show the fitted intercept followed by the coefficient vector.
for label, value in (('intercept:', model.intercept_), ('회귀계수:', model.coef_)):
    print(label, value)

intercept: -2096.4375616600323
회귀계수: [ 1.05190187e+00  3.02037014e-06 -1.22863730e-01  1.69645225e-03
  7.86135857e-02  3.86514001e+00  3.02402510e+00  6.82313817e+00
 -5.92530267e+00 -7.23737547e+00 -5.79264139e+00 -9.38664976e+00
 -4.77041431e+00 -6.17062113e+00 -5.24893502e+00 -9.42906996e+00
 -6.39019704e+00  1.07148467e+01 -4.06512955e+00  4.84204449e+01
  1.69631104e+01 -9.13337830e+00 -4.17375547e+00  5.62591385e+00
  7.66380570e+00 -5.79723431e+00 -5.68309151e+00  1.78169912e+01
 -5.82263761e+00 -5.64323661e+00 -7.01581875e+00 -5.07167450e+00
 -6.03933239e+00 -2.12092026e+00  1.20247436e+00  3.10246978e+00
  1.18369471e-01  2.79795026e-02  2.87354808e-01 -6.33824596e-01
  4.38838246e-01 -1.42774652e+01 -1.88495835e+01 -1.68588696e+01
 -1.72804887e+01 -1.73743218e+01 -1.46001029e+01 -1.16567640e+01]


In [14]:
# Mean squared error on the training split and on the held-out split.
train_mse = mean_squared_error(y_train, pred)
test_mse = mean_squared_error(y_test, pred_test)
print('train MSE: ', train_mse)
print('test MSE : ', test_mse)

train MSE:  27.813897108455045
test MSE :  29.571340743009955
