# 선형회귀

현대중공업과 계약을 맺어 일부 선박에 대한 예측 모델을 구축하게 됐습니다. 현대중공업은 세계 최대의 선박 제조업체 중 하나로 유람선을 제작하고 있습니다.
당신은 선박에 필요한 승무원 수를 정확하게 예측할 수 있도록 돕기 위해 울산에 있는 본사에 도착했습니다.
그들은 현재 새로운 선박을 건조하고 있으며 예측 모델을 만들고, 이를 사용하여 선박에 필요한 승무원 수를 예측하기를 원합니다.

지금까지의 데이터는 다음과 같습니다.

    Description: Measurements of ship size, capacity, crew, and age for 158 cruise
    ships.


    Variables/Columns
    Ship Name     1-20
    Cruise Line   21-40
    Age (as of 2013)   46-48
    Tonnage (1000s of tons)   50-56
    passengers (100s)   58-64
    Length (100s of feet)  66-72
    Cabins  (100s)   74-80
    Passenger Density   82-88
    Crew  (100s)   90-96
    
위 데이터는 "cruise_ship_info.csv"라는 CSV 파일에 저장되어 있습니다. 귀하의 임무는 향후 선박에 필요한 승무원 수를 예측하는 데 도움이 되는 회귀 모델을 만드는 것입니다. 고객은 또한 크루즈 라인에 따라 허용되는 승무원 수에 차이가 있음을 발견했으므로, 크루즈 라인을 분석에 포함하는 것이 매우 중요한 특성(feature)이라고 언급했습니다!

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

In [2]:
# Load the cruise ship dataset from the local data directory into a DataFrame.
df = pd.read_csv("./data/cruise_ship_info.csv")

In [3]:
# Preview the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
0,Journey,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
1,Quest,Azamara,6,30.277,6.94,5.94,3.55,42.64,3.55
2,Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7
3,Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
4,Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0


In [4]:
# Inspect each column's dtype to see which columns are text and which are numeric.
df.dtypes

Ship_name             object
Cruise_line           object
Age                    int64
Tonnage              float64
passengers           float64
length               float64
cabins               float64
passenger_density    float64
crew                 float64
dtype: object

In [5]:
# Print per-column non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158 entries, 0 to 157
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Ship_name          158 non-null    object 
 1   Cruise_line        158 non-null    object 
 2   Age                158 non-null    int64  
 3   Tonnage            158 non-null    float64
 4   passengers         158 non-null    float64
 5   length             158 non-null    float64
 6   cabins             158 non-null    float64
 7   passenger_density  158 non-null    float64
 8   crew               158 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 11.2+ KB


In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(158, 9)

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Tonnage,passengers,length,cabins,passenger_density,crew
count,158.0,158.0,158.0,158.0,158.0,158.0,158.0
mean,15.689873,71.284671,18.457405,8.130633,8.83,39.900949,7.794177
std,7.615691,37.22954,9.677095,1.793474,4.471417,8.639217,3.503487
min,4.0,2.329,0.66,2.79,0.33,17.7,0.59
25%,10.0,46.013,12.535,7.1,6.1325,34.57,5.48
50%,14.0,71.899,19.5,8.555,9.57,39.085,8.15
75%,20.0,90.7725,24.845,9.51,10.885,44.185,9.99
max,48.0,220.0,54.0,11.82,27.0,71.43,21.0


In [8]:
# Keep the cruise line, the size/capacity measurements and the target (crew);
# Ship_name and Age are left out.
# .copy() makes df2 an independent DataFrame rather than a selection derived from
# df, so adding columns to df2 afterwards does not raise SettingWithCopyWarning.
df2 = df[["Cruise_line", "Tonnage", "passengers", "length", "cabins", "passenger_density", "crew"]].copy()

In [9]:
# Preview the reduced frame to confirm the selected columns.
df2.head()

Unnamed: 0,Cruise_line,Tonnage,passengers,length,cabins,passenger_density,crew
0,Azamara,30.277,6.94,5.94,3.55,42.64,3.55
1,Azamara,30.277,6.94,5.94,3.55,42.64,3.55
2,Carnival,47.262,14.86,7.22,7.43,31.8,6.7
3,Carnival,110.0,29.74,9.53,14.88,36.99,19.1
4,Carnival,101.353,26.42,8.92,13.21,38.36,10.0


In [10]:
# Preview the candidate engineered feature: passengers divided by
# passenger_density, computed row by row.
df["passengers"]/df["passenger_density"]

0      0.162758
1      0.162758
2      0.467296
3      0.804001
4      0.688738
         ...   
153    0.013038
154    0.500255
155    0.046663
156    0.052122
157    0.064341
Length: 158, dtype: float64

In [11]:
# Store the passengers / passenger_density ratio as a new "passenger_area" column.
# The right-hand side is computed from df and aligned to df2 by row index.
# NOTE(review): the sample rows suggest passenger_density is roughly
# 10 * Tonnage / passengers (e.g. 110.0 / 29.74 -> 36.99); if so, this ratio is
# not a literal area -- confirm the intended meaning of the feature.
df2["passenger_area"] = df["passengers"]/df["passenger_density"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["passenger_area"] = df["passengers"]/df["passenger_density"]


In [12]:
# Confirm the new passenger_area column was appended.
df2.head()

Unnamed: 0,Cruise_line,Tonnage,passengers,length,cabins,passenger_density,crew,passenger_area
0,Azamara,30.277,6.94,5.94,3.55,42.64,3.55,0.162758
1,Azamara,30.277,6.94,5.94,3.55,42.64,3.55,0.162758
2,Carnival,47.262,14.86,7.22,7.43,31.8,6.7,0.467296
3,Carnival,110.0,29.74,9.53,14.88,36.99,19.1,0.804001
4,Carnival,101.353,26.42,8.92,13.21,38.36,10.0,0.688738


In [13]:
# Narrow the frame to the modelling columns: the categorical cruise line, the two
# numeric predictors (cabins, passenger_area) and the target (crew).
df3 = df2.loc[:, ["Cruise_line", "cabins", "passenger_area", "crew"]]

In [14]:
# Preview the modelling frame.
df3.head()

Unnamed: 0,Cruise_line,cabins,passenger_area,crew
0,Azamara,3.55,0.162758,3.55
1,Azamara,3.55,0.162758,3.55
2,Carnival,7.43,0.467296,6.7
3,Carnival,14.88,0.804001,19.1
4,Carnival,13.21,0.688738,10.0


In [15]:
# One-hot encode the cruise line: one indicator column per line, named "line_<name>".
onehot_line = pd.get_dummies(
    data=df3["Cruise_line"],
    prefix="line",
)

In [16]:
# Append the indicator columns to the right of the modelling frame (column-wise join
# on the shared row index).
df4 = pd.concat((df3, onehot_line), axis="columns")

In [17]:
# The raw text column is now redundant with its indicator columns; remove it in place.
df4.drop(columns=["Cruise_line"], inplace=True)

In [18]:
# Target is the crew column; every remaining column of df4 is a predictor.
Y = df4["crew"]
X = df4.drop(columns=["crew"])

In [19]:
# Preview the final numeric frame (predictors, target and cruise-line indicators).
df4.head()

Unnamed: 0,cabins,passenger_area,crew,line_Azamara,line_Carnival,line_Celebrity,line_Costa,line_Crystal,line_Cunard,line_Disney,...,line_Oceania,line_Orient,line_P&O,line_Princess,line_Regent_Seven_Seas,line_Royal_Caribbean,line_Seabourn,line_Silversea,line_Star,line_Windstar
0,3.55,0.162758,3.55,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.55,0.162758,3.55,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7.43,0.467296,6.7,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,14.88,0.804001,19.1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13.21,0.688738,10.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

In [21]:
# Fit the polynomial expansion on the training predictors only, then apply the same
# fitted transformer to both splits.
# With degree=1 and include_bias=False no interaction/power terms or constant column
# are added, so the column count stays the same as the input.
pf = PolynomialFeatures(degree=1, include_bias=False).fit(x_train)
poly_train = pf.transform(x_train)
poly_test = pf.transform(x_test)
# Display the training matrix dimensions as the cell output.
poly_train.shape

(110, 22)

In [22]:
# Learn the per-column mean and standard deviation from the training data only, then
# standardize both splits with those same statistics so no test-set information
# leaks into the scaling.
ss = StandardScaler().fit(poly_train)
scaled_train = ss.transform(poly_train)
scaled_test = ss.transform(poly_test)

In [23]:
# Fit an L2-regularized linear regression (alpha=0.5) on the standardized training
# features, then display its score (R^2) on the held-out test set.
rid = Ridge(alpha=0.5).fit(scaled_train, y_train)
rid.score(scaled_test, y_test)

0.9597248125893679

In [24]:
# Predict crew for the held-out test rows with the fitted ridge model.
pred = rid.predict(scaled_test)

In [25]:
mae = mean_squared_error(y_test, pred)

In [26]:
mae

0.2941764279105156

- 승객 밀도와 승객 수 데이터가 모두 존재하므로, 두 독립변수를 조합하면 승객 구역의 면적을 구할 수 있다는 점에 착안하여 새로운 특성(passenger_area)을 추출해 모델 생성에 사용하였습니다.