In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
%matplotlib inline

In [2]:
# 파일 불러오기
#예측하고자 하는 데이터는 잡은 물고기의 수임
#예측하고자 하는 데이터는 count컬럼에 저장

df = pd.read_csv('fish.csv')

In [3]:
df.head()

Unnamed: 0,nofish,livebait,camper,persons,child,xb,zg,count
0,1,0,0,1,0,-0.896315,3.050405,0
1,0,1,1,1,0,-0.558345,1.746149,0
2,0,1,0,1,0,-0.401731,0.279939,0
3,0,1,1,2,1,-0.956298,-0.601526,0
4,0,1,0,1,0,0.436891,0.527709,1


In [4]:
from collections import Counter
#count 컬럼에 저장된 데이터와 빈도수를 출력
#(0,142) : 0의 빈도수가 142
#(1,31) : 1의 빈도수가 31

#대부분이 0마리 즉 물고기를 잡지 못함

sorted(Counter(df["count"]).items())

[(0, 142),
 (1, 31),
 (2, 20),
 (3, 12),
 (4, 6),
 (5, 10),
 (6, 4),
 (7, 3),
 (8, 2),
 (9, 2),
 (10, 1),
 (11, 1),
 (13, 1),
 (14, 1),
 (15, 2),
 (16, 1),
 (21, 2),
 (22, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 2),
 (38, 1),
 (65, 1),
 (149, 1)]

In [5]:
#대부분의 데이터가 5미만이므로 5미만의 데이터만을 
#분석하도록 df에 대입
df=df[df['count']<=5]
df.head()

Unnamed: 0,nofish,livebait,camper,persons,child,xb,zg,count
0,1,0,0,1,0,-0.896315,3.050405,0
1,0,1,1,1,0,-0.558345,1.746149,0
2,0,1,0,1,0,-0.401731,0.279939,0
3,0,1,1,2,1,-0.956298,-0.601526,0
4,0,1,0,1,0,0.436891,0.527709,1


In [6]:
from collections import Counter

#count 컬럼에 저장된 데이터와 빈도수를 출력
#(0,142) : 0의 빈도수가 142
#(1,31) : 1의 빈도수가 31

#대부분이 0마리 즉 물고기를 잡지 못함

sorted(Counter(df["count"]).items())

[(0, 142), (1, 31), (2, 20), (3, 12), (4, 6), (5, 10)]

In [7]:
from imblearn.over_sampling import SMOTENC



In [8]:
#예측하고자 하는 데이터 count를 y에 대입
y=df["count"]
y

0      0
1      0
2      0
3      0
4      1
5      0
6      0
7      0
8      0
9      1
10     0
11     0
12     1
13     2
14     0
15     1
16     0
17     0
18     1
19     0
20     1
21     5
22     0
23     3
25     0
27     0
28     0
29     0
30     0
31     0
      ..
219    0
220    0
221    2
222    0
223    0
224    0
225    1
226    4
227    0
228    0
229    2
230    3
231    0
232    0
233    0
234    0
235    1
236    2
237    0
239    4
240    1
241    1
242    0
243    1
244    0
245    0
246    0
247    0
248    0
249    0
Name: count, Length: 221, dtype: int64

In [9]:
#df를 복사 :df.copy() 
#count컬럼을 제거하고 X에 대입
X=df.copy().drop(["count"],axis=1)
X.head()

Unnamed: 0,nofish,livebait,camper,persons,child,xb,zg
0,1,0,0,1,0,-0.896315,3.050405
1,0,1,1,1,0,-0.558345,1.746149
2,0,1,0,1,0,-0.401731,0.279939
3,0,1,1,2,1,-0.956298,-0.601526
4,0,1,0,1,0,0.436891,0.527709


In [10]:
#적은 빈도수의 데이터를 Oversampling 해서 데이터들의 빈도수를 맞춰주는 
#SMOTENC객체 생성
#oversampling 할 데이터는 0이상 5미만 : categorical_features=[0, 6]
#매번 같은 데이터가 생성되도록 random_state=0 을 설정

smote_nc = SMOTENC(categorical_features=[0, 6], random_state=0)


In [11]:
#y에 저장된 데이터를 기준으로 y의 빈도수가 적은 데이터를 Oversampling
X_resampled, y_resampled = smote_nc.fit_resample(X, y)



In [12]:
from collections import Counter

#count 컬럼에 저장된 데이터와 빈도수를 출력
#(0,142) : 0의 빈도수가 142

#모든 데이터의 빈도수가 같아짐

sorted(Counter(y_resampled).items())

[(0, 142), (1, 142), (2, 142), (3, 142), (4, 142), (5, 142)]

In [13]:
from sklearn.model_selection import train_test_split

#학습데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(
                       X, y, stratify=y, test_size=0.2, random_state=42)

In [14]:
X_train.head()

Unnamed: 0,nofish,livebait,camper,persons,child,xb,zg
239,0,1,1,2,0,1.724751,0.927847
27,0,1,0,2,1,1.606172,-1.064543
114,0,1,0,3,2,0.35158,-2.184268
40,0,1,1,1,0,-0.711814,3.020478
120,0,1,0,2,1,-0.2674,-1.620429


In [15]:
X_test.head()

Unnamed: 0,nofish,livebait,camper,persons,child,xb,zg
136,0,1,1,4,1,1.963856,0.03152
165,1,1,0,1,0,0.132049,2.244067
31,0,1,1,3,1,0.745259,-0.663867
21,0,1,1,4,0,1.642112,1.892821
34,0,1,1,1,0,-1.108258,0.772088


In [16]:
y_train.head()

239    4
27     0
114    0
40     0
120    0
Name: count, dtype: int64

In [17]:
y_test.head()

136    3
165    1
31     0
21     5
34     0
Name: count, dtype: int64

In [18]:
lr = LinearRegression()
#선형 회귀 실행
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
from sklearn.metrics import mean_squared_error , mean_absolute_error
#선형 회귀 예측
y_pred = lr.predict(X_test)
#에러 
RMSE = (mean_squared_error(y_test, y_pred))**0.5
#에러의 제곱
MSE = mean_squared_error(y_test, y_pred)
#에러의 평균치
MAE = mean_absolute_error(y_test,y_pred)

print('\nRMSE : ',RMSE ,'\nMSE :',MSE,'\nMAE : ' , MAE )


RMSE :  0.9784488138849033 
MSE : 0.9573620813927742 
MAE :  0.7763191654164279
