# 2019.11.17. 머신러닝 Regression with python3.6

In [18]:
!pip install --upgrade pandas==0.24.0

Requirement already up-to-date: pandas==0.24.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (0.24.0)


## 0. Package 가져오기

In [26]:
import pandas as pd
import numpy as np

print(pd.__version__)
print(np.__version__)

0.23.4
1.16.2


## 1.  CSV 데이터 가져오기

In [33]:
data = pd.read_csv('SR_Data.csv')
data.head(10)

Unnamed: 0,Country,Age,Year,Salary
0,France,44.0,15.0,72000
1,Spain,27.0,3.0,48000
2,Germany,30.0,2.0,54000
3,Spain,38.0,,61000
4,Germany,40.0,10.0,61000
5,France,35.0,,58000
6,Spain,,6.0,52000
7,France,48.0,,79000
8,Germany,50.0,21.0,83000
9,France,37.0,7.0,67000


In [34]:
data.shape

(10, 4)

In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country    10 non-null object
Age        9 non-null float64
Year       7 non-null float64
Salary     10 non-null int64
dtypes: float64(2), int64(1), object(1)
memory usage: 400.0+ bytes


In [36]:
data.isnull().sum()

Country    0
Age        1
Year       3
Salary     0
dtype: int64

## 2. feature/label 나누기

In [37]:
# X = data[['Country', 'Age', 'Year']].to_numpy() 이런 방식으로는 사용하지 않음
# y = data['Salary'].to_numpy()

# X = data.iloc[:, :-1].to_numpy() # 2차원 이상 벡터는 대문자
# y = data.iloc[:, -1].to_numpy() # 1차원 벡터는 소문자

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

print(X)
print(y)

[['France' 44.0 15.0]
 ['Spain' 27.0 3.0]
 ['Germany' 30.0 2.0]
 ['Spain' 38.0 nan]
 ['Germany' 40.0 10.0]
 ['France' 35.0 nan]
 ['Spain' nan 6.0]
 ['France' 48.0 nan]
 ['Germany' 50.0 21.0]
 ['France' 37.0 7.0]]
[72000 48000 54000 61000 61000 58000 52000 79000 83000 67000]


## 3. Clean Missing Data

In [38]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# 각각 다른 컬럼을 채워주고 싶은 경우 fit, transform을 따로 시행
# imputer.fit(X[:, 1:]) # nan값을 대체할 값 학습
# X[:, 1:] = imputer.transform(X[:, 1:]) # 데이터에 nan값이 등장하면 위의 fit된 값으로 변경

# 한번에 채워주고 싶은 경우
X[:, 1:] = imputer.fit_transform(X[:, 1:]) 

print(X)

[['France' 44.0 15.0]
 ['Spain' 27.0 3.0]
 ['Germany' 30.0 2.0]
 ['Spain' 38.0 9.142857142857142]
 ['Germany' 40.0 10.0]
 ['France' 35.0 9.142857142857142]
 ['Spain' 38.77777777777778 6.0]
 ['France' 48.0 9.142857142857142]
 ['Germany' 50.0 21.0]
 ['France' 37.0 7.0]]


In [39]:
print(X[:, :2]) # 범위(, ) 포함 범위 주의! 앞은 포함 뒤는 포함 X

[['France' 44.0]
 ['Spain' 27.0]
 ['Germany' 30.0]
 ['Spain' 38.0]
 ['Germany' 40.0]
 ['France' 35.0]
 ['Spain' 38.77777777777778]
 ['France' 48.0]
 ['Germany' 50.0]
 ['France' 37.0]]


In [40]:
# 도움말 보기
?imputer

In [43]:
pd.DataFrame(X).isnull().sum()

0    0
1    0
2    0
dtype: int64

## 4. Make Categorical

In [45]:
# 버전 에러를 없애고 싶을 때 아래 Label인코딩, onehot인코딩 대신 이것만 시행
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer([('one_hot_encoder', OneHotEncoder(), [0])], remainder='passthrough')
XX = ct.fit_transform(X)

print(XX) 

[[1.0 0.0 0.0 44.0 15.0]
 [0.0 0.0 1.0 27.0 3.0]
 [0.0 1.0 0.0 30.0 2.0]
 [0.0 0.0 1.0 38.0 9.142857142857142]
 [0.0 1.0 0.0 40.0 10.0]
 [1.0 0.0 0.0 35.0 9.142857142857142]
 [0.0 0.0 1.0 38.77777777777778 6.0]
 [1.0 0.0 0.0 48.0 9.142857142857142]
 [0.0 1.0 0.0 50.0 21.0]
 [1.0 0.0 0.0 37.0 7.0]]


In [46]:
# 문자 변수를 숫자 변수로 변환
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()
X[:, 0] = labelEncoder.fit_transform(X[:, 0])

print(X)

[[0 44.0 15.0]
 [2 27.0 3.0]
 [1 30.0 2.0]
 [2 38.0 9.142857142857142]
 [1 40.0 10.0]
 [0 35.0 9.142857142857142]
 [2 38.77777777777778 6.0]
 [0 48.0 9.142857142857142]
 [1 50.0 21.0]
 [0 37.0 7.0]]


In [47]:
# 범주 변수를 가변수화하기
from sklearn.preprocessing import OneHotEncoder

onehotEncoder = OneHotEncoder(categorical_features=[0])
X = onehotEncoder.fit_transform(X).toarray()

print(X)

[[ 1.          0.          0.         44.         15.        ]
 [ 0.          0.          1.         27.          3.        ]
 [ 0.          1.          0.         30.          2.        ]
 [ 0.          0.          1.         38.          9.14285714]
 [ 0.          1.          0.         40.         10.        ]
 [ 1.          0.          0.         35.          9.14285714]
 [ 0.          0.          1.         38.77777778  6.        ]
 [ 1.          0.          0.         48.          9.14285714]
 [ 0.          1.          0.         50.         21.        ]
 [ 1.          0.          0.         37.          7.        ]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


## 5. Split Train/Test Set

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

print(X_train)

[[ 1.          0.          0.         48.          9.14285714]
 [ 0.          1.          0.         30.          2.        ]
 [ 0.          0.          1.         27.          3.        ]
 [ 1.          0.          0.         37.          7.        ]
 [ 0.          0.          1.         38.          9.14285714]
 [ 1.          0.          0.         44.         15.        ]
 [ 0.          0.          1.         38.77777778  6.        ]
 [ 1.          0.          0.         35.          9.14285714]]


## 6. Standardization

In [49]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train[:, 3:] = scaler.fit_transform(X_train[:, 3:])
X_test[:, 3:] = scaler.fit_transform(X_test[:, 3:])

print(X_train)

[[1.         0.         0.         1.         0.54945055]
 [0.         1.         0.         0.14285714 0.        ]
 [0.         0.         1.         0.         0.07692308]
 [1.         0.         0.         0.47619048 0.38461538]
 [0.         0.         1.         0.52380952 0.54945055]
 [1.         0.         0.         0.80952381 1.        ]
 [0.         0.         1.         0.56084656 0.30769231]
 [1.         0.         0.         0.38095238 0.54945055]]


## 7. Train

### 1) Linear Regression

In [50]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train) # 학습

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [51]:
?LinearRegression # 속성의 default값 살펴보기

Object `LinearRegression # 속성의 default값 살펴보기` not found.


### 2) Decision Tree

In [52]:
from sklearn.tree import DecisionTreeRegressor

regressor_tree = DecisionTreeRegressor()
regressor_tree.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

## 8. Predict(Scoring)

In [53]:
y_pred = regressor.predict(X_test)
y_pred_tree = regressor_tree.predict(X_test)

print(y_test)
print(y_pred)
print(y_pred_tree)

[83000 61000]
[72963.82803215 50662.6622963 ]
[79000. 48000.]


## 9. Evaluate

In [54]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

print("MAE : ", mean_absolute_error(y_test, y_pred))
print("MAE_tree : ", mean_absolute_error(y_test, y_pred_tree))

print("MSE : ", mean_squared_error(y_test, y_pred))
print("MSE_tree : ", mean_squared_error(y_test, y_pred_tree))

MAE :  10186.754835774522
MAE_tree :  8500.0
MSE :  103792649.28428817
MSE_tree :  92500000.0


In [56]:
# 사이킷런의 모든 알고리즘 살펴보기
from sklearn.utils.testing import all_estimators
estimators = all_estimators()

print(estimators)

[('ARDRegression', <class 'sklearn.linear_model.bayes.ARDRegression'>), ('AdaBoostClassifier', <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>), ('AdaBoostRegressor', <class 'sklearn.ensemble.weight_boosting.AdaBoostRegressor'>), ('AdditiveChi2Sampler', <class 'sklearn.kernel_approximation.AdditiveChi2Sampler'>), ('AffinityPropagation', <class 'sklearn.cluster.affinity_propagation_.AffinityPropagation'>), ('AgglomerativeClustering', <class 'sklearn.cluster.hierarchical.AgglomerativeClustering'>), ('BaggingClassifier', <class 'sklearn.ensemble.bagging.BaggingClassifier'>), ('BaggingRegressor', <class 'sklearn.ensemble.bagging.BaggingRegressor'>), ('BayesianGaussianMixture', <class 'sklearn.mixture.bayesian_mixture.BayesianGaussianMixture'>), ('BayesianRidge', <class 'sklearn.linear_model.bayes.BayesianRidge'>), ('BernoulliNB', <class 'sklearn.naive_bayes.BernoulliNB'>), ('BernoulliRBM', <class 'sklearn.neural_network.rbm.BernoulliRBM'>), ('Binarizer', <class 'sklearn.prepr

In [57]:
print(estimators.count)

<built-in method count of list object at 0x7ff16413bcc8>
