## Section 2.2 : Scikit_learn 샘플 Dataset예시

- 샘플 Dataset 불러오기

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

from sklearn.datasets import fetch_california_housing
import pandas as pd

In [2]:
ch = fetch_california_housing()

In [3]:
x = pd.DataFrame(ch.data, columns=ch.feature_names)
y = pd.DataFrame(ch.target, columns=["Avelnc"])
data = pd.concat([x,y], axis = 1)

In [4]:
data.tail()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Avelnc
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,0.847
20639,2.3886,16.0,5.254717,1.162264,1387.0,2.616981,39.37,-121.24,0.894


In [5]:
data.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Avelnc
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


## Section 2.3.1 : 스케일링(scaling)

- Scikit-learn에서 제공하는 scale(x), robust_scale(x), minmax_scale(x), maxabs_scale(x)

In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale, robust_scale, minmax_scale, maxabs_scale

In [7]:
x = (np.arange(5, dtype = np.float)).reshape(-1,1)
#Scaling
test = pd.DataFrame(np.hstack([x, scale(x),robust_scale(x), minmax_scale(x), maxabs_scale(x)]),
                   columns=["x", "scale(x)", "robust_scale(x)", "minmax_scale(x)", "maxabs_scale(x)"])
test

Unnamed: 0,x,scale(x),robust_scale(x),minmax_scale(x),maxabs_scale(x)
0,0.0,-1.414214,-1.0,0.0,0.0
1,1.0,-0.707107,-0.5,0.25,0.25
2,2.0,0.0,0.0,0.5,0.5
3,3.0,0.707107,0.5,0.75,0.75
4,4.0,1.414214,1.0,1.0,1.0


## Section 2.3.2 인코딩(Encoding)

- Label Encoding 실습, Label Encoding은 실제 값에 관계 없이 0~K-1까지의 정수로 변환

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(["seoul", "busan", "busan", "daejeon"])
print(le.classes_)

['busan' 'daejeon' 'seoul']


In [9]:
print(le.transform(["seoul", "busan", "busan", "daejeon"]))
print(list(le.inverse_transform([2,0,1])))

[2 0 0 1]
['seoul', 'busan', 'daejeon']


## Section 2.3.3 : 결측값 처리(Imputation)

- Imputer를 통해 누락된 정보(결측값)를 채운다.

In [10]:
import numpy as np
from sklearn.preprocessing import Imputer

In [11]:
#imputer - mean
imp_mean = Imputer(missing_values="NaN", strategy = 'mean', axis = 0)
print(imp_mean.fit_transform([[1,5], [2,np.nan],[3,3]]))

[[1. 5.]
 [2. 4.]
 [3. 3.]]




In [12]:
#imputer - median
imp_mean = Imputer(missing_values="NaN", strategy = 'median', axis = 0)
print(imp_mean.fit_transform([[1,5], [2,np.nan],[3,3]]))

[[1. 5.]
 [2. 4.]
 [3. 3.]]




In [13]:
#imputer - most_
imp_mean = Imputer(missing_values="NaN", strategy = 'mean', axis = 0)
print(imp_mean.fit_transform([[1,5], [2,np.nan],[3,3]]))

[[1. 5.]
 [2. 4.]
 [3. 3.]]




## Section 3 : Scikit-learn을 활용한 Wine Quality Data set Nerual Network 실습

In [15]:
import pandas as pd
train = pd.read_csv("./Winequality_Train.csv")
test = pd.read_csv("./Winequality_Test.csv")

In [16]:
train_x = train.iloc[:, :-1]
train_y = train.iloc[:, -1]

test_x = test.iloc[:, :-1]
test_y = test.iloc[:, -1]

In [17]:
from sklearn.neural_network import MLPClassifier

In [18]:
mlp = MLPClassifier(hidden_layer_sizes=(50,30))
mlp.fit(train_x, train_y)
print("Training Score:%s"%mlp.score(train_x,train_y))

Training Score:0.9791071035847811


In [20]:
pred = mlp.predict(test_x)
confusion_matrix = pd.crosstab(test_y, pred, rownames=['True'],
                              colnames=['Predicted'], margins = True)
confusion_matrix

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1454,18,1472
1,13,465,478
All,1467,483,1950


# Section 4.1 : Breast-Cancer-Wisonsin 데이터 설명

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
df = pd.read_csv('breast-cancer-wisconsin.data', header=None,
names=['ID', 'CIump Thickness', 'Uniformity of CeII Size',
'Uniformity of CeII Shape', 'MarginaI Adhesion',
'Single EpitheIiaI CeII Size', 'Bare NucIei',
'BIand Chromatin', 'NormaI NucIeoIi', 'Mitoses','Class'])
pd.set_option('display.max_rows', 10)
print(df)

          ID  CIump Thickness  Uniformity of CeII Size  \
0    1000025                5                        1   
1    1002945                5                        4   
2    1015425                3                        1   
3    1016277                6                        8   
4    1017023                4                        1   
..       ...              ...                      ...   
694   776715                3                        1   
695   841769                2                        1   
696   888820                5                       10   
697   897471                4                        8   
698   897471                4                        8   

     Uniformity of CeII Shape  MarginaI Adhesion  Single EpitheIiaI CeII Size  \
0                           1                  1                            2   
1                           4                  5                            7   
2                           1                  1            

In [29]:
df = df.drop("ID", axis = 1)
df['Class'] = df['Class'].replace(2,0)
df['Class'] = df['Class'].replace(4,1)
df.replace("?", np.nan, inplace = True)
df.dropna(inplace = True)

In [30]:
y = df['Class']
x = df.drop("Class", axis = 1)

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size =0.3, random_state=0)

In [31]:
forest = RandomForestClassifier(n_estimators=1000, random_state=0)
forest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [32]:
pred = forest.predict(x_test)
confusion_matrix = pd.crosstab(y_test, pred, rownames=['True'],
                              colnames=['Predicted'], margins = True)
confusion_matrix

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,126,4,130
1,5,70,75
All,131,74,205


In [79]:
import numpy as np
import pandas as pd
from sklearn import linear_model, preprocessing, tree
col = ["age", "workclass", "fnlwgt", "education" , "education-num"
"marital-status" , 'occupation', "relationship", "race", "sex",
"capital-gain", "capital-loss", "hours-per-week", "native-country",
"income-level"]

##정의 한 열 이름을 사용하여 데이터 읽기
df_train = pd.read_csv("adult.data.txt", names = col)
df_test = pd.read_csv("adult.test.txt", names = col,
                     skiprows=1)
print(len(df_train))
print(len(df_test))

32561
16281


In [80]:
### 데이터 전처리
df_test['income-level'] = df_test['income-level'].str.replace('.','')

###카테고리형 변수 데이터를 숫자로 변경하기(0,1,2,3)
categorical_col = df_train.dtypes[df_train.dtypes == 'object'].index

LabelEncoder = preprocessing.LabelEncoder()

for i in categorical_col:
    if df_train[i].dtypes == 'object':
        category = df_train[i].append(df_test[i])
        LabelEncoder.fit(category.values)
        df_train[i] = LabelEncoder.transform(df_train[i])
        df_test[i] = LabelEncoder.transform(df_test[i])
        
df_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-nummarital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income-level
39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0


## Section 1.4 : 트레이닝셋과 테스트셋 분리

In [83]:
### 트레이닝셋과 테스트셋의 income-level(타겟변수) 분리하기
x_test = np.array(df_test.drop(['income-level'], 1))
y_test = np.array(df_test['income-level'])
x = np.array(df_train.drop(['income-level'], 1))
y = np.array(df_train['income-level'])

## Section 1.5 : 모델링

In [84]:
# DecisionTree 모델
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(x,y)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [87]:
### 모델 평가
pred = dt.predict(x_test)
confusion_matrix = pd.crosstab(y_test, pred, rownames=['True'],
                              colnames=['Predicted'], margins = True)
confusion_matrix


Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,10860,1575,12435
1,1546,2300,3846
All,12406,3875,16281


# Section 2.2) 마지막 German Credit 은 실습.

In [None]:
## 1) Data Loading 하기 (german_credit.csv)
## 2) train, test set 구분하기
## 3) Logistic Fitting하기
## 4) Decision Tree Fitting하기
## 5) Random Fortest Fitting하기
## 6) 세모델에 대한 Confusion Matrix 만들기
## 7) 6)으로 부터 세모델에 대한 Accuaracy 구하기