In [1]:
# import from

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import folium
import warnings
warnings.filterwarnings("ignore")

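# Fix broken Hangul (Korean) text in plots.
# matplotlib's default font has no Hangul glyphs, so a Korean font must be selected explicitly.
# platform: used below to detect the operating system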
import platform

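# font_manager : font lookup utilities (used below to read a font file's family name)
# rc : function that sets matplotlib runtime configuration (used below to change the font family)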
from matplotlib import font_manager, rc
# Turn off matplotlib's Unicode minus sign on axes
# (presumably because the Korean fonts selected below lack that glyph).
plt.rcParams.update({'axes.unicode_minus': False})

# Select a Hangul-capable font family for the current operating system.
os_name = platform.system()
if os_name == 'Windows':
    path = 'c:/Windows/Fonts/malgun.ttf'  # font file shipped with Windows
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif os_name == 'Darwin':
    rc('font', family='AppleGothic')  # macOS
else:
    # No font is configured on any other platform.
    print("Unknown System")

### RANDOM FOREST

In [5]:
# Read the iris measurements from the local Data directory and display them.
iris_path = '../Data/iris.csv'
csv = pd.read_csv(iris_path)
csv

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
# Separate the four measurement columns (features) from the species name
# column (target).
iris_feature_columns = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
csv_data = csv[iris_feature_columns]
csv_label = csv['Name']


In [12]:
from sklearn.model_selection import train_test_split

# Hold out a test set, stratified on the label so both splits keep the class
# proportions. random_state is fixed so the split (and therefore the score
# computed from it) is the same after every kernel restart.
train_data, test_data, train_label, test_label = train_test_split(
    csv_data, csv_label, stratify=csv_label, random_state=42
)

### RANDOM FOREST

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default hyperparameters on the training split.
# random_state is fixed so the fitted forest and its score are reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(train_data, train_label)
# Last expression of the cell: score on the held-out test split.
clf.score(test_data, test_label)

0.9736842105263158

---
### 독버섯과 관련된 데이터를 사용한 ML

In [22]:
# Load the mushroom dataset. The file has no header row, so pandas numbers the
# columns 0..22.
# The cell that downloads this file sits further down in the notebook, so on a
# fresh "Restart & Run All" the file may not exist yet: fetch it when missing.
import os
import urllib.request

mushroom_path = '../Data/mushroom.csv'
mushroom_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
if not os.path.exists(mushroom_path):
    urllib.request.urlretrieve(mushroom_url, mushroom_path)

mushroom = pd.read_csv(mushroom_path, header=None)
mushroom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       8124 non-null   object
 1   1       8124 non-null   object
 2   2       8124 non-null   object
 3   3       8124 non-null   object
 4   4       8124 non-null   object
 5   5       8124 non-null   object
 6   6       8124 non-null   object
 7   7       8124 non-null   object
 8   8       8124 non-null   object
 9   9       8124 non-null   object
 10  10      8124 non-null   object
 11  11      8124 non-null   object
 12  12      8124 non-null   object
 13  13      8124 non-null   object
 14  14      8124 non-null   object
 15  15      8124 non-null   object
 16  16      8124 non-null   object
 17  17      8124 non-null   object
 18  18      8124 non-null   object
 19  19      8124 non-null   object
 20  20      8124 non-null   object
 21  21      8124 non-null   object
 22  22      8124 non-null   

In [21]:
# Fetch the mushroom dataset from the UCI repository into the Data directory.
import os
import urllib.request as req

local = "../Data/mushroom.csv"
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
# Skip the network round-trip when the file is already on disk, so re-running
# the notebook neither re-downloads nor overwrites the local copy.
if not os.path.exists(local):
    req.urlretrieve(url, local)
print('ok')

ok


In [25]:
# ord() converts a character to its integer code; chr() is the inverse.
for shown in (ord('x'), chr(120)):
    print(shown)

120
x


In [26]:
# Convert the symbols in the dataset to numbers.
# Column 0 holds the class label and is kept as-is; every value in the
# remaining columns is a one-character code that is mapped to its integer
# code with ord().
# Plain tuples from itertuples() avoid building a Series per row, which is
# what the iterrows() loop did.
label = mushroom[0].tolist()
data = [
    [ord(symbol) for symbol in features]
    for features in mushroom.loc[:, 1:].itertuples(index=False, name=None)
]

print(data[0:20])

[[120, 115, 110, 116, 112, 102, 99, 110, 107, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 115, 117], [120, 115, 121, 116, 97, 102, 99, 98, 107, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 110, 110, 103], [98, 115, 119, 116, 108, 102, 99, 98, 110, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 110, 110, 109], [120, 121, 119, 116, 112, 102, 99, 110, 110, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 115, 117], [120, 115, 103, 102, 110, 102, 119, 98, 107, 116, 101, 115, 115, 119, 119, 112, 119, 111, 101, 110, 97, 103], [120, 121, 121, 116, 97, 102, 99, 98, 110, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 107, 110, 103], [98, 115, 119, 116, 97, 102, 99, 98, 103, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 107, 110, 109], [98, 121, 119, 116, 108, 102, 99, 98, 110, 101, 99, 115, 115, 119, 119, 112, 119, 111, 112, 110, 115, 109], [120, 121, 119, 116, 112, 102, 99, 110, 112, 101, 101, 115, 115, 119, 119, 112, 119, 111, 112, 107, 118, 103], [98, 115, 121, 11

In [27]:
# Turn the list of class labels into a one-column DataFrame (column label 0).
labelTemp = pd.DataFrame(label)
# Display the first rows; the saved output of this cell is this preview.
labelTemp.head()

Unnamed: 0,0
0,p
1,e
2,e
3,p
4,e


In [29]:
# Build the feature frame and shift its column labels up by one so they start
# at 1: the frame is later joined next to the label frame, whose only column
# is labelled 0.
dataTemp = pd.DataFrame(data).rename(columns=lambda col: col + 1)
dataTemp.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,120,115,110,116,112,102,99,110,107,101,...,115,119,119,112,119,111,112,107,115,117
1,120,115,121,116,97,102,99,98,107,101,...,115,119,119,112,119,111,112,110,110,103
2,98,115,119,116,108,102,99,98,110,101,...,115,119,119,112,119,111,112,110,110,109
3,120,121,119,116,112,102,99,110,110,101,...,115,119,119,112,119,111,112,107,115,117
4,120,115,103,102,110,102,119,98,107,116,...,115,119,119,112,119,111,101,110,97,103


In [30]:
# Join the label frame and the feature frame side by side.
frames = [labelTemp, dataTemp]
mr = pd.concat(frames, axis=1)
mr.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,120,115,110,116,112,102,99,110,107,...,115,119,119,112,119,111,112,107,115,117
1,e,120,115,121,116,97,102,99,98,107,...,115,119,119,112,119,111,112,110,110,103
2,e,98,115,119,116,108,102,99,98,110,...,115,119,119,112,119,111,112,110,110,109
3,p,120,121,119,116,112,102,99,110,110,...,115,119,119,112,119,111,112,107,115,117
4,e,120,115,103,102,110,102,119,98,107,...,115,119,119,112,119,111,101,110,97,103


In [35]:
# Train and evaluate a random forest on the ordinal-encoded mushroom data.
# Features are columns 1 onward, the target is column 0; the split is
# stratified on the target. random_state is fixed on both the split and the
# forest so the reported score is reproducible.
target = mr.loc[:, 0]
train_data, test_data, train_label, test_label = train_test_split(
    mr.loc[:, 1:], target, stratify=target, random_state=42
)
clf = RandomForestClassifier(random_state=42)
clf.fit(train_data, train_label)
clf.score(test_data, test_label)

1.0

문자를 ASCII 코드 숫자로 바꿔서 학습해 보았으나, 이 숫자들은 크기나 순서에 의미가 없는 범주형 값이다. 모델이 의미 없는 대소 관계를 학습하지 않도록 One-Hot Encoding을 사용한다.

---
# One-Hot Encoding

In [36]:
# Distinct encoded values that occur in feature column 1.
mr.loc[:, 1].unique()

array([120,  98, 115, 102, 107,  99])

In [37]:
# Practice run on dataTemp: one-hot encode column 1 only.
# `prefix` is prepended to each generated column name, so the values of
# column 1 appear as 1_<value> columns in the result.
pd.get_dummies(dataTemp, columns=[1], prefix='1')

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,19,20,21,22,1_98,1_99,1_102,1_107,1_115,1_120
0,115,110,116,112,102,99,110,107,101,101,...,112,107,115,117,0,0,0,0,0,1
1,115,121,116,97,102,99,98,107,101,99,...,112,110,110,103,0,0,0,0,0,1
2,115,119,116,108,102,99,98,110,101,99,...,112,110,110,109,1,0,0,0,0,0
3,121,119,116,112,102,99,110,110,101,101,...,112,107,115,117,0,0,0,0,0,1
4,115,103,102,110,102,119,98,107,116,101,...,101,110,97,103,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,115,110,102,110,97,99,98,121,101,63,...,112,98,99,108,0,0,0,1,0,0
8120,115,110,102,110,97,99,98,121,101,63,...,112,98,118,108,0,0,0,0,0,1
8121,115,110,102,110,97,99,98,110,101,63,...,112,98,99,108,0,0,1,0,0,0
8122,121,110,102,121,102,99,110,98,116,63,...,101,119,118,108,0,0,0,1,0,0


In [41]:
# Apply one-hot encoding to every feature column.
# Start from the ordinal-encoded features held in `mr` (column 0 is the
# target, so it is skipped) instead of rewriting `dataTemp` column by column:
# the previous loop consumed the columns of `dataTemp` in place and raised
# KeyError when the cell was executed a second time.
ordinal_features = mr.loc[:, 1:]
encode_columns = list(ordinal_features.columns)
dataTemp = pd.get_dummies(
    ordinal_features,
    columns=encode_columns,
    # One prefix per encoded column, given as strings.
    prefix=[str(col) for col in encode_columns],
)
dataTemp.head()

Unnamed: 0,1_98,1_99,1_102,1_107,1_115,1_120,2_102,2_103,2_115,2_121,...,21_115,21_118,21_121,22_100,22_103,22_108,22_109,22_112,22_117,22_119
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [42]:
# Put the label column in front of the one-hot encoded feature columns.
one_hot_frames = [labelTemp, dataTemp]
mr2 = pd.concat(one_hot_frames, axis=1)
mr2.head()

Unnamed: 0,0,1_98,1_99,1_102,1_107,1_115,1_120,2_102,2_103,2_115,...,21_115,21_118,21_121,22_100,22_103,22_108,22_109,22_112,22_117,22_119
0,p,0,0,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,1,0
1,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,e,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,p,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,e,0,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [44]:
# Train and evaluate a random forest on the one-hot encoded mushroom data.
# The first column of mr2 is the target and the rest are features; the split
# is stratified on the target. random_state is fixed on both the split and the
# forest so the reported score is reproducible.
target = mr2.iloc[:, 0]
train_data, test_data, train_label, test_label = train_test_split(
    mr2.iloc[:, 1:], target, stratify=target, random_state=42
)
clf = RandomForestClassifier(random_state=42)
clf.fit(train_data, train_label)
clf.score(test_data, test_label)

1.0