In [1]:
# 간단하게 knn 구현에 대해서 알아보자
# bmi 예제(multinomial classification)을 대상으로 knn의 결과와 
# logistic regression의 결과를 비교

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# raw data loading
df = pd.read_csv('./data/bmi.csv')
display(df.head())

Unnamed: 0,label,height,weight
0,1,188,71
1,2,161,68
2,0,178,52
3,2,136,63
4,1,145,52


In [3]:
# data  split
train_x_data,test_x_data,train_t_data,test_t_data = \
train_test_split(df[['height','weight']],df['label'],
                test_size=0.3, random_state=1, stratify=df['label'])

In [4]:
# 결측치는 없고, 이상치도 없음
# 정규화
scaler = MinMaxScaler()
scaler.fit(train_x_data)
norm_train_x_data = scaler.transform(train_x_data)
norm_test_x_data = scaler.transform(test_x_data)

In [5]:
# logistic regression 구현
model = LogisticRegression()
model.fit(norm_train_x_data,train_t_data)

LogisticRegression()

In [8]:
# accuracy
acc = model.score(norm_test_x_data,test_t_data)
print(acc)

0.9851666666666666


In [9]:
# knn으로 구현
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(norm_train_x_data, train_t_data)
acc = knn_classifier.score(norm_test_x_data,test_t_data)
print('KNN의 Accuracy : {}'.format(acc)) 

KNN의 Accuracy : 0.998


In [11]:
# 오존량 예측 linear regression 구현 (tensorflow 2.x)
# 데이터 전처리 포함

from scipy import stats
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings(action='ignore')
df = pd.read_csv('./data/ozone.csv')
display(df.head())

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
0,41.0,190.0,7.4,67,5,1
1,36.0,118.0,8.0,72,5,2
2,12.0,149.0,12.6,74,5,3
3,18.0,313.0,11.5,62,5,4
4,,,14.3,56,5,5


In [13]:
x_data = df[['Solar.R','Wind','Temp']]  # dataframe(2차원)
t_data = df['Ozone']

# 1.먼저 독립변수에 대한 결측값을 찾아서 median으로 imputation
for col in x_data.columns:
    col_median = np.nanmedian(x_data[col])
    x_data[col].loc[x_data[col].isnull()] = col_median
    
# 2.독립변수에 대한 이상치 겁출 후 이상치를 제외한 나머지값들의 mean으로 이상치를 대체
zscore_threshold = 2.0

for col in x_data.columns:
    outlier = x_data[col][np.abs(stats.zscore(x_data[col])) > zscore_threshold]
    col_mean = np.mean(x_data.loc[~x_data[col].isin(outlier).col])
    x_data.loc[x_data[col].isin(outlier), col] = col_mean
    
# 3.종속변수에 대한 이상치를 검출한 후 이상치를 제외한 나머지값의 평균으로 이상치 대체
outlier = t_data[np.abs(stats.zsocre(t_data))> zscore_threshold]
col_mean = np.mean(~t_data.isin(outlier))
t_data[t_data.isin(outlier)] = col_mean

# 4.정규화 진행
scaler_x = MinMaxScaler()
scaler_t = MinMaxScaler()

scaler_x.fit(x_data.values)          # scaler는 2차원 ndarray로 사용해야함
scaler_t.fit(t_data.values.reshape(-1,1))

norm_x_data = scaler_x.transform(x_data.values)
norm_t_data = scaler_t.transform(t_data.values.reshape(-1,1)).ravel

# 5.종속변수의 결측값은 KNN을 이용해서 예측값을 사용함
# 종속변수가 nan이 아닌 독립변수들과 종속변수들을 추출(KNN을 학습하기 위해)
norm_train_x_data = norm_x_data[~np.isnan(norm_t_data)]
norm_train_t_data = norm_t_data[~np.isnan(norm_t_data)]

knn_regressor = KNeighborsRegressor(n_neighbors=2)
knn_regressor.fit(norm_train_x_data, norm_train_t_data)

# 종속변수가 결측값인 독립변수들을 입력으로 넣어서 값을 예측
knn_predict = knn_regressor.predict(norm_x_data[np.isnan(norm_t_data)])
print(knn_predict)
norm_t_data[np.isnan(norm_t_data)] = knn_predict

AttributeError: 'Series' object has no attribute 'col'

In [2]:
# 최종적으로 데이터는 norm_x_data, norm_t_data

# sklearn구현과 tensorflow 2,x으로 구현할거임
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.optimizers import SGD

test_data = np.array([[330,15,80]])  # 태양광 330, 바람 15, 온도 80

In [3]:
# sklearn 구현
model = LinearRegression()
model.fit(norm_x_data,norm_t_data)
result = model.predict(scaler_x.transform(test_data))
print('sklearn 예측값 : {}'.format(scaler_t.inverse_transform(result.reshape(-1,1))))

NameError: name 'norm_x_data' is not defined

In [None]:
# tensorflow 구현
keras_model = Sequential()
keras_model.add(Flatten(input_shape=(3,)))  # input layer
keras_model.add(Dense(units=1,
                     activation='linear'))   # output layer
keras_model.compile(optimizer=SGD(learning_rate=1e-2),
                   loss='mse')

keras_model.fit(norm_x_data,
               norm_t_data,
               epochs=5000,
               verbose=0)

In [6]:
# logistic regression
# binary classification을 sklearn과 tensorflow 2.x로 구현
# kaggle에서 다운받은 titanic 데이터 사용

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import 


SyntaxError: invalid syntax (492353431.py, line 9)

In [17]:
# data loading
df = pd.read_csv('./data/train.csv')
display(df.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [18]:
# 필요없는 column(feature)는 삭제처리
data = df.drop(['PassengerId','Name','Ticket','Fare','Cabin'],
            axis=1, inplace=False)
display(data.head())


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [19]:
# SibSp = 함께 탄 형제,배우자 Parch= 함께 탄 부모,자식
# 두 칼럼은 하나로 합칠거임 => 동승자 수
data['Family'] = data['SibSp']+data['Parch']
data.drop(['SibSp','Parch'],axis=1, inplace=True)
display(data)

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,male,22.0,S,1
1,1,1,female,38.0,C,1
2,1,3,female,26.0,S,0
3,1,1,female,35.0,S,1
4,0,3,male,35.0,S,0
...,...,...,...,...,...,...
886,0,2,male,27.0,S,0
887,1,1,female,19.0,S,0
888,0,3,female,,S,3
889,1,1,male,26.0,C,0


In [20]:
# 결측치 확인
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Embarked      2
Family        0
dtype: int64

In [21]:
# embarked 에 결측치가 2개임 최빈값을 이용해서 채움, 여기서는 그냥 Q로 넣음
data['Embarked'] = data['Embarked'].fillna('Q')

In [22]:
# Age에는 결측치가 177개가 있음 이건 평균을 채울거임
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [25]:
# 문자로 되어있는 값은 숫자로 변경
gender_string={'male':0,'female':1}
data['Sex'] = df['Sex'].map(gender_string)

embarked_string = {'S':0,'C':1,'Q':2}
data['Embarked'] = data['Embarked'].map(embarked_string)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,22.0,0,1
1,1,1,1,38.0,1,1
2,1,3,1,26.0,0,0
3,1,1,1,35.0,0,1
4,0,3,0,35.0,0,0


In [26]:
# age 칼럼을 카데고리화 하기
def age_category(age):
    if((age>=0) & (age<25)):
        return 0
    elif ((age>=25) & (age<50)):
        return 1
    else:
        return 2
data['Age'] = data['Age'].map(age_category)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,0,0,0,1
1,1,1,1,1,1,1
2,1,3,1,1,0,0
3,1,1,1,1,0,1
4,0,3,0,1,0,0


In [None]:
# data split
train_x_data, test_x_data, train_t_data,test_t_data = \
train_test_split(data.drop('Survived',axis=1,inplace=False),
                data['Survived'],
                test_size=0.3,
                random_state=1,
                stratify=df['Survived'])

In [None]:
# 정규화
scaler = MinMaxScaler()
scaler.fit(train_x_data)
norm_train_x_data = scaler.transform(train_x_data)
norm_test_x_data = scaler.transform(test_x_data)

In [None]:
# sklearn 구현
model = LogisticRegression()
model.fit(norm_train_x_data,train_t_data)

sklearn_result = model.score(norm_test_x_data,test_t_data)
print('sklearn 정확도 :',sklearn_result)

In [None]:
# tensorflow 구현
keras_model = Sequential()
keras_model.add(Flatten(input_shape=(5,)))
keras_model.add(Dense(units=1,activation='sigmoid'))
keras_model.compile(optimizer=SGD(learning_rate=1e-2),
                   loss='binary_crossentropy',
                   metrics=['accuracy'])
keras_model.fit(norm_train_x_data,train_t_data,epochs=1000,verbose=0)
keras_result = keras_model.evaluate(norm_test_x_data,test_t_data)
print('TF2.x 정확도',keras_result)