In [1]:
# 데이터 생성
import pandas as pd
import numpy as np

# Build one species' synthetic samples and return them as a column dict
def making_df(name, w_start, w_end, l_start, l_end, n=100):
  """Generate uniformly distributed weight/length samples for one species.

  Args:
    name: Species label repeated for every sample.
    w_start, w_end: Lower and upper bound passed to np.random.uniform for 'Weight'.
    l_start, l_end: Lower and upper bound passed to np.random.uniform for 'Length'.
    n: Number of samples to generate (defaults to 100, the previous fixed size).

  Returns:
    dict with keys 'Species' (list of n labels), 'Weight' and 'Length'
    (NumPy arrays of n floats), ready to be passed to pd.DataFrame.
  """
  # Weights are drawn before lengths so a seeded global RNG yields the same
  # values as the earlier fixed-size version.
  weight_array = np.random.uniform(w_start, w_end, n)
  length_array = np.random.uniform(l_start, l_end, n)
  return {'Species': [name] * n, 'Weight': weight_array, 'Length': length_array}

# Seed NumPy's global RNG so the synthetic data is identical on every run
np.random.seed(42)

# rabbit
rabbit=making_df('Rabbit', 0.2, 0.5, 0.4, 1.2)
df=pd.DataFrame(rabbit)
# dog
dog=making_df('Dog', 1, 2, 3, 6)
df_dog=pd.DataFrame(dog)
# cow
cow=making_df('Cow', 7, 10, 15, 20)
df_cow=pd.DataFrame(cow)
# pigeon
pigeon=making_df('Pigeon', 0.05, 0.1, 0.05, 0.1)
df_pigeon=pd.DataFrame(pigeon)
# horse
horse=making_df('Horse', 3, 5, 10, 12)
df_horse=pd.DataFrame(horse)

df_list=[df_dog, df_cow, df_pigeon, df_horse]

# Stack every species frame onto the rabbit frame in a single call.
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# produces the same continuous 0..N-1 index the append loop used to build.
df=pd.concat([df, *df_list], ignore_index=True)

# info() prints its report itself and returns None, so it is not wrapped in
# print() (that only added a stray "None" line to the output).
df.info()
print()
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  500 non-null    object 
 1   Weight   500 non-null    float64
 2   Length   500 non-null    float64
dtypes: float64(2), object(1)
memory usage: 11.8+ KB
None

  Species    Weight    Length
0  Rabbit  0.281689  0.711008
1  Rabbit  0.347167  0.924494
2  Rabbit  0.274917  0.685911
3  Rabbit  0.488386  0.540756
4  Rabbit  0.329001  0.598236


In [2]:
# Visualize the data with Altair, giving each species its own color
from altair import Chart

(
  Chart(df)
  .mark_point()
  .encode(x='Weight', y='Length', color='Species')
)

In [3]:
# Data preprocessing: feature matrix with columns [Length, Weight] plus species labels
animal_input=df[['Length','Weight']].to_numpy()
animal_target=df['Species'].to_numpy()

# Split into training and test sets.
# random_state pins the shuffle so the split (and the scores below) can be
# reproduced; stratify keeps every species at the same ratio in both splits.
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(
    animal_input, animal_target, stratify=animal_target, random_state=42)


In [4]:
# Peek at the first five training rows
print(train_input[0:5])

[[ 0.09964624  0.09258502]
 [10.2563402   3.01031877]
 [ 0.08556396  0.08650786]
 [ 4.73997102  1.77520905]
 [10.3081827   4.51181853]]


In [5]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test split.
ss = StandardScaler().fit(train_input)
train_scaled, test_scaled = (ss.transform(split) for split in (train_input, test_input))

In [6]:
from sklearn.linear_model import LogisticRegression

# Train with an adjusted regularization setting (C) and iteration limit
lr = LogisticRegression(C=10, max_iter=500).fit(train_scaled, train_target)

# Report the score on the training split first, then on the test split
print(
  lr.score(train_scaled, train_target),
  lr.score(test_scaled, test_target),
  sep='\n',
)

0.9973333333333333
0.992


In [7]:
# Inspect the predicted labels and class probabilities of the first five test rows
head_scaled = test_scaled[:5]
print(lr.predict(head_scaled))
proba = lr.predict_proba(head_scaled)
print(proba.round(3))

['Cow' 'Pigeon' 'Pigeon' 'Pigeon' 'Rabbit']
[[0.94  0.    0.06  0.    0.   ]
 [0.    0.023 0.    0.54  0.437]
 [0.    0.007 0.    0.78  0.213]
 [0.    0.007 0.    0.779 0.215]
 [0.    0.056 0.    0.329 0.615]]


In [8]:
# Check the trained model on one hand-made sample
for_test = np.array([[0.5, 1]])  # a single row: Length 0.5, Weight 1.0
for_test_scaled = ss.transform(for_test)  # reuse the scaler fitted earlier
print(lr.predict(for_test_scaled))

['Rabbit']


In [34]:
# Wrap the prediction in a function so the "culprit" can be looked up directly
def find_who(length, weight, scaler=None, model=None):
  """Print the species predicted for one (length, weight) measurement.

  Args:
    length: Length of the sample (first feature column).
    weight: Weight of the sample (second feature column).
    scaler: Object with a transform() method applied to the raw sample.
      Defaults to the notebook-level `ss`.
    model: Object with predict() and predict_proba() methods.
      Defaults to the notebook-level `lr`.

  Returns:
    None. Two lines are printed: the rounded probability row, and the
    predicted label with its probability as a percentage (label and
    percentage wrapped in ANSI bold escape codes).
  """
  # Fall back to the notebook globals only when nothing was passed in.
  scaler = ss if scaler is None else scaler
  model = lr if model is None else model

  for_answer = scaler.transform(np.array([length, weight]).reshape(1, -1))
  answer = model.predict(for_answer)[0]
  # Query the probabilities once and reuse them for both printed lines.
  proba = model.predict_proba(for_answer)
  # Scale to percent before rounding: round(p, 2) * 100 can leak
  # floating-point noise such as 56.99999999999999.
  answer_prob = round(float(proba.max()) * 100)
  print('overall probability: {0}'.format(np.round(proba, 3)))
  print('This is from ' + '\033[1m'+'{0} '.format(answer)+'\033[0m'+'with the probability of ' + '\033[1m'+ '{0:.1f}%'.format(answer_prob)+'\033[0m')


# Example lookup: a sample with length 10 and weight 4
find_who(length=10, weight=4)


overall probability: [[0.005 0.045 0.951 0.    0.   ]]
This is from [1mHorse [0mwith the probability of [1m95.0%[0m
