In [91]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot

In [92]:
# Load the Pokemon table and discard the '#' and 'Generation' columns in one
# step, so df_data never exists in a half-prepared state.
df_data = (
    pd.read_csv('./data/Pokemon.csv')
    .drop(columns=['#', 'Generation'])
)

In [93]:
# Preview the first rows to sanity-check the load and the column drop.
df_data.head()

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,False
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,False
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,False
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,False
4,Charmander,Fire,,309,39,52,43,60,50,65,False


In [94]:
# Number of missing entries in each column.
df_data.isna().sum()

Name           0
Type 1         0
Type 2       386
Total          0
HP             0
Attack         0
Defense        0
Sp. Atk        0
Sp. Def        0
Speed          0
Legendary      0
dtype: int64

In [95]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_data.describe()

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,435.1025,69.25875,79.00125,73.8425,72.82,71.9025,68.2775
std,119.96304,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474
min,180.0,1.0,5.0,5.0,10.0,20.0,5.0
25%,330.0,50.0,55.0,50.0,49.75,50.0,45.0
50%,450.0,65.0,75.0,70.0,65.0,70.0,65.0
75%,515.0,80.0,100.0,90.0,95.0,90.0,90.0
max,780.0,255.0,190.0,230.0,194.0,230.0,180.0


In [96]:
# Column dtypes and non-null counts for the whole frame.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Name       800 non-null    object
 1   Type 1     800 non-null    object
 2   Type 2     414 non-null    object
 3   Total      800 non-null    int64 
 4   HP         800 non-null    int64 
 5   Attack     800 non-null    int64 
 6   Defense    800 non-null    int64 
 7   Sp. Atk    800 non-null    int64 
 8   Sp. Def    800 non-null    int64 
 9   Speed      800 non-null    int64 
 10  Legendary  800 non-null    bool  
dtypes: bool(1), int64(7), object(3)
memory usage: 63.4+ KB


In [99]:
# Preview a one-hot expansion of the boolean Legendary column.
# The result is only displayed, not assigned, so df_data keeps its original
# 'Legendary' column for the cells that follow.
pd.get_dummies(data=df_data, columns=['Legendary'], prefix='Legendary')

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary_False,Legendary_True
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,0
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,0
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,0
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,0
4,Charmander,Fire,,309,39,52,43,60,50,65,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
795,Diancie,Rock,Fairy,600,50,100,150,100,150,50,0,1
796,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,0,1
797,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,0,1
798,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,0,1


In [100]:
# Pairwise correlation between the numeric and boolean columns.
# The object columns (Name, Type 1, Type 2) are excluded explicitly: older
# pandas dropped them silently inside corr(), but pandas >= 2.0 raises when
# it meets string data.
corr = df_data.select_dtypes(include=['number', 'bool']).corr()

fig = go.Figure()
# Heatmap draws z[i][j] at (x[j], y[i]), so matrix rows (the index) belong on
# the y axis and matrix columns on the x axis.
fig.add_trace(go.Heatmap(z=corr.values,
                         x=corr.columns.values,
                         y=corr.index.values))

fig.update_layout(autosize=False,
                  width=700,
                  height=700)

fig.show()

In [101]:
# Stack the primary and secondary type columns into one Series so that a
# Pokemon with two types contributes to both of them.
type_total = pd.concat([df_data['Type 1'], df_data['Type 2']], axis=0)

# Count each type once across both slots; value_counts() leaves out missing
# entries by default, so Pokemon without a second type add nothing extra.
type_counts = type_total.value_counts()

fig = px.pie(values=type_counts.values,
             names=type_counts.index,
             title='The number of Pokemon type')
fig.update_layout(autosize=False, width=700, height=700)

fig.show()

In [102]:
# Give the special stats identifier-friendly column names.
df_data.rename(columns={'Sp. Atk': 'Sp_Atk', 'Sp. Def': 'Sp_Def'}, inplace=True)

# type_total (built in the pie-chart cell) stacks Type 1 on top of Type 2, so
# every stat column is stacked on top of itself to line up row-for-row with it.
stat_columns = ['Attack', 'Defense', 'Sp_Atk', 'Sp_Def']
stacked_stats = [pd.concat([df_data[col], df_data[col]], axis=0) for col in stat_columns]

TAD = pd.concat([type_total, *stacked_stats], axis=1)
# The unnamed type Series arrives as column 0; give it a real name.
TAD.rename(columns={0: 'Type'}, inplace=True)

# (stat column, legend label, line style) for each trace, in drawing order.
# Physical stats use solid lines, special stats dashed; attack is red, defense blue.
trace_specs = [
    ('Attack', 'Average Attack', dict(color='firebrick', width=2)),
    ('Defense', 'Average Defense', dict(color='royalblue', width=2)),
    ('Sp_Atk', 'Average Special Attack', dict(color='firebrick', width=2, dash='dash')),
    ('Sp_Def', 'Average Special Defense', dict(color='royalblue', width=2, dash='dash')),
]

fig = go.Figure()
for stat, label, line_style in trace_specs:
    # Mean of this stat per type, as a two-column frame: Type, <stat>.
    stat_by_type = TAD[stat].groupby(TAD['Type']).mean().reset_index()
    fig.add_trace(go.Scatter(x=stat_by_type['Type'],
                             y=stat_by_type[stat],
                             name=label,
                             line=line_style))

fig.update_layout(title='Average stat for each Pokemon type',
                  xaxis_title='Type',
                  yaxis_title='Average Stat',
                  width=1200,
                  height=800)

fig.show()


In [103]:
# Following Mingyu's approach: merge Type1 and Type2 into a single one-hot
# (multi-hot) encoding with one indicator column per type.

df_data.rename(columns={'Type 1' : 'Type1', 'Type 2' : 'Type2'}, inplace=True)

# Request integer indicators explicitly. pandas >= 2.0 returns bool dummies by
# default, and adding two bool frames yields a logical OR rather than a 0/1 sum.
type1_dummies = pd.get_dummies(df_data.Type1, dtype=int)
type2_dummies = pd.get_dummies(df_data.Type2, dtype=int)

# A plain `+` aligns on the union of columns and turns any type that occurs in
# only one of the two slots into an all-NaN column. add(..., fill_value=0)
# treats the missing side as 0 instead, and astype(int) restores integer dtype
# after the alignment.
Type_onehot = type1_dummies.add(type2_dummies, fill_value=0).astype(int)

df_data = pd.concat([df_data, Type_onehot], axis=1)

In [105]:
# Remove the columns that are not used as model features.
unused_columns = ['Name', 'Type1', 'Type2', 'Total']
df_data.drop(columns=unused_columns, inplace=True)

In [106]:
# Taken from Mingyu's code.
from sklearn.preprocessing import MinMaxScaler, RobustScaler

def preprocessingX(input_df):
    """Scale features with RobustScaler, then rescale the result with MinMaxScaler.

    Both scalers are fitted on ``input_df`` itself and applied to it.

    Args:
        input_df: Feature table accepted by scikit-learn's ``fit_transform``.

    Returns:
        The array produced by ``MinMaxScaler().fit_transform`` on the
        robust-scaled data.
    """
    robust_scaled = RobustScaler().fit_transform(input_df)
    # A PCA(n_components=3) reduction was sketched at this point but is not applied.
    return MinMaxScaler().fit_transform(robust_scaled)

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [109]:
# Features are every column except the target; the target is the Legendary flag.
# Note: despite its name, df_test holds the labels, not a held-out test set.
df_train = df_data.drop(columns=['Legendary'])
df_test = df_data['Legendary']

# Candidate hyper-parameters. n_estimators and random_state are pinned to a
# single value, so only max_depth and min_samples_split are actually searched.
rf_param_grid = {
    'n_estimators': [100],
    'max_depth': [5, 10, 13],
    'min_samples_split': [2, 5, 10],
    'random_state': [42],
}

# Exhaustive search over the grid, scored by accuracy with 10-fold cross-validation.
rf_grid = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=10, scoring='accuracy')
rf_grid.fit(df_train, df_test.values.ravel())

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 10, 13],
                         'min_samples_split': [2, 5, 10], 'n_estimators': [100],
                         'random_state': [42]},
             scoring='accuracy')

In [110]:
from sklearn.model_selection import cross_val_score

In [111]:
rf_best = rf_grid.best_estimator_

# A 2-D label array caused problems, so the labels are flattened with ravel().
labels = df_test.values.ravel()

print('----')
for model in (rf_best,):
    print(model.__class__)
    # Accuracy on the same rows that were passed to the grid search.
    train_accuracy = model.score(df_train, labels)
    print("train: {}".format(train_accuracy))
    print("----")

# Per-fold accuracy of the selected estimator under 10-fold cross-validation.
fold_scores = cross_val_score(rf_best, df_train, labels, cv=10, scoring='accuracy')
print(fold_scores)

----
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
train: 0.99
----
[0.975  0.95   0.9375 0.975  0.925  0.925  0.925  0.925  0.9625 0.925 ]
