In [68]:
## Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [69]:
## Step 1: Importing data from source
dataset = pd.read_csv("../input/Pokemon.csv")

In [70]:
## Analyzing the structure and aspects of data
print(dataset.head(5))
print(dataset.shape)
print(dataset.index)
print(dataset.columns)

   #                   Name Type 1    ...     Speed  Generation  Legendary
0  1              Bulbasaur  Grass    ...        45           1      False
1  2                Ivysaur  Grass    ...        60           1      False
2  3               Venusaur  Grass    ...        80           1      False
3  3  VenusaurMega Venusaur  Grass    ...        80           1      False
4  4             Charmander   Fire    ...        65           1      False

[5 rows x 13 columns]
(800, 13)
RangeIndex(start=0, stop=800, step=1)
Index(['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
       'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')


In [71]:
## Since the dataset is quite minimal, Let's try the alternative method for handling missing values

## Technique 2 - This can be applied on features which has numerical data like year, values etc. This is an approximation which adds variance to the dataset but can avoid loss of data
## It's a standard technique and for this dataset we have mix of both numerical and categorical data.

## Numerical NaN 
## In the given dataset below are the features with numerical values

## Features :[Total,HP,Attack,Defense,Sp.Atk,Sp.Def,Speed]
## Note: Generation is a numerical value but those values are categorical so we are not considering it.

# print(dataset['Total'].mean())
# print(dataset['Total'].tail())

dataset['Total']= dataset['Total'].fillna(dataset['Total'].mean())



## Similar technique to be adopted for other numerical columns

# print(dataset['HP'].mean())
# print(dataset['HP'].tail())

dataset['HP']= dataset['HP'].replace(np.NaN,dataset['HP'].mean())

# print(dataset['Attack'].mean())
# print(dataset['Attack'].tail())

dataset['Attack'] = dataset['Attack'].replace(np.NaN,dataset['Attack'].mean())
# print(dataset['Defense'].mean())
# print(dataset['Defense'].tail())

dataset['Defense'] = dataset['Defense'].replace(np.NaN,dataset['Defense'].mean())
# print(dataset['Sp. Atk'].mean())
# print(dataset['Sp. Atk'].tail())

dataset['Sp. Atk'] = dataset['Sp. Atk'].replace(np.NaN,dataset['Sp. Atk'].mean())
# print(dataset['Sp. Def'].mean())
# print(dataset['Sp. Def'].tail())

dataset['Sp. Def'] = dataset['Sp. Def'].replace(np.NaN,dataset['Sp. Def'].mean())

# print(dataset['Speed'].mean())
# print(dataset['Speed'].tail())

dataset['Speed'] = dataset['Speed'].replace(np.NaN,dataset['Speed'].mean())

print(dataset['Speed'])
print(dataset.isna().any())


0       45
1       60
2       80
3       80
4       65
5       80
6      100
7      100
8      100
9       43
10      58
11      78
12      78
13      45
14      30
15      70
16      50
17      35
18      75
19     145
20      56
21      71
22     101
23     121
24      72
25      97
26      70
27     100
28      55
29      80
      ... 
770     60
771    118
772    101
773     50
774     40
775     60
776     80
777     75
778     38
779     56
780     51
781     56
782     46
783     41
784     84
785     99
786     69
787     54
788     28
789     28
790     55
791    123
792     99
793     99
794     95
795     50
796    110
797     70
798     80
799     70
Name: Speed, Length: 800, dtype: int64
#             False
Name          False
Type 1        False
Type 2         True
Total         False
HP            False
Attack        False
Defense       False
Sp. Atk       False
Sp. Def       False
Speed         False
Generation    False
Legendary     False
dtype: bool


In [72]:
## 2. Label Encoding 
##  LabelEncoder encode labels with a value between 0 and n_classes-1 where n is the number of distinct labels. If a label repeats it assigns the same value to as assigned earlier.

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

dataset1 = dataset
ds = dataset1[['Type 1','Type 2','Generation','Legendary']]
print(dataset['Total'])
#print(ds)
X = ds.iloc[:,:4].values
print(X)
#print(dataset.tail())

X[:,0]=label_encoder.fit_transform(X[:,0])
#print(X)
#X[:,1]=label_encoder.fit_transform(X[:,1].astype(str))
X[:,2]=label_encoder.fit_transform(X[:,2])
X[:,3]=label_encoder.fit_transform(X[:,3])

##print(X[:,1])
 
columns = ['Type 1','Type 2','Total','HP','Attack','Defense','Sp. Atk','Sp. Def','Speed','Generation','Legendary']

Type1 = pd.DataFrame(X[:,0])
Type2 = pd.DataFrame(X[:,1])
Total = pd.DataFrame(dataset['Total'])
HP = pd.DataFrame(dataset['HP'])
Attack = pd.DataFrame(dataset['Attack'])
Defense = pd.DataFrame(dataset['Defense'])
SpAtk = pd.DataFrame(dataset['Sp. Atk'])
SpDef= pd.DataFrame(dataset['Sp. Def'])
Speed= pd.DataFrame(dataset['Speed'])
Generation= pd.DataFrame(X[:,2])
Legendary= pd.DataFrame(X[:,3])

encoded_dataset = pd.DataFrame()
encoded_dataset = pd.concat([encoded_dataset,Type1,Type2,Total,HP,Attack,Defense,SpAtk,SpDef,Speed,Generation,Legendary],axis =1)
encoded_dataset.columns = columns
print(encoded_dataset.columns)
## The problem here is, since there are different numbers in the same column, 
## the model will misunderstand the data to be in some kind of order, 0 < 1 < 2. But this isn’t the case at all. 
## To overcome this problem, we use One Hot Encoder.

0      318
1      405
2      525
3      625
4      309
5      405
6      534
7      634
8      634
9      314
10     405
11     530
12     630
13     195
14     205
15     395
16     195
17     205
18     395
19     495
20     251
21     349
22     479
23     579
24     253
25     413
26     262
27     442
28     288
29     438
      ... 
770    525
771    500
772    431
773    500
774    300
775    452
776    600
777    470
778    309
779    474
780    335
781    335
782    335
783    335
784    494
785    494
786    494
787    494
788    304
789    514
790    245
791    535
792    680
793    680
794    600
795    600
796    700
797    600
798    680
799    600
Name: Total, Length: 800, dtype: int64
[['Grass' 'Poison' 1 False]
 ['Grass' 'Poison' 1 False]
 ['Grass' 'Poison' 1 False]
 ...
 ['Psychic' 'Ghost' 6 True]
 ['Psychic' 'Dark' 6 True]
 ['Fire' 'Water' 6 True]]
Index(['Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk',
       'Sp. Def', 'Speed', 'Generation', 'Lege

In [73]:
## 4. Creating dummies is another method of handling categorical data and it is somewhat similar to one hot encoding 
## Dummy Variables is one that takes the value 0 or 1 to indicate the absence or presence of some categorical effect that may be expected to shift the outcome.
## Number of columns = number of category values

dummy = pd.get_dummies(encoded_dataset['Type 2'])
print(dummy.columns)
tdataset = dataset[['#', 'Name']]
transformed_dataset = pd.concat([tdataset,encoded_dataset],axis = 1)
transformed_dataset = pd.concat([transformed_dataset,dummy],axis =1)
transformed_dataset = transformed_dataset.drop(['Type 2'],axis = 1)

print(transformed_dataset)



## 5. Sometimes, we use KNN Imputation(for Categorical variables): In this method of imputation, 
## the missing values of an attribute are imputed using the given number of attributes that are most similar to the attribute whose values are missing. 
## The similarity of two attributes is determined using a distance function, but we are going to stop our experiment only with dummies.

Index(['Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire',
       'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison',
       'Psychic', 'Rock', 'Steel', 'Water'],
      dtype='object')
       #                       Name Type 1  ...    Rock  Steel  Water
0      1                  Bulbasaur      9  ...       0      0      0
1      2                    Ivysaur      9  ...       0      0      0
2      3                   Venusaur      9  ...       0      0      0
3      3      VenusaurMega Venusaur      9  ...       0      0      0
4      4                 Charmander      6  ...       0      0      0
5      5                 Charmeleon      6  ...       0      0      0
6      6                  Charizard      6  ...       0      0      0
7      6  CharizardMega Charizard X      6  ...       0      0      0
8      6  CharizardMega Charizard Y      6  ...       0      0      0
9      7                   Squirtle     17  ...       0      0      0
10     8          

In [74]:
# 'Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire',
#        'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison',
#        'Psychic', 'Rock', 'Steel', 'Water'
print(transformed_dataset.columns)

## Eliminating the name columns as we have '#' 
X = transformed_dataset[['#','Total','HP','Attack','Defense','Sp. Atk',
       'Sp. Def', 'Speed', 'Generation','Legendary','Bug', 'Dark', 'Dragon',
       'Electric', 'Fairy', 'Fighting', 'Fire', 'Flying', 'Ghost', 'Grass',
       'Ground', 'Ice', 'Normal', 'Poison', 'Psychic', 'Rock', 'Steel',
       'Water']]
y = transformed_dataset[['Type 1']]
y=y.astype('long')
print(X.isna().any())
print(y.isna().any())

print(np.where(y.values >= np.finfo(np.float64).max))

Index(['#', 'Name', 'Type 1', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk',
       'Sp. Def', 'Speed', 'Generation', 'Legendary', 'Bug', 'Dark', 'Dragon',
       'Electric', 'Fairy', 'Fighting', 'Fire', 'Flying', 'Ghost', 'Grass',
       'Ground', 'Ice', 'Normal', 'Poison', 'Psychic', 'Rock', 'Steel',
       'Water'],
      dtype='object')
#             False
Total         False
HP            False
Attack        False
Defense       False
Sp. Atk       False
Sp. Def       False
Speed         False
Generation    False
Legendary     False
Bug           False
Dark          False
Dragon        False
Electric      False
Fairy         False
Fighting      False
Fire          False
Flying        False
Ghost         False
Grass         False
Ground        False
Ice           False
Normal        False
Poison        False
Psychic       False
Rock          False
Steel         False
Water         False
dtype: bool
Type 1    False
dtype: bool
(array([], dtype=int64), array([], dtype=int64))


In [None]:
# Import train_test_split function
from sklearn.model_selection import train_test_split
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test


In [75]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [76]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
#print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

conf_mat=np.matrix(conf_mat)
FP = conf_mat.sum(axis=0) - np.diag(conf_mat)  
FN = conf_mat.sum(axis=1) - np.diag(conf_mat)
TP = np.diag(conf_mat)
TN = conf_mat.sum() - (FP + FN + TP)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Precision or positive predictive value
PPV = TP/(TP+FP)
# Negative predictive value
NPV = TN/(TN+FN)
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)
# False discovery rate
FDR = FP/(TP+FP)

# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)

print('ACC',ACC)

[[ 8  0  0  0  0  0  0  1  1  0  0  1  0  0  0  1  1]
 [ 1  0  0  0  1  0  1  1  0  0  0  6  0  0  0  0  1]
 [ 1  0  6  1  0  0  0  0  0  0  0  2  0  1  0  0  1]
 [ 1  0  0  2  0  0  4  0  0  0  0  1  0  1  0  0  5]
 [ 0  0  0  0  1  0  0  0  0  0  0  2  0  1  0  0  2]
 [ 0  1  0  0  0  3  0  0  1  0  1  3  1  0  0  0  0]
 [ 1  0  0  3  0  1  6  0  2  0  0  1  0  1  0  1  3]
 [ 1  0  0  1  0  0  0  3  2  0  0  1  0  1  0  0  1]
 [ 2  0  0  0  0  0  2  0  4  0  0  3  0  3  1  0  7]
 [ 2  0  0  0  0  0  0  0  0  1  1  4  0  0  2  2  2]
 [ 0  0  0  2  0  0  1  0  0  0  0  2  0  0  0  0  4]
 [ 2  0  1  0  0  1  1  0  2  0  0 18  1  0  1  0  1]
 [ 0  0  0  0  0  0  0  0  2  0  0  1  2  0  0  0  2]
 [ 0  0  1  1  0  0  0  0  1  0  0  2  0  8  0  0  4]
 [ 0  0  0  0  0  0  0  0  1  0  0  0  0  2  3  0  4]
 [ 0  1  0  1  0  0  0  0  3  0  0  0  0  0  0  0  0]
 [ 3  1  0  3  1  0  3  0  4  1  0  3  0  2  1  1 10]]
ACC [[0.92083333 0.93333333 0.9625     0.90416667 0.94166667 0.95
  0.92083333 0.

In [None]:
## This exercise of work is for demonstrating pre-processing techniques only, The model can give around 50% accuracy for now.
## We got to apply some more data to make it improve it's accuracy as well hyper tuning of parameters in the algorithm.

## The overall problem that the solution covers is to identify type 1 of the pokemon using other features in the dataset.