In [1]:
## Importing the required libraries
import numpy as np
import pandas as pd
# Plotting libraries are left disabled: none of the cells below draws a figure.
# import matplotlib.pyplot as plt
# import seaborn as sns
import warnings
# NOTE(review): the 'ignore' action is installed without a category or module filter, so
# every warning (including deprecation and data-conversion warnings emitted by the
# libraries used in this notebook) is hidden for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
## Step 1: Importing data from source
## The CSV location can be overridden with the POKEMON_CSV environment variable so that the
## notebook also runs on other machines; when it is not set, the original local path is used.
import os

DATA_PATH = os.environ.get("POKEMON_CSV", "/Users/malarvizhis/Desktop/Pokemon.csv")
dataset = pd.read_csv(DATA_PATH)

In [3]:
## A first look at the data: leading rows, dimensions, row index and column labels
for summary in (dataset.head(5), dataset.shape, dataset.index, dataset.columns):
    print(summary)

   #                   Name Type 1  Type 2  Total  HP  Attack  Defense  \
0  1              Bulbasaur  Grass  Poison    318  45      49       49   
1  2                Ivysaur  Grass  Poison    405  60      62       63   
2  3               Venusaur  Grass  Poison    525  80      82       83   
3  3  VenusaurMega Venusaur  Grass  Poison    625  80     100      123   
4  4             Charmander   Fire     NaN    309  39      52       43   

   Sp. Atk  Sp. Def  Speed  Generation  Legendary  
0       65       65     45           1      False  
1       80       80     60           1      False  
2      100      100     80           1      False  
3      122      120     80           1      False  
4       60       50     65           1      False  
(800, 13)
RangeIndex(start=0, stop=800, step=1)
Index([u'#', u'Name', u'Type 1', u'Type 2', u'Total', u'HP', u'Attack',
       u'Defense', u'Sp. Atk', u'Sp. Def', u'Speed', u'Generation',
       u'Legendary'],
      dtype='object')


In [5]:
## Processing the Data
## Handling of missing values
## Here are two techniques to handle missing values.

# Technique 1: Delete every row that has a null value in any feature (and, as a rule of thumb,
# delete a column when more than 75% of its values are missing). This is advisable only when
# the dataset is large enough to absorb the loss.
## Note: Deletion of data leads to loss of information, which might affect the prediction results.

# dropna() returns a new frame, so `dataset` itself keeps all of its rows and the cells below
# see the full data regardless of whether (or in which order) this cell is run.
dataset_dropped = dataset.dropna()
print(dataset_dropped.isnull().sum())
print(dataset_dropped.shape)

## Result: nearly half of the rows of the source dataset are removed by this technique.

#             0
Name          0
Type 1        0
Type 2        0
Total         0
HP            0
Attack        0
Defense       0
Sp. Atk       0
Sp. Def       0
Speed         0
Generation    0
Legendary     0
dtype: int64
(414, 13)


In [4]:
## Since the dataset is quite small, let's try the alternative method for handling missing values.

## Technique 2 - Mean imputation. This can be applied to features that hold numerical data (years, measured values, ...).
## It is an approximation: it avoids losing rows, at the cost of distorting the column's distribution (the imputed values all sit on the mean).
## It is a standard technique; this dataset has a mix of numerical and categorical data, so it only applies to part of it.

## Numerical NaN
## In the given dataset the features below hold numerical values.
## Features: [Total, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed]
## Note: Generation is stored as a number but its values are categorical, so it is not considered here.

NUMERIC_FEATURES = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']


def fill_with_column_mean(frame, column_names):
    """Replace the missing values of each named column with that column's mean.

    Parameters:
        frame: DataFrame whose columns are overwritten (the frame is modified and returned).
        column_names: iterable of column labels to impute.

    Returns:
        The same `frame` object, for convenience.
    """
    for name in column_names:
        # Series.mean() skips NaN by default, so the mean is taken over the known values only.
        frame[name] = frame[name].fillna(frame[name].mean())
    return frame


dataset = fill_with_column_mean(dataset, NUMERIC_FEATURES)

print(dataset['Speed'])

# Only the categorical 'Type 2' column should still report missing values here.
print(dataset.isna().any())

0       45
1       60
2       80
3       80
4       65
5       80
6      100
7      100
8      100
9       43
10      58
11      78
12      78
13      45
14      30
15      70
16      50
17      35
18      75
19     145
20      56
21      71
22     101
23     121
24      72
25      97
26      70
27     100
28      55
29      80
      ... 
770     60
771    118
772    101
773     50
774     40
775     60
776     80
777     75
778     38
779     56
780     51
781     56
782     46
783     41
784     84
785     99
786     69
787     54
788     28
789     28
790     55
791    123
792     99
793     99
794     95
795     50
796    110
797     70
798     80
799     70
Name: Speed, Length: 800, dtype: int64
#             False
Name          False
Type 1        False
Type 2         True
Total         False
HP            False
Attack        False
Defense       False
Sp. Atk       False
Sp. Def       False
Speed         False
Generation    False
Legendary     False
dtype: bool


In [291]:
## Categorical NaN
## Handling categorical data is trickier, and we have to be careful about its pre-processing techniques. There are multiple ways for different use-cases.
## Our dataset has the following categorical features
## Features:[ Type 1, Type 2, Generation, Legendary]
## The major ways are the following

# 1. Replacing the missing values with the most frequently occurring value in that column.

# value_counts() leaves NaN out and sorts by descending frequency, so its first label is the mode.
most_common = dataset['Type 2'].value_counts().index[0]


def replace_most_common(x):
    """Return `most_common` when `x` is null, otherwise return `x` unchanged."""
    if pd.isnull(x):
        return most_common
    else:
        return x


dataset1 = dataset['Type 2'].map(replace_most_common)

## This is just for understanding; a different approach is used in the cells below.
## The imputed column is only stored in `dataset1`; `dataset` itself is not modified by this cell.

In [5]:
## 2. Label Encoding
## LabelEncoder encodes labels with a value between 0 and n_classes-1, where n_classes is the number of distinct labels.
## A label that repeats always receives the value it was assigned the first time.

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

dataset1 = dataset
ds = dataset1[['Type 1','Type 2','Generation','Legendary']]
print(dataset['Total'])

# One array column per categorical feature, in the order selected above.
X = ds.iloc[:,:4].values
print(X)

# Column 0 ('Type 1') is left as text on purpose: it is expanded with get_dummies further down.
# Missing 'Type 2' values are turned into the string 'nan' by astype(str) and therefore end up
# as a class of their own.
# NOTE: the same encoder object is re-fitted for every column, so after this cell it only
# remembers the mapping of the last column ('Legendary').
X[:,1]=label_encoder.fit_transform(X[:,1].astype(str))
X[:,2]=label_encoder.fit_transform(X[:,2])
X[:,3]=label_encoder.fit_transform(X[:,3])

columns = ['Type 1','Type 2','Total','HP','Attack','Defense','Sp. Atk','Sp. Def','Speed','Generation','Legendary']

# Frames built from the array must carry the row labels of `dataset`; otherwise they get a
# fresh 0..n-1 index and concat(axis=1), which aligns on labels, would pair the encoded values
# with the wrong rows (or introduce NaN rows) whenever `dataset` has a non-default index.
row_index = dataset.index

Type1 = pd.DataFrame(X[:,0], index=row_index)
Type2 = pd.DataFrame(X[:,1], index=row_index)
Total = pd.DataFrame(dataset['Total'])
HP = pd.DataFrame(dataset['HP'])
Attack = pd.DataFrame(dataset['Attack'])
Defense = pd.DataFrame(dataset['Defense'])
SpAtk = pd.DataFrame(dataset['Sp. Atk'])
SpDef = pd.DataFrame(dataset['Sp. Def'])
Speed = pd.DataFrame(dataset['Speed'])
Generation = pd.DataFrame(X[:,2], index=row_index)
Legendary = pd.DataFrame(X[:,3], index=row_index)

# The pieces are concatenated in exactly the order listed in `columns`.
encoded_dataset = pd.concat(
    [Type1, Type2, Total, HP, Attack, Defense, SpAtk, SpDef, Speed, Generation, Legendary],
    axis=1
)
encoded_dataset.columns = columns
print(encoded_dataset.columns)


print(encoded_dataset.isna().any())
## The problem here is that, since there are different numbers in the same column,
## the model will misunderstand the data to be in some kind of order, 0 < 1 < 2. But this isn't the case at all.
## To overcome this problem, we use One Hot Encoder.

0      318
1      405
2      525
3      625
4      309
5      405
6      534
7      634
8      634
9      314
10     405
11     530
12     630
13     195
14     205
15     395
16     195
17     205
18     395
19     495
20     251
21     349
22     479
23     579
24     253
25     413
26     262
27     442
28     288
29     438
      ... 
770    525
771    500
772    431
773    500
774    300
775    452
776    600
777    470
778    309
779    474
780    335
781    335
782    335
783    335
784    494
785    494
786    494
787    494
788    304
789    514
790    245
791    535
792    680
793    680
794    600
795    600
796    700
797    600
798    680
799    600
Name: Total, Length: 800, dtype: int64
[['Grass' 'Poison' 1 False]
 ['Grass' 'Poison' 1 False]
 ['Grass' 'Poison' 1 False]
 ...
 ['Psychic' 'Ghost' 6 True]
 ['Psychic' 'Dark' 6 True]
 ['Fire' 'Water' 6 True]]
Index([u'Type 1', u'Type 2', u'Total', u'HP', u'Attack', u'Defense',
       u'Sp. Atk', u'Sp. Def', u'Speed', u'Generati

In [6]:
## 3. One hot encoding - It is the most widespread approach, and it works very well unless your categorical variable takes on a large number of values
## (i.e. you generally won't use it for variables taking more than 15 different values. It'd be a poor choice in some cases with fewer values, though that varies.)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# X[:,1:] holds the columns [Type 2, Generation, Legendary]. Positions 1 and 2 (the last one)
# are one-hot encoded; position 0 is passed through unchanged and placed after the one-hot
# columns. OneHotEncoder no longer accepts a `categorical_features` argument, so the column
# selection is expressed with a ColumnTransformer instead.
# sparse_threshold=1.0 keeps the stacked result a sparse matrix.
onehotencoder = ColumnTransformer(
    [('onehot', OneHotEncoder(), [1, 2])],
    remainder='passthrough',
    sparse_threshold=1.0
)
X = onehotencoder.fit_transform(X[:,1:])
print(X)

## In this example, the categories Generation and Legendary are one-hot encoded;
## Type 2 keeps its label-encoded value and ends up in the last column of the result.


  (0, 6)	1.0
  (0, 0)	1.0
  (1, 6)	1.0
  (1, 0)	1.0
  (2, 6)	1.0
  (2, 0)	1.0
  (3, 6)	1.0
  (3, 0)	1.0
  (4, 6)	1.0
  (4, 0)	1.0
  (5, 6)	1.0
  (5, 0)	1.0
  (6, 6)	1.0
  (6, 0)	1.0
  (7, 6)	1.0
  (7, 0)	1.0
  (8, 6)	1.0
  (8, 0)	1.0
  (9, 6)	1.0
  (9, 0)	1.0
  (10, 6)	1.0
  (10, 0)	1.0
  (11, 6)	1.0
  (11, 0)	1.0
  (12, 6)	1.0
  :	:
  (775, 8)	18.0
  (776, 8)	18.0
  (777, 8)	4.0
  (778, 8)	9.0
  (779, 8)	9.0
  (780, 8)	9.0
  (781, 8)	9.0
  (782, 8)	9.0
  (783, 8)	9.0
  (784, 8)	9.0
  (785, 8)	9.0
  (786, 8)	9.0
  (787, 8)	9.0
  (788, 8)	18.0
  (789, 8)	18.0
  (790, 8)	2.0
  (791, 8)	2.0
  (792, 8)	18.0
  (793, 8)	7.0
  (794, 8)	10.0
  (795, 8)	4.0
  (796, 8)	4.0
  (797, 8)	8.0
  (798, 8)	1.0
  (799, 8)	17.0




In [6]:
## 4. Dummy variables are another way of handling categorical data, closely related to one hot encoding.
## A dummy variable takes the value 0 or 1 to indicate the absence or presence of a categorical effect that may be expected to shift the outcome.
## Number of new columns = number of category values

dummy = pd.get_dummies(encoded_dataset['Type 1'])
print(dummy.columns)

# Identifier columns first, then the encoded features, then one indicator column per 'Type 1'
# value; the textual 'Type 1' column itself is no longer needed once the indicators exist.
tdataset = dataset[['#', 'Name']]
transformed_dataset = pd.concat(
    [tdataset, encoded_dataset, dummy],
    axis=1
).drop(['Type 1'], axis=1)

print(transformed_dataset)



## 5. Sometimes KNN imputation is used (also for categorical variables): a missing value is
## filled in from a given number of records that are most similar to the record in which the value is missing.
## Similarity is determined using a distance function. We stop our experiment at dummies and do not apply it here.

Index([u'Bug', u'Dark', u'Dragon', u'Electric', u'Fairy', u'Fighting', u'Fire',
       u'Flying', u'Ghost', u'Grass', u'Ground', u'Ice', u'Normal', u'Poison',
       u'Psychic', u'Rock', u'Steel', u'Water'],
      dtype='object')
       #                       Name Type 2  Total   HP  Attack  Defense  \
0      1                  Bulbasaur     13    318   45      49       49   
1      2                    Ivysaur     13    405   60      62       63   
2      3                   Venusaur     13    525   80      82       83   
3      3      VenusaurMega Venusaur     13    625   80     100      123   
4      4                 Charmander     18    309   39      52       43   
5      5                 Charmeleon     18    405   58      64       58   
6      6                  Charizard      7    534   78      84       78   
7      6  CharizardMega Charizard X      2    634   78     130      111   
8      6  CharizardMega Charizard Y      7    634   78     104       78   
9      7            

In [19]:
print(transformed_dataset.columns)

# Indicator columns produced from 'Type 1'
type1_indicator_columns = [
    'Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire', 'Flying', 'Ghost',
    'Grass', 'Ground', 'Ice', 'Normal', 'Poison', 'Psychic', 'Rock', 'Steel', 'Water',
]

## The 'Name' column is left out of the features because '#' already identifies the row
feature_columns = [
    '#', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed',
    'Generation', 'Legendary',
] + type1_indicator_columns

X = transformed_dataset[feature_columns]
print(X.isna().any())

# Target: the label-encoded 'Type 2', kept as a one-column frame of floats
y = transformed_dataset[['Type 2']].astype('float')
print(y.isna().any())

Index([u'#', u'Name', u'Type 2', u'Total', u'HP', u'Attack', u'Defense',
       u'Sp. Atk', u'Sp. Def', u'Speed', u'Generation', u'Legendary', u'Bug',
       u'Dark', u'Dragon', u'Electric', u'Fairy', u'Fighting', u'Fire',
       u'Flying', u'Ghost', u'Grass', u'Ground', u'Ice', u'Normal', u'Poison',
       u'Psychic', u'Rock', u'Steel', u'Water'],
      dtype='object')
#             False
Total         False
HP            False
Attack        False
Defense       False
Sp. Atk       False
Sp. Def       False
Speed         False
Generation    False
Legendary     False
Bug           False
Dark          False
Dragon        False
Electric      False
Fairy         False
Fighting      False
Fire          False
Flying        False
Ghost         False
Grass         False
Ground        False
Ice           False
Normal        False
Poison        False
Psychic       False
Rock          False
Steel         False
Water         False
dtype: bool
Type 2    False
dtype: bool


In [20]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Fixed seed: without it every run draws a different split, so the scores computed
# from it cannot be reproduced.
SPLIT_SEED = 42

# Split dataset into training set and test set: 70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)


In [21]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a random forest classifier made of 1000 trees.
#The fixed random_state makes the fitted forest (and its predictions) repeatable.
clf=RandomForestClassifier(n_estimators=1000, random_state=42)

#Train the model using the training set.
#np.ravel flattens the target to the 1-D shape estimators expect, whether y_train
#arrives as a one-column frame or as a series.
clf.fit(X_train, np.ravel(y_train))

#Predict the class of every row of the test set
y_pred=clf.predict(X_test)

In [22]:
# scikit-learn helpers used for scoring the predictions
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Model accuracy: the fraction of test rows whose class was predicted correctly
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Rows of the confusion matrix are the true classes, columns the predicted classes
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator as floats, NaN wherever the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    ratio = np.full(numerator.shape, np.nan)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return ratio


# Work on a plain float ndarray. With np.matrix the row/column sums stay 2-D, and subtracting
# the 1-D diagonal broadcasts them into N x N matrices instead of one value per class.
# Floats also keep `/` from truncating on interpreters where integer division floors.
conf_mat = np.asarray(conf_mat, dtype=float)

# One-vs-rest counts, one entry per class
TP = np.diag(conf_mat)                    # predicted as the class and truly the class
FP = conf_mat.sum(axis=0) - TP            # predicted as the class (column total) but wrong
FN = conf_mat.sum(axis=1) - TP            # truly the class (row total) but predicted otherwise
TN = conf_mat.sum() - (FP + FN + TP)      # everything that involves the class in neither way

# Sensitivity, hit rate, recall, or true positive rate
TPR = _safe_ratio(TP, TP + FN)
# Specificity or true negative rate
TNR = _safe_ratio(TN, TN + FP)
# Precision or positive predictive value
PPV = _safe_ratio(TP, TP + FP)
# Negative predictive value
NPV = _safe_ratio(TN, TN + FN)
# Fall out or false positive rate
FPR = _safe_ratio(FP, FP + TN)
# False negative rate
FNR = _safe_ratio(FN, TP + FN)
# False discovery rate
FDR = _safe_ratio(FP, TP + FP)

# Per-class (one-vs-rest) accuracy
ACC = _safe_ratio(TP + TN, TP + FP + FN + TN)

print('ACC',ACC)

('Accuracy:', 0.49166666666666664)
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    1]
 [  0   0   0   0   0   0   0   1   1   0   0   0   0   0   0   0   0   0
    5]
 [  0   0   0   0   0   0   0   2   0   1   0   0   0   0   0   0   0   0
    5]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    2]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    4]
 [  0   0   0   0   0   1   0   1   0   0   0   0   0   0   0   0   0   0
    7]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    2]
 [  0   0   0   0   0   0   0   8   0   0   0   0   0   0   0   0   1   0
   18]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    3]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   1   0
    5]
 [  0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   1   0
    8]
 [  0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0


In [14]:
## This exercise is meant to demonstrate pre-processing techniques only; the model reaches only around 50% accuracy for now.
## To improve its accuracy we would need more data as well as hyper-parameter tuning of the algorithm.

## The overall problem that this solution addresses is predicting the Type 2 of a Pokemon from the other features in the dataset.