<a href="https://colab.research.google.com/github/Nickguild1993/Practices_python_ML/blob/main/Pokemon_DS_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pokemon Dataset from Kaggle for Python w/ Pandas for tutorial on cleaning, exploration, and analysis

Import dependencies (AKA packages in R). Think of Python as a toolbelt and these dependencies (or libraries) are additonal tools that allow to perform specific tasks.

In [1]:
# import dependencies 

# we "alias" the dependencies so we don't have to type out the full name. (import XXXXX as X)
# it's not required, but uh, there is no reason to make your code more efficent and concise.

# fun fact- pandas is shorthand for panel data
import pandas as pd
# numpy is what pandas is built on, it's older code, but still has unique and useful functions
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
# seaborn is a suped up version of matplotlib that gives you more control/options for visualizations
import seaborn as sns

Uploading the DataSet. This one is located locally, but in Colab, you can pull from your GDrive (most complex method, wouldn't really recommend), or by pulling it from your GitHub- which is the easiest method.

https://towardsdatascience.com/3-ways-to-load-csv-files-into-colab-7c14fcbdcb92#:~:text=Click%20on%20the%20dataset%20in,read_csv%20to%20get%20the%20dataframe.

In [2]:
# Upload dataset.  This one is stored locally

from google.colab import files
uploaded = files.upload()

Saving Pokemon.csv to Pokemon.csv


In [3]:
# read in data and set it as a DataFrame (tabular data format akin to an excel table)
import io

df = pd.read_csv(io.BytesIO(uploaded["Pokemon.csv"]))

In [4]:
# get a look at the data using .head() 
# default is 5 rows, but you put a numeric value in the () for a different amount

df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


### Data Cleaning / Exploration

In [5]:
# checking shape of dataset w/ .shape

print("The number of rows and columns are", df.shape, "respectfully.")

The number of rows and columns are (800, 13) respectfully.


In [6]:
# because we have already have the index serving as a "pokemon #", let's get rid of the "#" column

df = df.drop(columns=["#"])

# because we updated the dataframe (df) it will update globally, not just in this block

In [7]:
# Checking the DF to ensure the the column dropped correctly.  
# It's a good idea to not run the .head()/.tail()/ .sample() functions in the same cell
# because it can screw up your dataframe if you run the code twice. 

df.sample(3)


Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
593,Gurdurr,Fighting,,405,85,105,85,40,50,40,5,False
570,Pansage,Grass,,316,50,53,48,53,48,64,5,False
150,Omastar,Rock,Water,495,70,60,125,115,70,55,1,False


In [8]:
# Getting rid of NaN values 
# NaN values are "blank" values that can mess up analysis because they're not iterable

# You can tell that the "Type 2" has NaN values, but let's check the entire dataframe to make sure 
# that there aren't any others.

# using df. becuase that's the name of our DataFrame. if it was named "pokemon_table"
# we'd do:    display(pokemon_table.isnull().sum())
display(df.isnull().sum())

Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [9]:
# So, we have two options.  1). We could just get rid of all the values (pokemon) that have 
# NaN values for Type 2.  But that would delete more than 1/3 of the data and make any
# analysis shoddy, at best.

# Option two is use the replace function. This will replace a defined value with a new one.

# because the pokemon with a Type 2 value is a string (data type that means words, basically)
# we want to replace the NaN values with a string so we can evaluate the column easily. We'll use "None"

values = {"Type 2" : "None"}

# using df = .... because if we don't "update" the DataFrame (df) then running the function
# won't apply when you call the df outside of this cell (meaning it's not globalized)

df = df.fillna(value=values)

In [10]:
# checking to make sure we changed all the NaN values

display(df.isnull().sum())

# ladies and gentlemen, we got him.

Name          0
Type 1        0
Type 2        0
Total         0
HP            0
Attack        0
Defense       0
Sp. Atk       0
Sp. Def       0
Speed         0
Generation    0
Legendary     0
dtype: int64

### Examples of selecting specific column or rows that meet a specific critieria.

In [11]:
# selecting for a specific column value

df[df["Legendary"]==True].head(5) 

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
156,Articuno,Ice,Flying,580,90,85,100,95,125,85,1,True
157,Zapdos,Electric,Flying,580,90,90,85,125,90,100,1,True
158,Moltres,Fire,Flying,580,90,100,90,125,85,90,1,True
162,Mewtwo,Psychic,,680,106,110,90,154,90,130,1,True
163,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1,True


In [12]:
# creating a subset of only fire type pokemon

df_fire_type = df[df["Type 1"] == "Fire"]
print(df_fire_type.head(3))

print(df_fire_type.shape)
print("There are 52 pokemon that identify as Fire Pokemon")

         Name Type 1  Type 2  Total  ...  Sp. Def  Speed  Generation  Legendary
4  Charmander   Fire    None    309  ...       50     65           1      False
5  Charmeleon   Fire    None    405  ...       65     80           1      False
6   Charizard   Fire  Flying    534  ...       85    100           1      False

[3 rows x 12 columns]
(52, 12)
There are 52 pokemon that identify as Fire Pokemon


In [13]:
# Addiotional example of doing a sub-select query.  

hp_check = df[df["HP"] < 85]
print(hp_check.head())

print(hp_check.shape)
print("The number of pokemon who have an HP value less than 85 is:", 615)

                    Name Type 1  Type 2  ...  Speed  Generation  Legendary
0              Bulbasaur  Grass  Poison  ...     45           1      False
1                Ivysaur  Grass  Poison  ...     60           1      False
2               Venusaur  Grass  Poison  ...     80           1      False
3  VenusaurMega Venusaur  Grass  Poison  ...     80           1      False
4             Charmander   Fire    None  ...     65           1      False

[5 rows x 12 columns]
(615, 12)
The number of pokemon who have an HP value less than 85 is: 615


In [15]:
# One more sub select query example - selecting all pokemon that are generation 1.

df_gen_one = df[df["Generation"]== 1]
print(df_gen_one)

                      Name   Type 1    Type 2  ...  Speed  Generation  Legendary
0                Bulbasaur    Grass    Poison  ...     45           1      False
1                  Ivysaur    Grass    Poison  ...     60           1      False
2                 Venusaur    Grass    Poison  ...     80           1      False
3    VenusaurMega Venusaur    Grass    Poison  ...     80           1      False
4               Charmander     Fire      None  ...     65           1      False
..                     ...      ...       ...  ...    ...         ...        ...
161              Dragonite   Dragon    Flying  ...     80           1      False
162                 Mewtwo  Psychic      None  ...    130           1       True
163    MewtwoMega Mewtwo X  Psychic  Fighting  ...    130           1       True
164    MewtwoMega Mewtwo Y  Psychic      None  ...    140           1       True
165                    Mew  Psychic      None  ...    100           1      False

[166 rows x 12 columns]


In [16]:
# Because we grew up with only generation 1 pokemon, I want to remove any values
# that "generation" != (python operator for does not match) generation =1

# Making a new variable- df_1 to represent this subset from the original df
# Also, this will prevent you from screwing up your original dataframe if you make a mistake!

df_1 = df

In [17]:
# Getting rid of all Pokemon that are not generation 1.

index_names = df_1[df_1["Generation"] != 1].index

df_1.drop(index_names, inplace=True)

df_1.head(3)

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False


In [18]:
# Check the shape of the subset that we're going to work with.

print(df_1.shape)

# since it works, we'll update df to be in line with df_1.

df = df_1

(166, 12)


### .iloc[ ] usage

In [19]:
# Selecting a specific row(s) using .iloc[]

# index selecting in pandas starts at 0
df.iloc[0]

Name          Bulbasaur
Type 1            Grass
Type 2           Poison
Total               318
HP                   45
Attack               49
Defense              49
Sp. Atk              65
Sp. Def              65
Speed                45
Generation            1
Legendary         False
Name: 0, dtype: object

In [21]:
# Retrieving a column with .iloc[]

df.iloc[:,0]

df.iloc[:,0]

0                  Bulbasaur
1                    Ivysaur
2                   Venusaur
3      VenusaurMega Venusaur
4                 Charmander
               ...          
161                Dragonite
162                   Mewtwo
163      MewtwoMega Mewtwo X
164      MewtwoMega Mewtwo Y
165                      Mew
Name: Name, Length: 166, dtype: object

In [24]:
# Another select column example

df.iloc[:,2:3]

Unnamed: 0,Type 2
0,Poison
1,Poison
2,Poison
3,Poison
4,
...,...
161,Flying
162,
163,Fighting
164,


In [26]:
# selecting the first 5 entries in column 0 (the first column)

# you don't have to put the :1 after the 0, but if you don't it won't return as a DataFrame
df.iloc[:5, 0:5]


Unnamed: 0,Name,Type 1,Type 2,Total,HP
0,Bulbasaur,Grass,Poison,318,45
1,Ivysaur,Grass,Poison,405,60
2,Venusaur,Grass,Poison,525,80
3,VenusaurMega Venusaur,Grass,Poison,625,80
4,Charmander,Fire,,309,39


In [28]:
# selecting multiple rows with mutliple columns

df.iloc[:4, 0:3]

Unnamed: 0,Name,Type 1,Type 2
0,Bulbasaur,Grass,Poison
1,Ivysaur,Grass,Poison
2,Venusaur,Grass,Poison
3,VenusaurMega Venusaur,Grass,Poison


In [27]:
df.iloc[:4,4:9] 

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def
0,45,49,49,65,65
1,60,62,63,80,80
2,80,82,83,100,100
3,80,100,123,122,120


In [29]:
# Can also use negative numbers in selection.  It starts counting forward (up) from last value (row)

# didn't specify which columns I wanted, so it defaulted to all columns
df.iloc[-3:]


Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
163,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1,True
164,MewtwoMega Mewtwo Y,Psychic,,780,106,150,70,194,120,140,1,True
165,Mew,Psychic,,600,100,100,100,100,100,100,1,False


### .loc[ ] usage

In [43]:
# with .loc[] you can use LABELS (strings) instead of just numeric values like with .iloc[]

print(df.iloc[154, 11], "using .iloc[]")

print(df.loc[164, "Legendary"], "using .loc[]")

print("You get the same value using .iloc[] and .loc[] but the method is different")

False using .iloc[]
True using .loc[]
You get the same value using .iloc[] and .loc[] but the method is different


In [46]:
# using .loc[] to get all rows for a specific column

df.loc[:, "Name"]

0                  Bulbasaur
1                    Ivysaur
2                   Venusaur
3      VenusaurMega Venusaur
4                 Charmander
               ...          
161                Dragonite
162                   Mewtwo
163      MewtwoMega Mewtwo X
164      MewtwoMega Mewtwo Y
165                      Mew
Name: Name, Length: 166, dtype: object

In [47]:
# can use .loc[] to get more than more than one column for x row

# Note that .loc[] IS INCLUSIVE, unlike .iloc[] which is NOT INCLUSIVE (meaning does it include the last number in the range
# id est. "inclusive". If it doesn't include last range value, it's EXCLUSIVE)
df.loc[:5,"Name": "Generation"]

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1
4,Charmander,Fire,,309,39,52,43,60,50,65,1
5,Charmeleon,Fire,,405,58,64,58,80,65,80,1


### Using .groupby()

In [51]:
# Using .groupby()

pokemon_by_type = df.groupby("Type 1")["Name"].count()
# the first selection (  ), is the column that you want to group on.
# the second selection [   ], is the column on which that you want to perform the aggregation
pokemon_by_type

Type 1
Bug         14
Dragon       3
Electric     9
Fairy        2
Fighting     7
Fire        14
Ghost        4
Grass       13
Ground       8
Ice          2
Normal      24
Poison      14
Psychic     11
Rock        10
Water       31
Name: Name, dtype: int64

In [58]:
pokemon_by_type.sort_values(ascending= False)


Type 1
Water       31
Normal      24
Bug         14
Fire        14
Poison      14
Grass       13
Psychic     11
Rock        10
Electric     9
Ground       8
Fighting     7
Ghost        4
Dragon       3
Fairy        2
Ice          2
Name: Name, dtype: int64

In [55]:
# You can use multiple columns

df.groupby(["Type 1", "Type 2"])["Name"].count().sort_values(ascending= False)

Type 1    Type 2  
Water     None        19
Normal    None        13
Grass     Poison      10
Poison    None        10
Fire      None        10
Psychic   None         9
Normal    Flying       9
Fighting  None         7
Bug       Poison       6
Electric  None         6
Ground    None         6
Water     Psychic      4
Rock      Water        4
          Ground       4
Ghost     Poison       4
Water     Ice          3
Bug       Flying       3
          None         3
Fire      Flying       3
Grass     Psychic      2
Rock      Flying       2
Electric  Steel        2
Fairy     None         2
Poison    Ground       2
          Flying       2
Normal    Fairy        2
Water     Poison       2
Bug       Grass        2
Ground    Rock         2
Dragon    None         2
Water     Dark         1
Dragon    Flying       1
Water     Flying       1
          Fighting     1
Psychic   Fighting     1
Electric  Flying       1
Grass     None         1
Psychic   Fairy        1
Fire      Dragon       1
Ice   

In [None]:
# performing 