Importing Pandas into Python and Reading Data (CSV)

In [1]:
import pandas as pd
# I will use 'df' to reference the dataset
df = pd.read_csv('pokemon_data.csv')

<font size="6">Reading the Data</font>

<font size="5">Headers</font>

In [3]:
df.columns

Index(['#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk',
       'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')

<font size="5">Printing out the first couple of rows</font>

In [2]:
df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False


In [5]:
## You can also pick the amount of rows
df.head(10)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False
5,5,Charmeleon,Fire,,58,64,58,80,65,80,1,False
6,6,Charizard,Fire,Flying,78,84,78,109,85,100,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,78,104,78,159,115,100,1,False
9,7,Squirtle,Water,,44,48,65,50,64,43,1,False


<font size="5">Read specific columns</font>

In [4]:
df['Name'].head(5)

0                Bulbasaur
1                  Ivysaur
2                 Venusaur
3    VenusaurMega Venusaur
4               Charmander
Name: Name, dtype: object

In [7]:
## If you want to get a specific indexes, you can add [?:?]
df['Name'][0:10]

0                    Bulbasaur
1                      Ivysaur
2                     Venusaur
3        VenusaurMega Venusaur
4                   Charmander
5                   Charmeleon
6                    Charizard
7    CharizardMega Charizard X
8    CharizardMega Charizard Y
9                     Squirtle
Name: Name, dtype: object

<font size="5">If you want to get more than one column, you can use double brackets</font>

In [5]:
## The double brackets before and after allow you to pcik as many columns as you want in any order
df[['Name', 'Type 1', 'Generation']].head(5)

Unnamed: 0,Name,Type 1,Generation
0,Bulbasaur,Grass,1
1,Ivysaur,Grass,1
2,Venusaur,Grass,1
3,VenusaurMega Venusaur,Grass,1
4,Charmander,Fire,1


In [6]:
## Same as above, just a different order
df[['Name', 'Generation', 'Type 1']].head(5)

Unnamed: 0,Name,Generation,Type 1
0,Bulbasaur,1,Grass
1,Ivysaur,1,Grass
2,Venusaur,1,Grass
3,VenusaurMega Venusaur,1,Grass
4,Charmander,1,Fire


<font size="5">You can still grab specific indexes even if you add extra columns</font>

In [10]:
df[['Name', 'Type 1', 'Generation']][5:13]

Unnamed: 0,Name,Type 1,Generation
5,Charmeleon,Fire,1
6,Charizard,Fire,1
7,CharizardMega Charizard X,Fire,1
8,CharizardMega Charizard Y,Fire,1
9,Squirtle,Water,1
10,Wartortle,Water,1
11,Blastoise,Water,1
12,BlastoiseMega Blastoise,Water,1


<font size="5">Read each row</font>

In [11]:
df.iloc[3]

#                                 3
Name          VenusaurMega Venusaur
Type 1                        Grass
Type 2                       Poison
HP                               80
Attack                          100
Defense                         123
Sp. Atk                         122
Sp. Def                         120
Speed                            80
Generation                        1
Legendary                     False
Name: 3, dtype: object

In [12]:
## You can pick more than one row to read as well
df.iloc[3:5]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False


In [7]:
## The below code lets you iterrate through rows using a for loop. Apparently it is very useful.
for index, row in df.iterrows():
    print(index, row['Name'])

In [8]:
## This works the same as above but it does not look very good (and doesn't seem useful really)
for index, row in df.iterrows():
    print(index, row[['Name', 'Generation']])

<font size="5">Getting specifics from the rows</font>

In [10]:
## Using this allows you to print out entire rows from the data that is specific to what you need
df.loc[df['Type 1'] == 'Water'].head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
9,7,Squirtle,Water,,44,48,65,50,64,43,1,False
10,8,Wartortle,Water,,59,63,80,65,80,58,1,False
11,9,Blastoise,Water,,79,83,100,85,105,78,1,False
12,9,BlastoiseMega Blastoise,Water,,79,103,120,135,115,78,1,False
59,54,Psyduck,Water,,50,52,48,65,50,55,1,False


In [16]:
## Integars work as well
df.loc[df['HP'] == 1]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
316,292,Shedinja,Bug,Ghost,1,90,45,30,30,40,3,False


In [17]:
## You can also get multiple conditions
df.loc[(df['Type 1'] == 'Water') & (df['Generation'] == 3)]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
280,258,Mudkip,Water,,50,70,50,50,50,40,3,False
281,259,Marshtomp,Water,Ground,70,85,70,60,70,50,3,False
282,260,Swampert,Water,Ground,100,110,90,85,90,60,3,False
283,260,SwampertMega Swampert,Water,Ground,100,150,110,95,110,70,3,False
293,270,Lotad,Water,Grass,40,30,30,40,50,30,3,False
294,271,Lombre,Water,Grass,60,50,50,60,70,50,3,False
295,272,Ludicolo,Water,Grass,80,70,70,90,100,70,3,False
301,278,Wingull,Water,Flying,40,30,30,55,30,85,3,False
302,279,Pelipper,Water,Flying,60,50,100,85,70,65,3,False
347,318,Carvanha,Water,Dark,45,90,20,65,20,65,3,False


<font size="6">Sorting/Describing Data</font>

<font size="5">Describing the data</font>

In [18]:
df.describe()

Unnamed: 0,#,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,362.81375,69.25875,79.00125,73.8425,72.82,71.9025,68.2775,3.32375
std,208.343798,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474,1.66129
min,1.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0
25%,184.75,50.0,55.0,50.0,49.75,50.0,45.0,2.0
50%,364.5,65.0,75.0,70.0,65.0,70.0,65.0,3.0
75%,539.25,80.0,100.0,90.0,95.0,90.0,90.0,5.0
max,721.0,255.0,190.0,230.0,194.0,230.0,180.0,6.0


<font size="5">Sorting the data</font>

In [11]:
## To sort by alphabetical order, use the sort_values() function and pass in the column that you want to sort
df.sort_values('Name').head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
510,460,Abomasnow,Grass,Ice,90,92,75,92,85,60,4,False
511,460,AbomasnowMega Abomasnow,Grass,Ice,90,132,105,132,105,30,4,False
68,63,Abra,Psychic,,25,20,15,105,55,90,1,False
392,359,Absol,Dark,,65,130,60,75,60,75,3,False
393,359,AbsolMega Absol,Dark,,65,150,60,115,60,115,3,False


In [12]:
## To sort in reverese alphabeticl order, pass in the following
df.sort_values('Name', ascending=False).head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
794,718,Zygarde50% Forme,Dragon,Ground,108,100,121,81,95,95,6,True
695,634,Zweilous,Dark,Dragon,72,85,70,65,70,58,5,False
46,41,Zubat,Poison,Flying,40,45,35,30,40,55,1,False
631,570,Zorua,Dark,,40,65,40,80,40,65,5,False
632,571,Zoroark,Dark,,60,105,60,120,60,105,5,False


In [13]:
## You can also use pass in multiple columns. This will sort in order of columns passed
df.sort_values(['Type 1', 'Attack']).head(10)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
230,213,Shuckle,Bug,Rock,20,10,230,10,230,5,2,False
14,11,Metapod,Bug,,50,20,55,25,25,30,1,False
179,165,Ledyba,Bug,Flying,40,20,30,40,80,55,2,False
733,665,Spewpa,Bug,,45,22,60,27,30,29,6,False
17,14,Kakuna,Bug,Poison,45,25,50,25,25,35,1,False
446,401,Kricketot,Bug,,37,25,41,25,41,25,4,False
457,412,Burmy,Bug,,40,29,45,29,45,36,4,False
13,10,Caterpie,Bug,,45,30,35,20,20,45,1,False
307,283,Surskit,Bug,Water,40,30,32,50,52,65,3,False
462,415,Combee,Bug,Flying,30,30,42,30,42,70,4,False


In [14]:
## You can specify which columns you want ascending and decending
## 1 obviously means True while 0 means False
df.sort_values(['Name', 'Attack'], ascending=[1,0]).head(10)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
510,460,Abomasnow,Grass,Ice,90,92,75,92,85,60,4,False
511,460,AbomasnowMega Abomasnow,Grass,Ice,90,132,105,132,105,30,4,False
68,63,Abra,Psychic,,25,20,15,105,55,90,1,False
392,359,Absol,Dark,,65,130,60,75,60,75,3,False
393,359,AbsolMega Absol,Dark,,65,150,60,115,60,115,3,False
678,617,Accelgor,Bug,,80,70,40,100,60,145,5,False
750,681,AegislashBlade Forme,Steel,Ghost,60,150,50,150,50,60,6,False
751,681,AegislashShield Forme,Steel,Ghost,60,50,150,50,150,60,6,False
153,142,Aerodactyl,Rock,Flying,80,105,65,60,75,130,1,False
154,142,AerodactylMega Aerodactyl,Rock,Flying,80,135,85,70,95,150,1,False


<font size="6">Making Changes to the Data</font>

<font size="5">Adding a column</font>

In [15]:
## Let's say you want to add all the values in you data, this is how
df['Total'] = df['HP'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']

## Now we print it
df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False,318
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False,405
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False,525
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False,625
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False,309


<font size="5">Dropping a column</font>

In [24]:
## You can remove a column with the drop function
## In order for this to work, you are going to need to set it to a varible (which could be the same)
df = df.drop(columns=['Total'])

## Print to show it is gone
df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False


<font size="5">Moving a column</font>

In [25]:
## Adding it back the so we can move it
df['Total'] = df['HP'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']

df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False,318
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False,405
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False,525
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False,625
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False,309


In [26]:
## Making a new varible (with the same name) allows you to order the columns as ou please
df = df[['#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Total', 'Generation', 'Legendary']]

df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,318,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,405,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,525,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,309,1,False


<font size="6">Saving Data (Exporting)</font>

<font size="5">To CSV File</font>

In [27]:
## If it takes you to the next cell, that means it should have worked
df.to_csv('modified.csv')

In [28]:
## If you don't want to the indexes to be in the data, pass in the index=False argument
df.to_csv('modified.csv', index=False)

<font size="5">To Excel File</font>

In [16]:
## This will save the file to an Excel file
df.to_excel('modified.xlsx', index=False)

<font size="5">To TXT File</font>

In [None]:
## This will save the file to a text file
df.to_txt('modified.csv', index=False, sep='\t')

<font size="6">Filtering Data</font>

<font size="5">"And" sign</font>

In [None]:
## As shown earlier, you can get mulitple conditions when filtering data
df.loc[(df['Type 1'] == 'Grass') & (df['Legendary'] == 1)]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
550,492,ShayminLand Forme,Grass,,100,100,100,100,100,100,600,4,True
551,492,ShayminSky Forme,Grass,Flying,100,103,75,120,75,127,600,4,True
701,640,Virizion,Grass,Fighting,91,90,72,90,129,108,580,5,True


In [None]:
## Another example
df.loc[(df['Type 1'] == 'Fire') & (df['HP'] > 100) & (df['Speed'] < 75)]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
559,500,Emboar,Fire,Fighting,110,123,65,100,65,65,528,5,False
616,555,DarmanitanZen Mode,Fire,Psychic,105,30,105,140,105,55,540,5,False


<font size="5">"Or" sign</font>

In [17]:
## You can also us | as the "or" sign
df.loc[(df['Speed'] > 105) | (df['Defense'] > 100)].head(15)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False,625
7,6,CharizardMega Charizard X,Fire,Dragon,78,130,111,130,85,100,1,False,634
12,9,BlastoiseMega Blastoise,Water,,79,103,120,135,115,78,1,False,630
19,15,BeedrillMega Beedrill,Bug,Poison,65,150,40,15,80,145,1,False,495
23,18,PidgeotMega Pidgeot,Normal,Flying,83,80,80,135,80,121,1,False,579
31,26,Raichu,Electric,,60,90,55,90,80,110,1,False,485
33,28,Sandslash,Ground,,75,100,110,45,55,65,1,False,450
56,51,Dugtrio,Ground,,35,80,50,50,70,120,1,False,405
58,53,Persian,Normal,,65,70,60,65,65,115,1,False,440
70,65,Alakazam,Psychic,,55,50,45,135,95,120,1,False,500


<font size="5">Creating New Dataframe</font>

In [None]:
## You can simply create a new variable which will give you a new df
## Using the above table to create a new df
new_df = df.loc[(df['Speed'] > 105) | (df['Defense'] > 100)]

new_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,78,130,111,130,85,100,634,1,False
12,9,BlastoiseMega Blastoise,Water,,79,103,120,135,115,78,630,1,False
19,15,BeedrillMega Beedrill,Bug,Poison,65,150,40,15,80,145,495,1,False
23,18,PidgeotMega Pidgeot,Normal,Flying,83,80,80,135,80,121,579,1,False


In [None]:
## You can also save the new df to a csv at any point

#new_df.to_csv('filtered.csv')

<font size="5">Reseting Indexes</font>

In [None]:
## As you can see the new df kept the old indexes. We can reset them with this function

new_df = new_df.reset_index()

## Printing 
new_df.head()

Unnamed: 0,index,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,False
1,7,6,CharizardMega Charizard X,Fire,Dragon,78,130,111,130,85,100,634,1,False
2,12,9,BlastoiseMega Blastoise,Water,,79,103,120,135,115,78,630,1,False
3,19,15,BeedrillMega Beedrill,Bug,Poison,65,150,40,15,80,145,495,1,False
4,23,18,PidgeotMega Pidgeot,Normal,Flying,83,80,80,135,80,121,579,1,False


In [None]:
## It will save the old index as a new column but as shown earlier, we can get rid of it if we dont want it

new_df = new_df.drop(columns=['index'])

## Printing
new_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,False
1,6,CharizardMega Charizard X,Fire,Dragon,78,130,111,130,85,100,634,1,False
2,9,BlastoiseMega Blastoise,Water,,79,103,120,135,115,78,630,1,False
3,15,BeedrillMega Beedrill,Bug,Poison,65,150,40,15,80,145,495,1,False
4,18,PidgeotMega Pidgeot,Normal,Flying,83,80,80,135,80,121,579,1,False


In [None]:
## You can also do within the function itself. Let's see how

## Bringing up df
new_df = df.loc[(df['Speed'] > 105) | (df['Defense'] > 100)]

## Using new arguments in function
new_df.reset_index(drop=True, inplace=True)

# The "drop" argument removes the index column
# The "inplace" argument allows you to not create a new variable

## Printing
new_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,False
1,6,CharizardMega Charizard X,Fire,Dragon,78,130,111,130,85,100,634,1,False
2,9,BlastoiseMega Blastoise,Water,,79,103,120,135,115,78,630,1,False
3,15,BeedrillMega Beedrill,Bug,Poison,65,150,40,15,80,145,495,1,False
4,18,PidgeotMega Pidgeot,Normal,Flying,83,80,80,135,80,121,579,1,False


<font size="5">Name Conditions</font>

In [19]:
## You can filter names as well
## Showing the first 10 rows
df.loc[df['Name'].str.contains('Mega')]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False,625
7,6,CharizardMega Charizard X,Fire,Dragon,78,130,111,130,85,100,1,False,634
8,6,CharizardMega Charizard Y,Fire,Flying,78,104,78,159,115,100,1,False,634
12,9,BlastoiseMega Blastoise,Water,,79,103,120,135,115,78,1,False,630
19,15,BeedrillMega Beedrill,Bug,Poison,65,150,40,15,80,145,1,False,495
23,18,PidgeotMega Pidgeot,Normal,Flying,83,80,80,135,80,121,1,False,579
71,65,AlakazamMega Alakazam,Psychic,,55,50,65,175,95,150,1,False,590
87,80,SlowbroMega Slowbro,Water,Psychic,95,75,180,130,80,30,1,False,590
102,94,GengarMega Gengar,Ghost,Poison,60,65,80,170,95,130,1,False,600
124,115,KangaskhanMega Kangaskhan,Normal,,105,125,100,60,100,100,1,False,590


In [20]:
## To get the reverse of this (rows that do not contain 'Mega'), you add a squigly line
df.loc[~df['Name'].str.contains('Mega')]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False,318
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False,405
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False,525
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False,309
5,5,Charmeleon,Fire,,58,64,58,80,65,80,1,False,405
...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,718,Zygarde50% Forme,Dragon,Ground,108,100,121,81,95,95,6,True,600
795,719,Diancie,Rock,Fairy,50,100,150,100,150,50,6,True,600
797,720,HoopaHoopa Confined,Psychic,Ghost,80,110,60,150,130,70,6,True,600
798,720,HoopaHoopa Unbound,Psychic,Dark,80,160,60,170,130,80,6,True,680


In [40]:
## Going to import Regulary Expressions Library to find certain texual patterns
import re
df.loc[df['Type 1'].str.contains('Fire|Grass', regex=True)]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,318,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,405,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,525,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,309,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,667,Litleo,Fire,Normal,62,50,58,73,54,72,369,6,False
736,668,Pyroar,Fire,Normal,86,68,72,109,66,106,507,6,False
740,672,Skiddo,Grass,,66,65,48,62,57,52,350,6,False
741,673,Gogoat,Grass,,123,100,62,97,81,68,531,6,False


In [41]:
## You can ignore capitalization by passing in a flag argument
df.loc[df['Type 1'].str.contains('fire|grass', flags=re.I, regex=True)]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,318,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,405,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,525,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,309,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,667,Litleo,Fire,Normal,62,50,58,73,54,72,369,6,False
736,668,Pyroar,Fire,Normal,86,68,72,109,66,106,507,6,False
740,672,Skiddo,Grass,,66,65,48,62,57,52,350,6,False
741,673,Gogoat,Grass,,123,100,62,97,81,68,531,6,False


In [42]:
## In this example, it is saying to find rows in the name column that start with 'pi' (thats what the ^ is for), then the next letter can be anywhere from a-z, then the * means it can be 0 characters or more
df.loc[df['Name'].str.contains('^pi[a-z]*', flags=re.I, regex=True)]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
20,16,Pidgey,Normal,Flying,40,45,40,35,35,56,251,1,False
21,17,Pidgeotto,Normal,Flying,63,60,55,50,50,71,349,1,False
22,18,Pidgeot,Normal,Flying,83,80,75,70,70,101,479,1,False
23,18,PidgeotMega Pidgeot,Normal,Flying,83,80,80,135,80,121,579,1,False
30,25,Pikachu,Electric,,35,55,40,50,50,90,320,1,False
136,127,Pinsir,Bug,,65,125,100,55,70,85,500,1,False
137,127,PinsirMega Pinsir,Bug,Flying,65,155,120,65,90,105,600,1,False
186,172,Pichu,Electric,,20,40,15,35,35,60,205,2,False
219,204,Pineco,Bug,,50,65,90,35,35,15,290,2,False
239,221,Piloswine,Ice,Ground,100,100,80,60,60,50,450,2,False


<font size="5">Conditional Changes</font>

In [48]:
## Changing the name of certain attributes in the data
df.loc[df['Type 1'] == 'Fire', 'Type 1'] = 'Flamer'

## Printing an example of a fire type pokemon
df.loc[df['Name'] == 'Charmander']

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
4,4,Charmander,Flamer,,39,52,43,60,50,65,309,1,False


In [49]:
## Another example
df.loc[df['Type 1'] == 'Grass', 'Type 1'] = 'Leaf'

## Printing
df.head(5)

## As you can see, all of the grass type pokemon are now labeled as 'Leaf' in the Type 1 column

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,1,Bulbasaur,Leaf,Poison,45,49,49,65,65,45,318,1,False
1,2,Ivysaur,Leaf,Poison,60,62,63,80,80,60,405,1,False
2,3,Venusaur,Leaf,Poison,80,82,83,100,100,80,525,1,False
3,3,VenusaurMega Venusaur,Leaf,Poison,80,100,123,122,120,80,625,1,False
4,4,Charmander,Flamer,,39,52,43,60,50,65,309,1,False


In [50]:
## Changing back to the original df
df.loc[df['Type 1'] == 'Flamer', 'Type 1'] = 'Fire'
df.loc[df['Type 1'] == 'Leaf', 'Type 1'] = 'Grass'

## Printing
df

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,318,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,405,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,525,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,309,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,50,100,150,100,150,50,600,6,True
796,719,DiancieMega Diancie,Rock,Fairy,50,160,110,160,110,110,700,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,80,110,60,150,130,70,600,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,80,160,60,170,130,80,680,6,True


In [51]:
## You can also change a column while referencing another
## For example, let's make all of the fire type pokemon legendary
df.loc[df['Type 1'] == 'Fire', 'Legendary'] = True

## Printing all fire type pokemon
df.loc[df['Type 1'] == 'Fire']

## As you'll see, they are now all legendary

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
4,4,Charmander,Fire,,39,52,43,60,50,65,309,1,True
5,5,Charmeleon,Fire,,58,64,58,80,65,80,405,1,True
6,6,Charizard,Fire,Flying,78,84,78,109,85,100,534,1,True
7,6,CharizardMega Charizard X,Fire,Dragon,78,130,111,130,85,100,634,1,True
8,6,CharizardMega Charizard Y,Fire,Flying,78,104,78,159,115,100,634,1,True
42,37,Vulpix,Fire,,38,41,40,50,65,65,299,1,True
43,38,Ninetales,Fire,,73,76,75,81,100,100,505,1,True
63,58,Growlithe,Fire,,55,70,45,70,50,60,350,1,True
64,59,Arcanine,Fire,,90,110,80,100,80,95,555,1,True
83,77,Ponyta,Fire,,50,85,55,65,65,90,410,1,True


<font size="5">Aggregate Statistics (Group By)</font>

In [62]:
## Let's get the original df back with the Total column
new_df = pd.read_csv('pokemon_data.csv')
new_df['Total'] = new_df['HP'] + new_df['Attack'] + new_df['Defense'] + new_df['Sp. Atk'] + new_df['Sp. Def'] + new_df['Speed']
new_df = new_df[['#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Total', 'Generation', 'Legendary']]


## Printing to check
new_df

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,318,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,405,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,525,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,625,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,309,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,50,100,150,100,150,50,600,6,True
796,719,DiancieMega Diancie,Rock,Fairy,50,160,110,160,110,110,700,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,80,110,60,150,130,70,600,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,80,160,60,170,130,80,680,6,True


In [63]:
## Let's say you wanted to see the average of all the columns by each type of pokemon. Here's how
new_df.groupby(['Type 1']).mean()

  new_df.groupby(['Type 1']).mean()


Unnamed: 0_level_0,#,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
Type 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bug,334.492754,56.884058,70.971014,70.724638,53.869565,64.797101,61.681159,378.927536,3.217391,0.0
Dark,461.354839,66.806452,88.387097,70.225806,74.645161,69.516129,76.16129,445.741935,4.032258,0.064516
Dragon,474.375,83.3125,112.125,86.375,96.84375,88.84375,83.03125,550.53125,3.875,0.375
Electric,363.5,59.795455,69.090909,66.295455,90.022727,73.704545,84.5,443.409091,3.272727,0.090909
Fairy,449.529412,74.117647,61.529412,65.705882,78.529412,84.705882,48.588235,413.176471,4.117647,0.058824
Fighting,363.851852,69.851852,96.777778,65.925926,53.111111,64.703704,66.074074,416.444444,3.37037,0.0
Fire,327.403846,69.903846,84.769231,67.769231,88.980769,72.211538,74.442308,458.076923,3.211538,0.096154
Flying,677.75,70.75,78.75,66.25,94.25,72.5,102.5,485.0,5.5,0.5
Ghost,486.5,64.4375,73.78125,81.1875,79.34375,76.46875,64.34375,439.5625,4.1875,0.0625
Grass,344.871429,67.271429,73.214286,70.8,77.5,70.428571,61.928571,421.142857,3.357143,0.042857


In [65]:
## Now we can sort it using the sort function we learned earlier
new_df.groupby(['Type 1']).mean().sort_values('Defense')

  new_df.groupby(['Type 1']).mean().sort_values('Defense')


Unnamed: 0_level_0,#,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
Type 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Normal,319.173469,77.27551,73.469388,59.846939,55.816327,63.72449,71.55102,401.683673,3.05102,0.020408
Fairy,449.529412,74.117647,61.529412,65.705882,78.529412,84.705882,48.588235,413.176471,4.117647,0.058824
Fighting,363.851852,69.851852,96.777778,65.925926,53.111111,64.703704,66.074074,416.444444,3.37037,0.0
Flying,677.75,70.75,78.75,66.25,94.25,72.5,102.5,485.0,5.5,0.5
Electric,363.5,59.795455,69.090909,66.295455,90.022727,73.704545,84.5,443.409091,3.272727,0.090909
Psychic,380.807018,70.631579,71.45614,67.684211,98.403509,86.280702,81.491228,475.947368,3.385965,0.245614
Fire,327.403846,69.903846,84.769231,67.769231,88.980769,72.211538,74.442308,458.076923,3.211538,0.096154
Poison,251.785714,67.25,74.678571,68.821429,60.428571,64.392857,63.571429,399.142857,2.535714,0.0
Dark,461.354839,66.806452,88.387097,70.225806,74.645161,69.516129,76.16129,445.741935,4.032258,0.064516
Bug,334.492754,56.884058,70.971014,70.724638,53.869565,64.797101,61.681159,378.927536,3.217391,0.0


In [67]:
## We can flip it as well
new_df.groupby(['Type 1']).mean().sort_values('Defense', ascending=False)

  new_df.groupby(['Type 1']).mean().sort_values('Defense', ascending=False)


Unnamed: 0_level_0,#,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
Type 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Steel,442.851852,65.222222,92.703704,126.37037,67.518519,80.62963,55.259259,487.703704,3.851852,0.148148
Rock,392.727273,65.363636,92.863636,100.795455,63.340909,75.477273,55.909091,453.75,3.454545,0.090909
Dragon,474.375,83.3125,112.125,86.375,96.84375,88.84375,83.03125,550.53125,3.875,0.375
Ground,356.28125,73.78125,95.75,84.84375,56.46875,62.75,63.90625,437.5,3.15625,0.125
Ghost,486.5,64.4375,73.78125,81.1875,79.34375,76.46875,64.34375,439.5625,4.1875,0.0625
Water,303.089286,72.0625,74.151786,72.946429,74.8125,70.517857,65.964286,430.455357,2.857143,0.035714
Ice,423.541667,72.0,72.75,71.416667,77.541667,76.291667,63.458333,433.458333,3.541667,0.083333
Grass,344.871429,67.271429,73.214286,70.8,77.5,70.428571,61.928571,421.142857,3.357143,0.042857
Bug,334.492754,56.884058,70.971014,70.724638,53.869565,64.797101,61.681159,378.927536,3.217391,0.0
Dark,461.354839,66.806452,88.387097,70.225806,74.645161,69.516129,76.16129,445.741935,4.032258,0.064516


In [69]:
## Another example
new_df.groupby(['Type 1']).mean().sort_values('Attack', ascending=False)

  new_df.groupby(['Type 1']).mean().sort_values('Attack', ascending=False)


Unnamed: 0_level_0,#,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Total,Generation,Legendary
Type 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Dragon,474.375,83.3125,112.125,86.375,96.84375,88.84375,83.03125,550.53125,3.875,0.375
Fighting,363.851852,69.851852,96.777778,65.925926,53.111111,64.703704,66.074074,416.444444,3.37037,0.0
Ground,356.28125,73.78125,95.75,84.84375,56.46875,62.75,63.90625,437.5,3.15625,0.125
Rock,392.727273,65.363636,92.863636,100.795455,63.340909,75.477273,55.909091,453.75,3.454545,0.090909
Steel,442.851852,65.222222,92.703704,126.37037,67.518519,80.62963,55.259259,487.703704,3.851852,0.148148
Dark,461.354839,66.806452,88.387097,70.225806,74.645161,69.516129,76.16129,445.741935,4.032258,0.064516
Fire,327.403846,69.903846,84.769231,67.769231,88.980769,72.211538,74.442308,458.076923,3.211538,0.096154
Flying,677.75,70.75,78.75,66.25,94.25,72.5,102.5,485.0,5.5,0.5
Poison,251.785714,67.25,74.678571,68.821429,60.428571,64.392857,63.571429,399.142857,2.535714,0.0
Water,303.089286,72.0625,74.151786,72.946429,74.8125,70.517857,65.964286,430.455357,2.857143,0.035714


In [71]:
## You can also count how many of something is in each column
## First, we'll add a new count column
new_df['count'] = 1

## Next, we'll use the count function to count how many there are in each column
new_df.groupby(['Type 1']).count()['count']

## This will give you the number of each type 1 there are in the df

Type 1
Bug          69
Dark         31
Dragon       32
Electric     44
Fairy        17
Fighting     27
Fire         52
Flying        4
Ghost        32
Grass        70
Ground       32
Ice          24
Normal       98
Poison       28
Psychic      57
Rock         44
Steel        27
Water       112
Name: count, dtype: int64

In [72]:
## You are also able to group by multiple parameters
new_df.groupby(['Type 1', 'Type 2']).count()['count']

Type 1  Type 2  
Bug     Electric     2
        Fighting     2
        Fire         2
        Flying      14
        Ghost        1
                    ..
Water   Ice          3
        Poison       3
        Psychic      5
        Rock         4
        Steel        1
Name: count, Length: 136, dtype: int64

<font size="6">Working with large amounts of Data</font>

<font size="5">Loading in Chunks of Data at a time</font>

In [21]:
## Here's how to load in a certain amount of rows at a time (this example uses 5 rows)
for df in pd.read_csv('pokemon_data.csv', chunksize=5):
    print('CHUNK')
    print(df)