# Pandas learning

---

In [27]:
import matplotlib.pyplot as plt
import pandas as pd

##### First, learn how to **create** a simple DataFrame from a list of elements and **save** it as a CSV file.

In [28]:
# Toy dataset: each baby name is paired with its number of births.
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
births = [968, 155, 77, 578, 973]
# Materialise the (name, births) pairs as a list of tuples.
baby_dataset = [(name, count) for name, count in zip(names, births)]
print('baby_dataset: ', baby_dataset)

baby_dataset:  [('Bob', 968), ('Jessica', 155), ('Mary', 77), ('John', 578), ('Mel', 973)]


In [29]:
# Turn the list of (name, births) tuples into a two-column table;
# `columns` supplies the labels for the tuple positions.
df = pd.DataFrame(baby_dataset, columns=['Names', 'Births'])
df

Unnamed: 0,Names,Births
0,Bob,968
1,Jessica,155
2,Mary,77
3,John,578
4,Mel,973


In [30]:
# Write the table to disk. index=False drops the row-index column and
# header=False drops the column-name line, so the file holds only the raw
# name,births rows (reading it back needs header=None or explicit names).
df.to_csv(path_or_buf='births1880.csv', header=False, index=False)

---

##### Now take a different DataFrame and **read** it from a CSV file. We can do some interesting things with it.

In [31]:
# Load the Pokemon dataset from a comma-separated file.
#
# Other readers / options exist for other formats:
#   - Excel workbook:
#       pd.read_excel('pokemon_data.xlsx')
#   - File whose fields are separated by another character, e.g. a tab (\t):
#       pd.read_csv('pokemon_data.csv', delimiter='\t')
df = pd.read_csv(filepath_or_buffer='pokemon_data.csv')
df


Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,80,160,60,170,130,80,6,True


##### Use the **.head()** and **.tail()** methods to preview the first and last rows.
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.tail.html

In [32]:
# Show only the first two rows.
df.head(n=2)


Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False


In [33]:
# Show only the last two rows.
df.tail(n=2)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
798,720,HoopaHoopa Unbound,Psychic,Dark,80,160,60,170,130,80,6,True
799,721,Volcanion,Fire,Water,80,110,120,130,90,70,6,True


---

# Reading data in DataFrames.

In [34]:
# The column labels live in an Index object; print it together with its type.
print(
    df.columns,
    '\n type: ',
    type(df.columns),
)

Index(['#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk',
       'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object') 
 type:  <class 'pandas.core.indexes.base.Index'>


In [35]:
# Select a specific column by its label; the result is a Series.
print(
    df['Name'],
    '\n',
    type(df['Name']),
)

0                  Bulbasaur
1                    Ivysaur
2                   Venusaur
3      VenusaurMega Venusaur
4                 Charmander
               ...          
795                  Diancie
796      DiancieMega Diancie
797      HoopaHoopa Confined
798       HoopaHoopa Unbound
799                Volcanion
Name: Name, Length: 800, dtype: object 
 <class 'pandas.core.series.Series'>


In [36]:
# Because df['<column>'] returns a Series, Series methods such as .tail() apply.
df['Name'].tail(n=2)

798    HoopaHoopa Unbound
799             Volcanion
Name: Name, dtype: object

In [37]:
# A Series can also be sliced with Python's list-style slice notation.
df['Name'][:5]

0                Bulbasaur
1                  Ivysaur
2                 Venusaur
3    VenusaurMega Venusaur
4               Charmander
Name: Name, dtype: object

In [38]:
# Attribute (dot) access is a shortcut for df['Name']. It only works when the
# column name is a valid Python identifier, so e.g. 'Type 1' needs brackets.
df.Name[:5]

0                Bulbasaur
1                  Ivysaur
2                 Venusaur
3    VenusaurMega Venusaur
4               Charmander
Name: Name, dtype: object

In [39]:
# Select several columns at once by passing a list of column names
# (hence the double brackets). The result is a DataFrame, not a Series.
df[['Name', 'HP', 'Type 1']]

Unnamed: 0,Name,HP,Type 1
0,Bulbasaur,45,Grass
1,Ivysaur,60,Grass
2,Venusaur,80,Grass
3,VenusaurMega Venusaur,80,Grass
4,Charmander,39,Fire
...,...,...,...
795,Diancie,50,Rock
796,DiancieMega Diancie,50,Rock
797,HoopaHoopa Confined,80,Psychic
798,HoopaHoopa Unbound,80,Psychic


##### If you want to read a specific row, use the position-based **.iloc** indexer.
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html

In [40]:
# A single integer position returns that row as a Series
# (the column names become the Series index).
print(
    df.iloc[0],
    '\n',
    type(df.iloc[0]),
)

#                     1
Name          Bulbasaur
Type 1            Grass
Type 2           Poison
HP                   45
Attack               49
Defense              49
Sp. Atk              65
Sp. Def              65
Speed                45
Generation            1
Legendary         False
Name: 0, dtype: object 
 <class 'pandas.core.series.Series'>


In [41]:
# A slice of positions, by contrast, returns a DataFrame (here: rows 0, 1 and 2).
df.iloc[:3]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False


In [42]:
# iloc also accepts a (rows, columns) pair of positional slices:
# rows 0 to 2 and columns 1, 2, 3, 4 (the end of each slice is excluded).
df.iloc[:3, 1:5]

Unnamed: 0,Name,Type 1,Type 2,HP
0,Bulbasaur,Grass,Poison,45
1,Ivysaur,Grass,Poison,60
2,Venusaur,Grass,Poison,80


In [43]:
# .loc selects by index *label*; a list of labels returns those rows as a DataFrame.
# It can do much more than this, so check the reference:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html
df.loc[[0, 1], :]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False


##### You can get a specific cell in several ways...
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.at.html#pandas.DataFrame.at

In [44]:
# Position-based lookup of a single cell: row position 1, column position 4.
# Counting from 0, column 4 is 'HP', so this is the HP value of the second row.
df.iloc[1, 4]


60

In [45]:
# The same cell, addressed by labels instead: index label 1 and column name 'HP'.
df.at[1, 'HP']

60

In [46]:
# Rows can also be visited one at a time: iterrows() yields (index, row) pairs,
# where each row is a Series keyed by column name. Handy for a quick look;
# prefer vectorised operations for real work on large frames.
for index, row in df[:5].iterrows():
    print(index, row['Name'])


0 Bulbasaur
1 Ivysaur
2 Venusaur
3 VenusaurMega Venusaur
4 Charmander


In [47]:
# Select rows by condition: the comparison builds a boolean mask and .loc keeps
# the rows where it is True. The slice then takes the 11th-15th matches by position.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html
df.loc[df['Type 1'] == 'Fire'].iloc[10:15]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
84,78,Rapidash,Fire,,65,100,70,80,80,105,1,False
135,126,Magmar,Fire,,65,95,57,100,85,93,1,False
147,136,Flareon,Fire,,65,130,60,95,110,65,1,False
158,146,Moltres,Fire,Flying,90,100,90,125,85,90,1,True
169,155,Cyndaquil,Fire,,39,52,43,60,50,65,2,False


### Sorting

In [48]:
# Sort by Name (ascending) and break ties by HP (descending).
# This returns a new, sorted DataFrame; `df` itself is left unchanged.
df.sort_values(by=['Name', 'HP'], ascending=[True, False])

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
510,460,Abomasnow,Grass,Ice,90,92,75,92,85,60,4,False
511,460,AbomasnowMega Abomasnow,Grass,Ice,90,132,105,132,105,30,4,False
68,63,Abra,Psychic,,25,20,15,105,55,90,1,False
392,359,Absol,Dark,,65,130,60,75,60,75,3,False
393,359,AbsolMega Absol,Dark,,65,150,60,115,60,115,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...
632,571,Zoroark,Dark,,60,105,60,120,60,105,5,False
631,570,Zorua,Dark,,40,65,40,80,40,65,5,False
46,41,Zubat,Poison,Flying,40,45,35,30,40,55,1,False
695,634,Zweilous,Dark,Dragon,72,85,70,65,70,58,5,False


### A little description of the data

In [49]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,#,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,362.81375,69.25875,79.00125,73.8425,72.82,71.9025,68.2775,3.32375
std,208.343798,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474,1.66129
min,1.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0
25%,184.75,50.0,55.0,50.0,49.75,50.0,45.0,2.0
50%,364.5,65.0,75.0,70.0,65.0,70.0,65.0,3.0
75%,539.25,80.0,100.0,90.0,95.0,90.0,90.0,5.0
max,721.0,255.0,190.0,230.0,194.0,230.0,180.0,6.0
