# Learning More About Pandas
Pandas: Python package for tabular data analysis


In [None]:
import pandas as pd

In [None]:
# build a plain Python list holding the even numbers 2 through 8
start_list = list(range(2, 10, 2))
start_list

[2, 4, 6, 8]

In [None]:
# wrap the list in a pandas Series; pandas attaches a default integer index
start_series = pd.Series(data=start_list)
start_series

0    2
1    4
2    6
3    8
dtype: int64

In [None]:
# get the series values as a NumPy array
# (to_numpy() is the accessor the pandas docs recommend over the older .values attribute)
start_series.to_numpy()

array([2, 4, 6, 8])

In [None]:
# get the index (the row labels) of the series; a Series built from a plain
# list gets a default RangeIndex, as the output shows
start_series.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
# select a single value with square brackets; the key is matched against the
# index, and with this default 0..3 index the label 0 is also the first position
start_series[0]

2

In [None]:
# slice out the first three values (the stop bound 3 is excluded)
first_three = start_series[0:3]
print(first_three)

# last value, looked up through its index key 3
print(start_series[3])

# last value again, this time by position counted from the end
start_series.iloc[-1]

0    2
1    4
2    6
dtype: int64
8


8

## Series as dictionaries

In [None]:
# map each character name to an integer value (insertion order is kept)
sw_chars = dict([
    ('Luke Skywalker', 70),
    ('R2-D2', 20),
    ('C3PO', 69),
])
sw_chars

{'Luke Skywalker': 70, 'R2-D2': 20, 'C3PO': 69}

In [None]:
# build a Series from the dictionary: keys become the index labels, values the data
dict_series = pd.Series(data=sw_chars)
dict_series

Luke Skywalker    70
R2-D2             20
C3PO              69
dtype: int64

In [None]:
# indexing as a dictionary with key-value pairs
# (the lookup is printed because only a cell's last expression is displayed
# automatically; a bare lookup on a non-final line would show nothing)
print(dict_series['R2-D2'])

# the index holds the dictionary's keys
dict_series.index

Index(['Luke Skywalker', 'R2-D2', 'C3PO'], dtype='object')

## Selection with indexers
`.iloc` selects by integer position; `.loc` selects by index label.

In [None]:
# .iloc looks values up by integer position, regardless of the index labels
print(dict_series.iloc[0])

# position-based slicing also goes through .iloc (the stop position 2 is excluded)
dict_series.iloc[0:2]

70


Luke Skywalker    70
R2-D2             20
dtype: int64

In [None]:
# .loc looks values up by index label (here, the dictionary key)
first_key = 'Luke Skywalker'
print(dict_series.loc[first_key])

# label slices work too; an open-ended slice runs through the last label
dict_series.loc[first_key:]

70


Luke Skywalker    70
R2-D2             20
C3PO              69
dtype: int64

In [None]:
# list the keys directly; for this Series, keys() returns its index labels
dict_series.keys()

Index(['Luke Skywalker', 'R2-D2', 'C3PO'], dtype='object')

# DataFrames

In [None]:
# read the Star Wars characters CSV (URL pinned to a specific commit) into a DataFrame
STARWARS_CSV_URL = 'https://raw.githubusercontent.com/tidyverse/dplyr/cf8031d00f406c6dc5d483d7e9e34639df797b81/data-raw/starwars.csv'
sw_df = pd.read_csv(STARWARS_CSV_URL)

In [None]:
# preview the first 5 rows
sw_df.head(n=5)

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,films,vehicles,starships
0,Luke Skywalker,172.0,77.0,blond,fair,blue,19.0,male,masculine,Tatooine,Human,"The Empire Strikes Back, Revenge of the Sith, ...","Snowspeeder, Imperial Speeder Bike","X-wing, Imperial shuttle"
1,C-3PO,167.0,75.0,,gold,yellow,112.0,none,masculine,Tatooine,Droid,"The Empire Strikes Back, Attack of the Clones,...",,
2,R2-D2,96.0,32.0,,"white, blue",red,33.0,none,masculine,Naboo,Droid,"The Empire Strikes Back, Attack of the Clones,...",,
3,Darth Vader,202.0,136.0,none,white,yellow,41.9,male,masculine,Tatooine,Human,"The Empire Strikes Back, Revenge of the Sith, ...",,TIE Advanced x1
4,Leia Organa,150.0,49.0,brown,light,brown,19.0,female,feminine,Alderaan,Human,"The Empire Strikes Back, Revenge of the Sith, ...",Imperial Speeder Bike,


In [None]:
# preview the last 5 rows
sw_df.tail(n=5)

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,films,vehicles,starships
82,Rey,,,brown,light,hazel,,female,feminine,,Human,The Force Awakens,,
83,Poe Dameron,,,brown,light,brown,,male,masculine,,Human,The Force Awakens,,T-70 X-wing fighter
84,BB8,,,none,none,black,,none,masculine,,Droid,The Force Awakens,,
85,Captain Phasma,,,unknown,unknown,unknown,,,,,,The Force Awakens,,
86,Padmé Amidala,165.0,45.0,brown,light,brown,46.0,female,feminine,Naboo,Human,"Attack of the Clones, The Phantom Menace, Reve...",,"H-type Nubian yacht, Naboo star skiff, Naboo f..."


In [None]:
# summarize the frame: per-column dtype and non-null count
# (a non-null count below the number of entries means the column has missing values)
sw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        87 non-null     object 
 1   height      81 non-null     float64
 2   mass        59 non-null     float64
 3   hair_color  82 non-null     object 
 4   skin_color  87 non-null     object 
 5   eye_color   87 non-null     object 
 6   birth_year  43 non-null     float64
 7   sex         83 non-null     object 
 8   gender      83 non-null     object 
 9   homeworld   77 non-null     object 
 10  species     83 non-null     object 
 11  films       87 non-null     object 
 12  vehicles    11 non-null     object 
 13  starships   20 non-null     object 
dtypes: float64(3), object(11)
memory usage: 9.6+ KB


In [None]:
# shape is a (rows, columns) tuple
print(sw_df.shape)

# unpack it into separate row and column counts
no_rows, no_cols = sw_df.shape
print(no_rows, no_cols)

(87, 14)
87 14


In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
sw_df.describe()

Unnamed: 0,height,mass,birth_year
count,81.0,59.0,43.0
mean,174.358025,97.311864,87.565116
std,34.770429,169.457163,154.691439
min,66.0,15.0,8.0
25%,167.0,55.6,35.0
50%,180.0,79.0,52.0
75%,191.0,84.5,72.0
max,264.0,1358.0,896.0


In [None]:
# display the DataFrame itself; the rendered output elides the middle rows with `...`
sw_df

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,films,vehicles,starships
0,Luke Skywalker,172.0,77.0,blond,fair,blue,19.0,male,masculine,Tatooine,Human,"The Empire Strikes Back, Revenge of the Sith, ...","Snowspeeder, Imperial Speeder Bike","X-wing, Imperial shuttle"
1,C-3PO,167.0,75.0,,gold,yellow,112.0,none,masculine,Tatooine,Droid,"The Empire Strikes Back, Attack of the Clones,...",,
2,R2-D2,96.0,32.0,,"white, blue",red,33.0,none,masculine,Naboo,Droid,"The Empire Strikes Back, Attack of the Clones,...",,
3,Darth Vader,202.0,136.0,none,white,yellow,41.9,male,masculine,Tatooine,Human,"The Empire Strikes Back, Revenge of the Sith, ...",,TIE Advanced x1
4,Leia Organa,150.0,49.0,brown,light,brown,19.0,female,feminine,Alderaan,Human,"The Empire Strikes Back, Revenge of the Sith, ...",Imperial Speeder Bike,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,Rey,,,brown,light,hazel,,female,feminine,,Human,The Force Awakens,,
83,Poe Dameron,,,brown,light,brown,,male,masculine,,Human,The Force Awakens,,T-70 X-wing fighter
84,BB8,,,none,none,black,,none,masculine,,Droid,The Force Awakens,,
85,Captain Phasma,,,unknown,unknown,unknown,,,,,,The Force Awakens,,


# Grammar of Data Manipulation
## `select`
Choosing one or more columns


In [None]:
# who is the oldest character? start by pulling out the birth_year column;
# selecting a single column by name yields a Series
column_name = 'birth_year'
by_col = sw_df[column_name]
type(by_col)

pandas.core.series.Series

In [None]:
# largest birth_year in the column; missing values are skipped (skipna=True is the default)
oldest_character_age = by_col.max(skipna=True)
oldest_character_age

896.0

In [None]:
# count how often each hair color occurs, most frequent first
# (dropna=True is the default, so rows with a missing hair_color are not counted)
sw_df['hair_color'].value_counts(dropna=True)

none             37
brown            18
black            13
white             4
blond             3
brown, grey       1
auburn, white     1
auburn, grey      1
grey              1
auburn            1
blonde            1
unknown           1
Name: hair_color, dtype: int64

In [None]:
# .iloc takes (row position, column position)
row_pos, name_pos, height_pos = 0, 0, 1

# row 0, column 0 -> the first character's name
print(sw_df.iloc[row_pos, name_pos])

# row 0, column 1 -> the first character's height
sw_df.iloc[row_pos, height_pos]

Luke Skywalker


172.0

In [None]:
# list the column labels
sw_df.columns

Index(['name', 'height', 'mass', 'hair_color', 'skin_color', 'eye_color',
       'birth_year', 'sex', 'gender', 'homeworld', 'species', 'films',
       'vehicles', 'starships'],
      dtype='object')

In [None]:
# .loc takes (row label, column label)
row_label, column_label = 0, 'height'
print(sw_df.loc[row_label, column_label])

172.0


In [None]:
# the row labels: a default RangeIndex, as the output shows
sw_df.index

RangeIndex(start=0, stop=87, step=1)

In [None]:
# a single row selected with .iloc comes back as a Series
first_row = sw_df.iloc[0, :]
type(first_row)

pandas.core.series.Series