# Pandas.

Pandas is built on top of NumPy.

In [4]:
import pandas as pd
import numpy as np

## Series

In [4]:
# Create a Series from a list.
# By default the index runs from 0 to N-1, where N is the number of elements.
s = pd.Series([1, 2, 3, 4])
# End the cell with the variable so the Series is displayed
s

0    1
1    2
2    3
3    4
dtype: int64

In [5]:
# Array representation: the Series' data as a NumPy array
s.values

array([1, 2, 3, 4])

In [6]:
# Index representation: the labels attached to the values (a RangeIndex by default)
s.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Create a Series with custom index labels instead of the default 0..N-1
s2 = pd.Series([1, 2, 3, 4], index=list('abcd'))
s2

a    1
b    2
c    3
d    4
dtype: int64

In [9]:
# Assign a new value to a single element, selected by its index label
s2['a'] = 0
s2

a    0
b    2
c    3
d    4
dtype: int64

In [10]:
# Assign new values to several elements at once, selected by a list of index labels
s2[['c', 'd']] = [2,3]
s2

a    0
b    2
c    2
d    3
dtype: int64

In [11]:
# Boolean filtering: keep only the elements whose value is greater than 0
s2[s2 > 0]

b    2
c    2
d    3
dtype: int64

In [12]:
# Multiplication by a scalar is applied element-wise
s2*2

a    0
b    4
c    4
d    6
dtype: int64

In [13]:
# Math functions: NumPy functions are applied element-wise and the index is kept.
# numpy is already imported as np in the first cell, so it is not imported again here.
# Exponential
np.exp(s2)

a     1.000000
b     7.389056
c     7.389056
d    20.085537
dtype: float64

### Dict to Series

In [18]:
# Map each person's name to their age
data = {
    'Bob': 22,
    'Alice': 21,
    'John': 30,
    'Bill': 27,
}
# Build a Series from the dict: the keys become the index, the values the data
s3 = pd.Series(data)
s3

Bob      22
Alice    21
John     30
Bill     27
dtype: int64

In [19]:
# Pass an explicit list of labels as the index: only those labels are kept, in that order.
# 'Mike' is not a key of data, so its value is NaN; 'Bill' is not in names, so it is left out.
names = ['Bob', 'John', 'Alice', 'Mike']
s4 = pd.Series(data, index=names)
s4

Bob      22.0
John     30.0
Alice    21.0
Mike      NaN
dtype: float64

In [20]:
# 'Mike' has no entry in the data, so its value is NaN.
# Flag the missing values: True wherever the value is NaN
missing_val = s4.isnull()
missing_val

Bob      False
John     False
Alice    False
Mike      True
dtype: bool

In [21]:
# Conversely, flag the values that are present: True wherever the value is not NaN
not_null = s4.notnull()
not_null

Bob       True
John      True
Alice     True
Mike     False
dtype: bool

## DataFrame

In [104]:
# Column-oriented data: each key is a column name, each list holds that column's values
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
# Build a DataFrame from the dict; the rows get the default 0..N-1 index
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [105]:
# Build the DataFrame with an explicit column order via the columns argument
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [106]:
# Use custom row labels instead of the default 0..N-1 index
frame2 = pd.DataFrame(data,
                      columns=['year', 'state', 'pop'],
                      index=['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop
one,2000,Ohio,1.5
two,2001,Ohio,1.7
three,2002,Ohio,3.6
four,2001,Nevada,2.4
five,2002,Nevada,2.9


In [107]:
# Request a column ('debt') that is not present in data:
# the column is created and filled with NaN because there are no values for it
frame3 = pd.DataFrame(data,
                      columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five'])
frame3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [108]:
# Get a column from the DataFrame by name; the result is a Series
frame3['year']

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [109]:
# Another way to get a column: attribute access
frame3.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [110]:
# Get the first row of the DataFrame by position
frame3.iloc[0]

year     2000
state    Ohio
pop       1.5
debt      NaN
Name: one, dtype: object

In [111]:
# Get the last row of the DataFrame by position
frame3.iloc[-1]

year       2002
state    Nevada
pop         2.9
debt        NaN
Name: five, dtype: object

In [112]:
# Get every row except the first (positional slice starting at 1)
frame3.iloc[1:]

Unnamed: 0,year,state,pop,debt
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [113]:
# Assign a scalar to the "debt" column: the value is repeated for every row
frame3['debt'] = 99
frame3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,99
two,2001,Ohio,1.7,99
three,2002,Ohio,3.6,99
four,2001,Nevada,2.4,99
five,2002,Nevada,2.9,99


In [114]:
# Assign an array to the "debt" column: its length (5) must match the number of rows.
# NOTE(review): no random seed is set in this cell, so the values presumably differ on every run
frame3['debt'] = np.random.randint(1000,5000,5)
frame3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,3867
two,2001,Ohio,1.7,4230
three,2002,Ohio,3.6,3067
four,2001,Nevada,2.4,4433
five,2002,Nevada,2.9,4699


In [115]:
# Assign a Series to a column: values are aligned on the index labels,
# and rows whose label is not in the Series ('one', 'two', 'three') get NaN
vals = pd.Series([0, 2000], index=['four', 'five'])
frame3['debt'] = vals
frame3

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,0.0
five,2002,Nevada,2.9,2000.0


In [116]:
# Add a boolean column: True where debt is greater than 0
# (rows where debt is NaN compare as False)
frame3['hightDebt'] = frame3.debt > 0
frame3

Unnamed: 0,year,state,pop,debt,hightDebt
one,2000,Ohio,1.5,,False
two,2001,Ohio,1.7,,False
three,2002,Ohio,3.6,,False
four,2001,Nevada,2.4,0.0,False
five,2002,Nevada,2.9,2000.0,True


In [121]:
# Remove the boolean column; drop() returns a new DataFrame, stored here as frame4
frame4 = frame3.drop(columns = ['hightDebt'])
frame4

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,0.0
five,2002,Nevada,2.9,2000.0


## Index Objects

In [13]:
# Build an Index object to use as the labels of the Series.
# The labels are 'One', 'Two', 'Three': the following cells test for the
# absence of 'Four' and reindex to add it.
index = pd.Index(['One', 'Two', 'Three'])
# Create a Series whose index is that Index object
obj = pd.Series([10, 20, 30], index=index)
# Check whether the Series' index is the very same object as `index`
# (an identity check, not a comparison of names or values)
obj.index is index

True

In [18]:
# Check whether a label is present in the index
# (only the value of the last expression in the cell is displayed)
'One' in obj.index # True
'Four' in obj.index # False

False

In [22]:
# Reindex: conform the Series to a new list of labels and return the result
obj.reindex(['One', 'Two', 'Three', 'Four'], fill_value=0) # fill_value = value used for labels that are not in obj

One      10
Two      20
Three    30
Four      0
dtype: int64

In [41]:
# New DataFrame with explicit row labels and column names, filled with 0..15
index = ['row1', 'row2', 'row3', 'row4']
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=index,
    columns=['col' + str(n) for n in range(1, 5)],
)
data

Unnamed: 0,col1,col2,col3,col4
row1,0,1,2,3
row2,4,5,6,7
row3,8,9,10,11
row4,12,13,14,15


In [44]:
# Drop a row by label (index= is the keyword form of axis=0)
new_obj = data.drop(index=['row1'])
new_obj

Unnamed: 0,col1,col2,col3,col4
row2,4,5,6,7
row3,8,9,10,11
row4,12,13,14,15


In [45]:
# Drop a column by name (columns= is the keyword form of axis=1)
new_obj = data.drop(columns=['col3'])
new_obj

Unnamed: 0,col1,col2,col4
row1,0,1,3
row2,4,5,7
row3,8,9,11
row4,12,13,15


## Indexing, selection and filtering

In [None]:
# Indexing a Series: build one with the values 0..3, labelled by `index`
# (only the value of the last expression in the cell is displayed)
obj3 = pd.Series(np.arange(4), index=index)
obj3['row1'] # 0
obj3['row2'] # 1
obj3[1:3]  # integer slice: presumably positional here, since the labels are strings — confirm

In [66]:
# Filtering: keep only the elements whose value is greater than 1
new_obj = obj3[obj3 > 1]
new_obj


row3    2
row4    3
dtype: int64

## Function application and mapping

In [94]:
# Create a DataFrame of random values; the row labels are deliberately out of order.
# No seed is set in this cell, so the numbers are not fixed by the code shown here.
index = ['row2', 'row1', 'row4', 'row3']
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('dbc'), index=index)


def f_square(x):
    """Return x squared; applied to a column (Series), it squares every element."""
    return x**2


# apply() calls f_square on each column and returns the results as a new DataFrame
frame.apply(f_square)

Unnamed: 0,d,b,c
row2,0.14125,1.463325,0.517076
row1,0.887022,1.270036,0.700803
row4,1.94786,1.030065,1.983789
row3,0.023585,0.000667,0.379169


## Sorting

In [95]:
# Sort the rows by index label; the sorted result is returned and displayed, not assigned
frame.sort_index()

Unnamed: 0,d,b,c
row1,0.941819,1.126959,-0.83714
row2,-0.375832,1.20968,-0.71908
row3,-0.153574,-0.025827,-0.615767
row4,-1.395658,1.014921,-1.408471


## Handling Missing Data 

In [97]:
# A Series of fruit names in which two entries are missing (NaN)
data = pd.Series(
    ['Banana', np.nan, 'Orange', 'Apple', np.nan]
)
data

0    Banana
1       NaN
2    Orange
3     Apple
4       NaN
dtype: object

In [99]:
# Return a boolean Series that is True wherever a value is missing
data.isnull()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [102]:
# Filter out the missing values; the remaining entries keep their original index labels
clean_data = data.dropna()
clean_data

0    Banana
2    Orange
3     Apple
dtype: object

In [104]:
# Fill in the missing values with a replacement (0 here); the filled result is displayed, not assigned
data.fillna(0)

0    Banana
1         0
2    Orange
3     Apple
4         0
dtype: object