# Studying and applying pandas with the book *Python Data Science Handbook* by Jake VanderPlas

## Pandas Series Object


In [1]:
import pandas as pd
import numpy as np

In [2]:
# A Series built from a plain list gets a default integer index.
data = pd.Series([0, 0.25, 0.5, 0.75, 1])
data

0    0.00
1    0.25
2    0.50
3    0.75
4    1.00
dtype: float64

In [3]:
data.values

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [4]:
data[3]

0.75

In [5]:
data[1:4]

1    0.25
2    0.50
3    0.75
dtype: float64

In [6]:
# Use strings instead of the default integers as index labels.
data = pd.Series(
    [0, 0.25, 0.5, 0.75, 1],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

a    0.00
b    0.25
c    0.50
d    0.75
e    1.00
dtype: float64

In [7]:
data['b']

0.25

In [8]:
# Build a Series straight from a dict: the keys become the index labels.
states = {
    'California': 324375464353,
    'Texas': 323452323432,
    'New York': 212435090453,
    'Illinois': 89456436135466,
    'Florida': 76765543754765,
}

population = pd.Series(states)
population

California      324375464353
Texas           323452323432
New York        212435090453
Illinois      89456436135466
Florida       76765543754765
dtype: int64

In [9]:
population['California']

324375464353

In [10]:
# Label-based slice: the stop label 'Illinois' is included in the result.
population['Texas': 'Illinois']

Texas         323452323432
New York      212435090453
Illinois    89456436135466
dtype: int64

In [11]:
# Plain dict mapping each state name to its `area` value.
area = {
    'California': 12456,
    'Texas': 647532,
    'New York': 9874747,
    'Illinois': 341324,
    'Florida': 7461213,
}
area

{'California': 12456,
 'Florida': 7461213,
 'Illinois': 341324,
 'New York': 9874747,
 'Texas': 647532}

In [12]:
# Combine the `population` Series and the `area` dict into a single
# two-dimensional object: each one becomes a column of the DataFrame.
organize = pd.DataFrame({'population': population, 'area': area})
organize

Unnamed: 0,population,area
California,324375464353,12456
Texas,323452323432,647532
New York,212435090453,9874747
Illinois,89456436135466,341324
Florida,76765543754765,7461213


In [13]:
organize.index

Index(['California', 'Texas', 'New York', 'Illinois', 'Florida'], dtype='object')

In [14]:
organize.columns

Index(['population', 'area'], dtype='object')

In [15]:
organize['area']

California      12456
Texas          647532
New York      9874747
Illinois       341324
Florida       7461213
Name: area, dtype: int64

In [16]:
# NOTE(review): builds the same frame as `organize` above; the result is only displayed, not stored.
pd.DataFrame({'population': population, 'area':area})

Unnamed: 0,population,area
California,324375464353,12456
Texas,323452323432,647532
New York,212435090453,9874747
Illinois,89456436135466,341324
Florida,76765543754765,7461213


In [17]:
# NOTE: `ind` is a Series, not a pd.Index, so (unlike an Index) its values
# can be modified after creation.
ind = pd.Series([2, 3, 4, 5, 66])
ind

0     2
1     3
2     4
3     5
4    66
dtype: int64

In [18]:
# Overwrite the first element in place (`ind` is a Series, so this is allowed).
ind[0] = 11
ind

0    11
1     3
2     4
3     5
4    66
dtype: int64

In [19]:
ind[::2]

0    11
2     4
4    66
dtype: int64

In [20]:
# Intersection: labels present in both indexes.
# Use the explicit set method: the `&` operator form of Index set operations
# was deprecated in pandas 1.x and no longer means "intersection" in pandas 2.x.
indA = pd.Index([1, 2, 3, 4, 5])
indB = pd.Index([2, 2, 3, 4, 12])

indA.intersection(indB)

Int64Index([2, 2, 3, 4], dtype='int64')

In [21]:
# Union: labels present in either index (explicit method instead of the
# deprecated `|` operator form).
indA.union(indB)

Int64Index([1, 2, 2, 3, 4, 5, 12], dtype='int64')

In [22]:
# Symmetric difference: labels present in exactly one of the two indexes
# (explicit method instead of the deprecated `^` operator form).
indA.symmetric_difference(indB)

Int64Index([1, 5, 12], dtype='int64')

## Data Indexing and Selection

In [23]:
# Integer values addressed by single-letter string labels.
data = pd.Series(
    [1, 2, 3, 4, 5],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [24]:
'a' in data

True

In [25]:
data.keys()

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [26]:
list(data.items())

[('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)]

In [27]:
# A Series can be extended like a dict: assigning to a new label adds an entry.
data['f'] = 6
data

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [28]:
# Slicing by explicit index labels: the stop label 'e' is included.
data['b': 'e']

b    2
c    3
d    4
e    5
dtype: int64

In [29]:
# Slicing by implicit integer position: the stop position 6 is excluded.
data[3:6]

d    4
e    5
f    6
dtype: int64

In [30]:
data



a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

In [31]:
data[1:3]

b    2
c    3
dtype: int64

In [32]:
data.loc['b':'d']

b    2
c    3
d    4
dtype: int64

In [33]:
data.iloc[1]

2

In [34]:
data.iloc[2:4]

c    3
d    4
dtype: int64

In [35]:
# Organize two Series that share the same labels as columns of one DataFrame.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})

data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [36]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [37]:
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [38]:
data.pop # avoid attribute access here: `pop` resolves to the DataFrame.pop method, not the 'pop' column


<bound method DataFrame.pop of               area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135>

In [39]:
data['pop']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [40]:
# Check whether attribute-style access and dictionary-style access hand back the very same object.
data.area is data['area']

True

In [41]:
# Add a new column computed element-wise from two existing columns.
data['density'] = data['pop']/data['area']

data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [42]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [43]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [44]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [45]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [46]:
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [47]:
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [48]:
data.loc[data.density > 100,['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [49]:
# Overwrite the cell at row 0, column 2 (California's density) in place.
data.iloc[0, 2] = 99

data

Unnamed: 0,area,pop,density
California,423967,38332521,99.0
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [50]:
# Overwrite the cell at row 1, column 2 (Texas's density) in place.
data.iloc[1, 2] = 200
data

Unnamed: 0,area,pop,density
California,423967,38332521,99.0
Texas,695662,26448193,200.0
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [51]:
data[data.density > 100]

Unnamed: 0,area,pop,density
Texas,695662,26448193,200.0
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


## Operating on Data in Pandas

In [52]:
import pandas as pd
import numpy as np

In [53]:
# Seeded generator, so the "random" draws are the same on every run.
rng = np.random.RandomState(42)
# Five integers drawn from [0, 11).
ser = pd.Series(rng.randint(0, 11, 5))

ser

0     6
1     3
2    10
3     7
4     4
dtype: int64

In [54]:
# 3x4 frame of random integers in [0, 10) with columns A-D.
df = pd.DataFrame(
    rng.randint(0, 10, (3, 4)),
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [55]:
np.exp(ser)   #calculate exponential

0      403.428793
1       20.085537
2    22026.465795
3     1096.633158
4       54.598150
dtype: float64

In [56]:
np.sin(df*np.pi/4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [57]:
# Element-wise division; the two Series are aligned on their index labels.
# NOTE(review): `area` here is the Series assigned in the DataFrame section
# above, not the dict from the start of the notebook.
population / area

California    7.650960e+05
Florida       4.507348e+08
Illinois      5.963961e+08
New York      1.503465e+06
Texas         4.649561e+05
dtype: float64

In [58]:
area.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [59]:
# Two Series with only partially overlapping integer labels.
A = pd.Series([2, 3, 5], index=[1, 3, 7])
B = pd.Series([3, 4, 5], index=[1, 5, 7])

# Labels found in only one operand come out as NaN.
A + B

1     5.0
3     NaN
5     NaN
7    10.0
dtype: float64

In [60]:
# Treat a value missing from either operand as 0 instead of producing NaN.
A.add(B, fill_value=0)

1     5.0
3     3.0
5     4.0
7    10.0
dtype: float64

In [61]:
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns = list('AB'))
A

Unnamed: 0,A,B
0,1,11
1,5,1


In [62]:
B = pd.DataFrame(rng.randint(0,12,(3,3)),
                 columns=list('BAC'))
B

Unnamed: 0,B,A,C
0,11,4,0
1,11,9,5
2,11,8,0


In [63]:
A + B

Unnamed: 0,A,B,C
0,5.0,22.0,
1,14.0,12.0,
2,,,


In [64]:
A.add(B, fill_value = 0)

Unnamed: 0,A,B,C
0,5.0,22.0,0.0
1,14.0,12.0,5.0
2,8.0,11.0,0.0


In [65]:
# Alternative: fill with the mean of all values in A
# (stack() collapses the frame into a single Series first).
fill = A.stack().mean()
A.add(B, fill_value = fill)

Unnamed: 0,A,B,C
0,5.0,22.0,4.5
1,14.0,12.0,9.5
2,12.5,15.5,4.5


In [66]:
A = rng.randint(10, size = (3,4))
A

array([[9, 2, 6, 3],
       [8, 2, 4, 2],
       [6, 4, 8, 6]])

In [67]:
A - A[0]

array([[ 0,  0,  0,  0],
       [-1,  0, -2, -1],
       [-3,  2,  2,  3]])

In [68]:
A - A[-1]

array([[ 3, -2, -2, -3],
       [ 2, -2, -4, -4],
       [ 0,  0,  0,  0]])

In [69]:
# Same subtraction on a DataFrame: the first row is subtracted from every row.
df = pd.DataFrame(A, columns=list('QAZP'))
df - df.iloc[0]

Unnamed: 0,Q,A,Z,P
0,0,0,0,0
1,-1,0,-2,-1
2,-3,2,2,3


In [70]:
# axis=0 aligns on the row index, so column 'Z' is subtracted from every column.
df.subtract(df['Z'], axis = 0)

Unnamed: 0,Q,A,Z,P
0,3,-4,0,-3
1,4,-2,0,-2
2,-2,-4,0,-2


In [71]:
kat = np.array([1, None, 3, 4])
kat

array([1, None, 3, 4], dtype=object)

In [72]:
for dtype in ['object', 'int']:
  print('dtype =', dtype)
  %timeit np.arange(1E6, dtype=dtype).sum()
  print()

dtype = object
10 loops, best of 3: 68.4 ms per loop

dtype = int
100 loops, best of 3: 2.63 ms per loop



In [73]:
# Summing an object array that contains None fails, because addition between
# an integer and None is undefined. Catch the error and show it so that the
# demonstration does not abort a Restart & Run All.
try:
    kat.sum()
except TypeError as err:
    print('TypeError:', err)

TypeError: ignored

In [None]:
# Build a Series that mixes both missing-value markers, np.nan and None.
x= pd.Series([1, np.nan, 2, None])
x


In [None]:
# Explicitly request an integer dtype for the Series.
pd.Series(range(2), dtype=int)

In [None]:
# Assign None to the first element, then display the Series to see how it is stored.
x[0]= None
x

In [None]:
data = pd.Series([1, np.nan, 2 , None])   # sample Series containing two missing values
data.isnull()  # boolean mask: True where a value is missing

In [None]:
data[data.notnull()]  # keep only the entries that are not null

In [74]:
# Drop entries that contain missing values.
# NOTE(review): the output recorded below is the area/pop/density DataFrame, so
# this cell was apparently run while `data` still held that frame (the cells
# that rebind `data` to a Series with nulls show no execution count). Re-run
# the notebook top to bottom to refresh it.
data.dropna()

Unnamed: 0,area,pop,density
California,423967,38332521,99.0
Texas,695662,26448193,200.0
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [76]:
# Small frame with one missing value in the first row and one in the last.
df = pd.DataFrame([
    [1, np.nan, 3],
    [1, 4, 6],
    [np.nan, 1, 9],
])
df

Unnamed: 0,0,1,2
0,1.0,,3
1,1.0,4.0,6
2,,1.0,9


In [78]:
df.dropna()   # by default drops every row that contains at least one null value

Unnamed: 0,0,1,2
1,1.0,4.0,6


In [80]:
# Filling null values: a labelled Series with two gaps to work on.
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=list('abcde'),
)
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [81]:
data.fillna(0)  # replace every missing value with a constant, here zero

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [82]:
# Forward-fill: propagate the last valid value into each following gap.
# `ffill()` replaces `fillna(method='ffill')`, whose `method` argument is
# deprecated in recent pandas releases.
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [83]:
df

Unnamed: 0,0,1,2
0,1.0,,3
1,1.0,4.0,6
2,,1.0,9


In [84]:
# Forward-fill along each row (axis=1); a gap with no earlier value in its
# row stays NaN. `ffill()` replaces the deprecated `fillna(method='ffill')`.
df.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,1.0,3.0
1,1.0,4.0,6.0
2,,1.0,9.0
