In [43]:
import pandas
pandas.__version__

'1.2.4'

# Introducing Pandas Objects

In [44]:
import numpy as np
import pandas as pd

### Series

In [45]:
s=pd.Series([33,44,67,19])
s

0    33
1    44
2    67
3    19
dtype: int64

In [46]:
s.values

array([33, 44, 67, 19], dtype=int64)

In [47]:
s.index

RangeIndex(start=0, stop=4, step=1)

In [48]:
import pandas as pd
bacteria=pd.Series([122,2334,334,445],index=['a','b','c','d'])
bacteria

a     122
b    2334
c     334
d     445
dtype: int64

In [50]:
bacteria[2]

334

In [51]:
bacteria.name='counts'
bacteria.index.name='phylum'
bacteria

phylum
a     122
b    2334
c     334
d     445
Name: counts, dtype: int64

In [52]:
np.log(bacteria)

phylum
a    4.804021
b    7.755339
c    5.811141
d    6.098074
Name: counts, dtype: float64

In [53]:
bacteria[bacteria>1000]

phylum
b    2334
Name: counts, dtype: int64

In [54]:
name_dict={101:'john',102:'jonny',103:'tom'}
print(name_dict)
nm=pd.Series(name_dict)
nm

{101: 'john', 102: 'jonny', 103: 'tom'}


101     john
102    jonny
103      tom
dtype: object

## DataFrame: bi-dimensional Series with two (or more) indices

In [55]:
data={'privince':["FT","FH","FF"],
     'year':[2014,2013,2019],
     'literacy':[0.2,0.9,0.8]}
print(data)
data=pd.DataFrame(data)
data

{'privince': ['FT', 'FH', 'FF'], 'year': [2014, 2013, 2019], 'literacy': [0.2, 0.9, 0.8]}


Unnamed: 0,privince,year,literacy
0,FT,2014,0.2
1,FH,2013,0.9
2,FF,2019,0.8


In [56]:
chng=pd.DataFrame(data,columns=['year','privince','literacy'])
chng

Unnamed: 0,year,privince,literacy
0,2014,FT,0.2
1,2013,FH,0.9
2,2019,FF,0.8


In [57]:
chng['new_col']=chng.year/chng.literacy
chng

Unnamed: 0,year,privince,literacy,new_col
0,2014,FT,0.2,10070.0
1,2013,FH,0.9,2236.666667
2,2019,FF,0.8,2523.75


In [58]:
chng['new_series']=pd.Series(['a','b','c'], index=[0,1,2])
chng

Unnamed: 0,year,privince,literacy,new_col,new_series
0,2014,FT,0.2,10070.0,a
1,2013,FH,0.9,2236.666667,b
2,2019,FF,0.8,2523.75,c


In [60]:
pd.DataFrame(chng.to_dict())

Unnamed: 0,year,privince,literacy,new_col,new_series
0,2014,FT,0.2,10070.0,a
1,2013,FH,0.9,2236.666667,b
2,2019,FF,0.8,2523.75,c


### DataFrame as specialized dictionary

#### From a list of dicts

In [61]:
data=[{'a':i,'b':10*i}for i in range(5)]
print(data)
pd.DataFrame(data)

[{'a': 0, 'b': 0}, {'a': 1, 'b': 10}, {'a': 2, 'b': 20}, {'a': 3, 'b': 30}, {'a': 4, 'b': 40}]


Unnamed: 0,a,b
0,0,0
1,1,10
2,2,20
3,3,30
4,4,40


In [62]:
chng['c']=pd.Series([3,3])
chng

Unnamed: 0,year,privince,literacy,new_col,new_series,c
0,2014,FT,0.2,10070.0,a,3.0
1,2013,FH,0.9,2236.666667,b,3.0
2,2019,FF,0.8,2523.75,c,


#### From a two-dimensional NumPy array

In [63]:
import numpy as np
import pandas as pd
pd.DataFrame(np.random.randint(2,12),columns=['aa','bb'],index=[1,2,3])

Unnamed: 0,aa,bb
1,11,11
2,11,11
3,11,11


## The Pandas Index Object

In [9]:
i=pd.Index([12,33,44,33,23,4,11])
i

Int64Index([12, 33, 44, 33, 23, 4, 11], dtype='int64')

### Index as immutable array

In [64]:
i[3]

33

In [11]:
i[2:4]

Int64Index([44, 33], dtype='int64')

In [12]:
print(i.size,i.shape,i.ndim,i.dtype)

7 (7,) 1 int64


In [13]:
i[1]=56

TypeError: Index does not support mutable operations

## Ufuncs: Index Preservation

In [14]:
rng=np.random.RandomState(15)
ser=pd.Series(rng.randint(0,10,5))
ser

0    8
1    5
2    5
3    7
4    0
dtype: int32

In [15]:
d=pd.DataFrame(rng.randint(0,8,(4,3)),columns=["a","b","c"])
d

Unnamed: 0,a,b,c
0,4,3,7
1,3,5,6
2,7,5,1
3,5,7,7


In [16]:
import numpy as np
np.exp(ser)

0    2980.957987
1     148.413159
2     148.413159
3    1096.633158
4       1.000000
dtype: float64

In [17]:
np.sin(d*np.pi/4)

Unnamed: 0,a,b,c
0,1.224647e-16,0.707107,-0.707107
1,0.7071068,-0.707107,-1.0
2,-0.7071068,-0.707107,0.707107
3,-0.7071068,-0.707107,-0.707107


### Index alignment in Series

In [18]:
area=pd.Series({'Alaska':435665,'texas':987658,'california':345677},name='area')
population=pd.Series({'california':3456789,'texas':9876549,'New york':234567},name='population')
print(area)
population


Alaska        435665
texas         987658
california    345677
Name: area, dtype: int64


california    3456789
texas         9876549
New york       234567
Name: population, dtype: int64

In [19]:
population/area

Alaska              NaN
New york            NaN
california    10.000055
texas          9.999969
dtype: float64

In [20]:
X=pd.Series([4,5,6,7],index=[0,1,2,3])
Y=pd.Series([2,3,42,2],index=[1,2,3,4])
print(X)
print(Y)
X+Y

0    4
1    5
2    6
3    7
dtype: int64
1     2
2     3
3    42
4     2
dtype: int64


0     NaN
1     7.0
2     9.0
3    49.0
4     NaN
dtype: float64

In [21]:
X.add(Y,fill_value=0)

0     4.0
1     7.0
2     9.0
3    49.0
4     2.0
dtype: float64

## Merge operations

In [30]:
data

Unnamed: 0,privince,year,literacy
0,FT,2014,0.2
1,FH,2013,0.9
2,FF,2019,0.8


In [33]:
df=pd.DataFrame({'privince':["FT","FH","ZH"], "Population":["100000","200000","300000"]})
df

Unnamed: 0,privince,Population
0,FT,100000
1,FH,200000
2,ZH,300000


In [35]:
  # merge is smart! If there are overlapping names, it uses those for the merge
data.merge(df)

Unnamed: 0,privince,year,literacy,Population
0,FT,2014,0.2,100000
1,FH,2013,0.9,200000


In [36]:
data.merge(df,right_on='privince',left_on='privince')

Unnamed: 0,privince,year,literacy,Population
0,FT,2014,0.2,100000
1,FH,2013,0.9,200000


### Combining data with overlap

In [38]:
a=pd.Series([np.nan,2.5,np.nan,3.4,4.5,np.nan],index=['f','e','d','c','b','a'])
b=pd.Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])

In [39]:
a

f    NaN
e    2.5
d    NaN
c    3.4
b    4.5
a    NaN
dtype: float64

In [40]:
b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [41]:
pd.Series(np.where(pd.isnull(a),b,a), index=a.index)

f    0.0
e    2.5
d    2.0
c    3.4
b    4.5
a    5.0
dtype: float64

In [66]:
a

f    NaN
e    2.5
d    NaN
c    3.4
b    4.5
a    NaN
dtype: float64