<a href="https://colab.research.google.com/github/Meghanap18/INTERNSOFTcodefiles/blob/main/Day7_pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing and Using Pandas

In [None]:
# Confirm pandas is importable and show which version this notebook runs against.
import pandas
pandas.__version__

'1.1.5'

Introducing Pandas Objects

In [None]:
import numpy as np
import pandas as pd

Series

In [None]:
# A Series built from a plain list gets a default integer index (0..n-1).
counts = pd.Series(data=[632, 1638, 569, 115])
counts

0     632
1    1638
2     569
3     115
dtype: int64

In [None]:
# The data held by the Series, exposed as a NumPy array (labels are dropped).
counts.values

array([ 632, 1638,  569,  115])

In [None]:
# The labels of the Series; no index was supplied, so this is the default range index.
counts.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
# Same counts as before, but each value is labelled with a phylum name
# instead of the default integer position.
bacteria = pd.Series(
    data=[632, 1638, 569, 115],
    index=['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'],
)

bacteria

Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
dtype: int64

In [None]:
# Label-based lookup: fetch the single value stored under this index label.
bacteria['Actinobacteria']

569

In [None]:
# Positional lookup of the first element. Use .iloc explicitly: the index here
# holds string labels, and relying on an integer key to fall back to positional
# access is deprecated in recent pandas versions.
bacteria.iloc[0]

632

In [None]:
# Attach a name to the Series itself and to its index; both show up in the repr.
# Note: this mutates `bacteria` in place, so later cells see the named Series.
bacteria.name = 'counts'
bacteria.index.name = 'phylum'
bacteria

phylum
Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
Name: counts, dtype: int64

In [None]:
# NumPy ufuncs work element-wise on a Series and keep its index and name.
np.log(bacteria)

phylum
Firmicutes        6.448889
Proteobacteria    7.401231
Actinobacteria    6.343880
Bacteroidetes     4.744932
Name: counts, dtype: float64

In [None]:
# Boolean-mask filtering: keep only the entries whose count exceeds 1000.
bacteria[bacteria>1000]

phylum
Proteobacteria    1638
Name: counts, dtype: int64

In [None]:
# A dict maps directly onto a Series: keys become the index, values the data.
bacteria_dict = dict(
    Firmicutes=632,
    Proteobacteria=1500,
    Actinobacteria=569,
    Bacteroidetes=115,
)
print(bacteria_dict)
pd.Series(bacteria_dict)

{'Firmicutes': 632, 'Proteobacteria': 1500, 'Actinobacteria': 569, 'Bacteroidetes': 115}


Firmicutes         632
Proteobacteria    1500
Actinobacteria     569
Bacteroidetes      115
dtype: int64

DataFrame: bi-dimensional Series with two (or more) indices

In [None]:
# Column-oriented input: each key is a column name, each list holds that
# column's values (all lists have the same length).
data = dict(
    Province=["FL", "FL", "NH", "NH", "ZH"],
    Year=[2013, 2014, 2013, 2014, 2014],
    Literacy=[0.2, 0.1, 0.5, 0.3, 0.5],
)
print(data)
# NOTE: `data` is rebound here from the raw dict to the DataFrame built from it.
data = pd.DataFrame(data)
data

{'Province': ['FL', 'FL', 'NH', 'NH', 'ZH'], 'Year': [2013, 2014, 2013, 2014, 2014], 'Literacy': [0.2, 0.1, 0.5, 0.3, 0.5]}


Unnamed: 0,Province,Year,Literacy
0,FL,2013,0.2
1,FL,2014,0.1
2,NH,2013,0.5
3,NH,2014,0.3
4,ZH,2014,0.5


In [None]:
# Build `df` from `data`, using `columns=` to pick the column order explicitly
# (Year first, then Province, then Literacy).
df = pd.DataFrame(data, columns=["Year", "Province" ,"Literacy"])
df

Unnamed: 0,Year,Province,Literacy
0,2013,FL,0.2
1,2014,FL,0.1
2,2013,NH,0.5
3,2014,NH,0.3
4,2014,ZH,0.5


In [None]:
# Add a derived column by dividing two existing columns element-wise.
# (As the name says, the resulting quantity is not meaningful.)
df['nonsense'] = df['Year'] / df['Literacy']
df

Unnamed: 0,Year,Province,Literacy,nonsense
0,2013,FL,0.2,10065.0
1,2014,FL,0.1,20140.0
2,2013,NH,0.5,4026.0
3,2014,NH,0.3,6713.333333
4,2014,ZH,0.5,4028.0


In [None]:
# Assigning a Series as a column aligns it on the index: each value lands on
# the row carrying the same label (0..4 here).
df['Serie_aligned'] = pd.Series(range(5), index=list(range(5)))
df

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned
0,2013,FL,0.2,10065.0,0
1,2014,FL,0.1,20140.0,1
2,2013,NH,0.5,4026.0,2
3,2014,NH,0.3,6713.333333,3
4,2014,ZH,0.5,4028.0,4


In [None]:
# Export the frame as a nested dict: {column name -> {row label -> value}}.
df.to_dict()

{'Literacy': {0: 0.2, 1: 0.1, 2: 0.5, 3: 0.3, 4: 0.5},
 'Province': {0: 'FL', 1: 'FL', 2: 'NH', 3: 'NH', 4: 'ZH'},
 'Serie_aligned': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
 'Year': {0: 2013, 1: 2014, 2: 2013, 3: 2014, 4: 2014},
 'nonsense': {0: 10065.0,
  1: 20140.0,
  2: 4026.0,
  3: 6713.333333333334,
  4: 4028.0}}

In [None]:
# Round trip: the nested dict produced by to_dict() rebuilds the same table.
pd.DataFrame(df.to_dict())

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned
0,2013,FL,0.2,10065.0,0
1,2014,FL,0.1,20140.0,1
2,2013,NH,0.5,4026.0,2
3,2014,NH,0.3,6713.333333,3
4,2014,ZH,0.5,4028.0,4


From a list of dicts

In [None]:
# Row-oriented input: one dict per row; the dict keys become column names.
# NOTE: this rebinds `data`, which earlier cells used for a different object.
data = [dict(a=i, b=10 * i) for i in range(6)]
print(data)
pd.DataFrame(data)

[{'a': 0, 'b': 0}, {'a': 1, 'b': 10}, {'a': 2, 'b': 20}, {'a': 3, 'b': 30}, {'a': 4, 'b': 40}, {'a': 5, 'b': 50}]


Unnamed: 0,a,b
0,0,0
1,1,10
2,2,20
3,3,30
4,4,40
5,5,50


In [None]:
# Records need not share the same keys: the columns are the union of all keys,
# and a key absent from a record shows up as a missing value in that row.
pd.DataFrame([{'aa': 1, 'bb': 2}, {'bb': 3, 'cc': 6}])

Unnamed: 0,aa,bb,cc
0,1.0,2,
1,,3,6.0


From a two-dimensional NumPy array

In [None]:
# Build the frame from a genuine two-dimensional array: 3 rows ('a'..'c') by
# 2 columns ('foo', 'bar') of integers drawn from [2, 12).
# Without size=, randint returns a single scalar that pandas broadcasts to
# every cell, so all entries would be identical. The generator is seeded so
# the displayed table is reproducible on re-run.
random_df = pd.DataFrame(np.random.RandomState(0).randint(2, 12, size=(3, 2)),
                         columns=['foo', 'bar'],
                         index=['a', 'b', 'c'])
random_df


Unnamed: 0,foo,bar
a,2,2
b,2,2
c,2,2


The Pandas Index Object

In [None]:
# An Index can be built directly from a list of labels.
ind = pd.Index(data=[20, 34, 57, 7, 1, 8])
ind

Int64Index([20, 34, 57, 7, 1, 8], dtype='int64')

Index as immutable array

In [None]:
# Indexing with a single position returns the scalar label stored there.
ind[1]

34

In [None]:
# Slicing an Index returns another Index; the full slice selects every element.
ind[::]

Int64Index([20, 34, 57, 7, 1, 8], dtype='int64')

In [None]:
# An Index exposes array-style attributes: element count, shape, number of
# dimensions and element dtype.
print(ind.size, ind.shape, ind.ndim, ind.dtype)

6 (6,) 1 int64


In [None]:
# An Index is immutable: item assignment raises TypeError. The error is the
# point of this cell, so catch it and print it rather than letting the
# exception abort a top-to-bottom run of the notebook.
try:
    ind[1] = 0
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: ignored

Ufuncs: Index Preservation

In [None]:
# Seeded generator, so the random draws in this section are reproducible.
rng = np.random.RandomState(seed=15)
# Four integers drawn from [0, 10), wrapped in a Series with the default index.
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    8
1    5
2    5
3    7
dtype: int64

In [None]:
# A 5x4 table of integers in [0, 10), drawn from the same seeded generator,
# with columns labelled A-D.
dfr = pd.DataFrame(
    rng.randint(0, 10, size=(5, 4)),
    columns=list('ABCD'),
)
dfr

Unnamed: 0,A,B,C,D
0,0,7,5,6
1,1,7,0,4
2,9,7,5,3
3,6,8,2,1
4,1,0,5,2


In [None]:
# Applying a NumPy ufunc to a Series yields a Series with the same index.
np.exp(ser)

0    2980.957987
1     148.413159
2     148.413159
3    1096.633158
dtype: float64

In [None]:
# Arithmetic and ufuncs are applied element-wise to a DataFrame as well;
# row and column labels are preserved in the result.
np.sin(dfr * np.pi / 4)

Unnamed: 0,A,B,C,D
0,0.0,-0.7071068,-0.707107,-1.0
1,0.707107,-0.7071068,0.0,1.224647e-16
2,0.707107,-0.7071068,-0.707107,0.7071068
3,-1.0,-2.449294e-16,1.0,0.7071068
4,0.707107,0.0,-0.707107,1.0


Index alignment in Series

In [None]:
# Two Series whose indexes only partly overlap: 'Alaska' appears only in
# `area`, 'New York' only in `population`.
area = pd.Series(
    {'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    {'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)
print(area)
population

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64


California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64

In [None]:
# Binary operations align on index labels: the result covers every label from
# either operand, and labels present in only one of them come out as NaN.
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [None]:
# Union of the two label sets, which is the index the aligned result above carries.
# Use the explicit Index.union method: the `|` operator is no longer a set
# operation on Index objects in current pandas and fails on string labels.
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [None]:
# Two Series whose integer indexes overlap only on labels 1 and 2.
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A)
print(B)
# Addition aligns on labels; labels found in only one operand (0 and 3) give NaN.
A + B

0    2
1    4
2    6
dtype: int64
1    1
2    3
3    5
dtype: int64


0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [None]:
# Method form of `+`: fill_value=0 substitutes 0 for a label missing from one
# operand, so labels 0 and 3 keep their single available value instead of NaN.
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

Merge operations

In [None]:
# Re-display `df` (built in the DataFrame section) as the left table for the merges below.
df

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned
0,2013,FL,0.2,10065.0,0
1,2014,FL,0.1,20140.0,1
2,2013,NH,0.5,4026.0,2
3,2014,NH,0.3,6713.333333,3
4,2014,ZH,0.5,4028.0,4


In [None]:
# Lookup table: one row per province with its population.
# The population values are stored as strings here.
df2 = pd.DataFrame(
    {
        "Province": ["FL", "NH", "ZH"],
        "Population": ["100000", "200000", "300000"],
    }
)
df2

Unnamed: 0,Province,Population
0,FL,100000
1,NH,200000
2,ZH,300000


In [None]:
# With no key given, merge joins on the column names the two frames share.
# Here that is "Province", so each row of df picks up its province's Population.
df.merge(df2)

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned,Population
0,2013,FL,0.2,10065.0,0,100000
1,2014,FL,0.1,20140.0,1,100000
2,2013,NH,0.5,4026.0,2,200000
3,2014,NH,0.3,6713.333333,3,200000
4,2014,ZH,0.5,4028.0,4,300000


In [None]:
# Lookup table whose key column is spelled differently ("province", lower case)
# from the one in df ("Province").
df3 = pd.DataFrame({"province": ["FL", "NH"], "Population": ["100000", "200000"]})
# A bare expression is only displayed when it is the last line of a cell, so
# print the table explicitly to show it before the merge result.
print(df3)
# The key names differ, so name the key on each side. Both key columns are kept
# in the result, and provinces absent from df3 (ZH) are dropped by the default join.
df.merge(df3, right_on='province', left_on='Province')

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned,province,Population
0,2013,FL,0.2,10065.0,0,FL,100000
1,2014,FL,0.1,20140.0,1,FL,100000
2,2013,NH,0.5,4026.0,2,NH,200000
3,2014,NH,0.3,6713.333333,3,NH,200000


In [None]:
# df4 lacks "ZH" and adds "UT", which df does not contain.
df4 = pd.DataFrame({"Province": ["FL", "NH", "UT"], "Population": ["100000", "200000", "50000"]})
# An outer merge keeps unmatched rows from both sides: ZH gets a missing
# Population, and UT gets missing values in every column that comes from df.
df.merge(df4, how='outer')

Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned,Population
0,2013.0,FL,0.2,10065.0,0.0,100000.0
1,2014.0,FL,0.1,20140.0,1.0,100000.0
2,2013.0,NH,0.5,4026.0,2.0,200000.0
3,2014.0,NH,0.3,6713.333333,3.0,200000.0
4,2014.0,ZH,0.5,4028.0,4.0,
5,,UT,,,,50000.0


In [None]:
# df5 lists "FL" twice, so the merge key is not unique on the right-hand side.
df5 = pd.DataFrame({"Province": ["FL", "NH", "FL"], "Population": ["100000", "200000", "50000"]})
print(df)
# Every FL row of df is paired with both FL rows of df5, so the FL rows are
# duplicated in the result; ZH has no match and keeps a missing Population.
df.merge(df5, how='outer')

   Year Province  Literacy      nonsense  Serie_aligned
0  2013       FL       0.2  10065.000000              0
1  2014       FL       0.1  20140.000000              1
2  2013       NH       0.5   4026.000000              2
3  2014       NH       0.3   6713.333333              3
4  2014       ZH       0.5   4028.000000              4


Unnamed: 0,Year,Province,Literacy,nonsense,Serie_aligned,Population
0,2013,FL,0.2,10065.0,0,100000.0
1,2013,FL,0.2,10065.0,0,50000.0
2,2014,FL,0.1,20140.0,1,100000.0
3,2014,FL,0.1,20140.0,1,50000.0
4,2013,NH,0.5,4026.0,2,200000.0
5,2014,NH,0.3,6713.333333,3,200000.0
6,2014,ZH,0.5,4028.0,4,


Combining data with overlap

In [None]:
# Two Series on the same labels: `serie_a` has gaps (NaN), while `serie_b` is
# fully populated with 0.0, 1.0, ... and can be used to fill them.
serie_a = pd.Series(
    [np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
    index=list('fedcba'),
)
serie_b = pd.Series(
    np.arange(len(serie_a), dtype=np.float64),
    index=list('fedcba'),
)

In [None]:
# Series with missing values at labels f, d and a.
serie_a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [None]:
# Fully populated Series on the same labels, used below to fill the gaps.
serie_b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [None]:
# Manual combine: where serie_a is missing take the value from serie_b,
# otherwise keep serie_a. np.where returns a bare array, so the index is
# re-attached explicitly.
pd.Series(
    np.where(serie_a.isnull(), serie_b, serie_a),
    index=serie_a.index,
)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64

In [None]:
# Built-in equivalent of the np.where approach: keep serie_a's values and fill
# its missing entries from serie_b, matching on index labels.
serie_a.combine_first(serie_b)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64