<h1>EDA using Python</h1>

<h2>Pandas Series Object</h2>

In [1]:
import numpy as np
import pandas as pd
data = pd.Series([0.25,0.5,0.75,1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [2]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [3]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
df = pd.Series([0.25,0.5,0.75,1.0],index = ['a','b','c','d'])   #Series as a generalized numpy array
df['a']

0.25

In [8]:
population = {'California':3983910,'Texas':4578814,'Florida':7941346,'New York':614952,'Pennsylvania':845617}
population

{'California': 3983910,
 'Texas': 4578814,
 'Florida': 7941346,
 'New York': 614952,
 'Pennsylvania': 845617}

<h2>Pandas DataFrame Object</h2>

In [14]:
area = {'California':423967,'Texas':695662,'Florida':170312,'New York':141297,'Pennsylvania':119280}
area

{'California': 423967,
 'Texas': 695662,
 'Florida': 170312,
 'New York': 141297,
 'Pennsylvania': 119280}

In [17]:
#merging the above two series of population and area to form a dataframe
states = pd.DataFrame({'Population':population,'Area':area})
states

Unnamed: 0,Population,Area
California,3983910,423967
Texas,4578814,695662
Florida,7941346,170312
New York,614952,141297
Pennsylvania,845617,119280


In [18]:
states.index

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [19]:
states.columns

Index(['Population', 'Area'], dtype='object')

In [20]:
states['Area']

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: Area, dtype: int64

<h2>Pandas Index Object</h2>

In [21]:
ind = pd.Index([2,3,5,7,11])
ind

Index([2, 3, 5, 7, 11], dtype='int64')

In [22]:
ind[1]

3

In [23]:
ind[::2]

Index([2, 5, 11], dtype='int64')

In [27]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

#intersection
print(indA.intersection(indB))

#union
print(indA.union(indB))

#Symmetric Difference
print(indA.symmetric_difference(indB))

Index([3, 5, 7], dtype='int64')
Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Index([1, 2, 9, 11], dtype='int64')


<h1>Data Indexing and Selection</h1>

<h2>Data Selection in Series</h2>

In [29]:
df = pd.Series([0.25,0.5,0.75,1.0],index = ['a','b','c','d'])
df['b']

0.5

In [33]:
df1 = pd.Series(['a','b','c'],index = [1,3,5])
df1

1    a
3    b
5    c
dtype: object

In [34]:
df1[1]  #Explicit Indexing

'a'

In [35]:
df1[1:3]  #Implicit Indexing

3    b
5    c
dtype: object

In [36]:
df1.loc[1]  #Explicit Indexing

'a'

In [37]:
df1.iloc[1:3]

3    b
5    c
dtype: object

<h2>Data Selection in DataFrames</h2>

In [39]:
population =pd.Series({'California':3983910,'Texas':4578814,'Florida':7941346,'New York':614952,'Pennsylvania':845617})
area = pd.Series({'California':423967,'Texas':695662,'Florida':170312,'New York':141297,'Pennsylvania':119280})
data =pd.DataFrame({'Population':population,'Area':area})
data

Unnamed: 0,Population,Area
California,3983910,423967
Texas,4578814,695662
Florida,7941346,170312
New York,614952,141297
Pennsylvania,845617,119280


In [41]:
data['Area']

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: Area, dtype: int64

In [44]:
data['Density']=data['Population']/data['Area']
data

Unnamed: 0,Population,Area,Density
California,3983910,423967,9.396746
Texas,4578814,695662,6.581952
Florida,7941346,170312,46.628223
New York,614952,141297,4.352194
Pennsylvania,845617,119280,7.089344


<h2>Operating on Data in Pandas</h2>

In [57]:
import numpy as np
import pandas as pd
rng = np.random.default_rng(42)   #generates a sequence of random numbers of seed - 42
ser = pd.DataFrame(rng.integers(0,10,(4,3)),columns=['A','B','C']) #framing a dataframe with 4rows and 3columns with column names.
ser

Unnamed: 0,A,B,C
0,0,7,6
1,4,4,8
2,0,6,2
3,0,5,9


In [58]:
np.exp(ser)

Unnamed: 0,A,B,C
0,1.0,1096.633158,403.428793
1,54.59815,54.59815,2980.957987
2,1.0,403.428793,7.389056
3,1.0,148.413159,8103.083928


<h2>Hierarchial Indexing</h2>

In [3]:
import pandas as pd
index = [('California', 2000), ('California', 2010),
 ('New York', 2000), ('New York', 2010),
 ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
 18976457, 19378102,
 20851820, 25145561]
pop = pd.Series(populations,index=index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [4]:
pop = pop.reindex(index)
pop

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [7]:
import numpy as np
df = pd.DataFrame(np.random.rand(4,2),index=[['a','a','b','b'],[1,2,1,2]],columns=['data1','data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.228183,0.834158
a,2,0.886149,0.517606
b,1,0.355754,0.385211
b,2,0.497085,0.111359


In [9]:
index = pd.MultiIndex.from_product([[2013,2014],[1,2]],names=['year','visit'])
columns = pd.MultiIndex.from_product([['Bob','Guido','Sue'],['HR','Temp']],names=['subject','type'])
#mock some data
data = np.round(np.random.randn(4,6),1)
data[:,::2]*=10
data+=37
#creating a dataframe
health_data = pd.DataFrame(data,index=index,columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,46.0,36.9,41.0,36.7,47.0,36.0
2013,2,48.0,36.8,44.0,35.5,11.0,36.3
2014,1,38.0,36.7,21.0,36.0,41.0,36.3
2014,2,41.0,37.5,27.0,37.6,43.0,35.1


<h2>Combining Datasets using Concat and Append</h2>

In [10]:
ser1 = pd.Series(['A','B','C'],index = [1,2,3])  #concatenation
ser2 = pd.Series(['D','E','F'],index = [4,5,6])
pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [13]:
def make_df(columns, data):
    df = pd.DataFrame({col: data for col in columns})
    return df

df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
result = pd.concat([df5, df6])
display(result)

Unnamed: 0,A,B,C,D
0,1.0,1,1,
1,2.0,2,2,
0,,3,3,3.0
1,,4,4,4.0


In [14]:
display(pd.concat([df5,df6],join = 'inner'))

Unnamed: 0,B,C
0,1,1
1,2,2
0,3,3
1,4,4
