Set up environment

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


Pandas Series Object

In [3]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [4]:
print(data.values)
print(data.index)

[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [5]:
# Accesing the data by index:
print(data[0], data[1:3], sep='\n')

0.25
1    0.50
2    0.75
dtype: float64


In [6]:
# Setting up index
data = pd.Series([0.25, 0.5, 0.75, 1.0], 
                 index=['a', 'b', 'c', 'd'])
print(data, data['a'], sep='\n')

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.25


In [7]:
# Series from python dict
population_dict = {'California': 3738738,
                   'Texas': 234324,
                   'New York': 354354,
                   'Florida': 24314,
                   'Illoins': 232121}
population = pd.Series(population_dict)
print(population['California'])               # Show value
print(population['California':'Illoins'])     # Slicing like arrays

3738738
California    3738738
Florida         24314
Illoins        232121
dtype: int64


In [8]:
print(pd.Series({2: 'a', 1: 'b', 4: 'c', 3: 'd'}))
print(pd.Series({2: 'a', 1: 'b', 4: 'c', 3: 'd'}, index=[3, 2, 1]))

1    b
2    a
3    d
4    c
dtype: object
3    d
2    a
1    b
dtype: object


In [9]:
area_dict = {'California': 373,
             'Texas': 234,
             'New York': 354,
             'Florida': 243,
             'Illoins': 232}
area = pd.Series(area_dict)

In [10]:
states = pd.DataFrame({'Population': population, 'Area': area})
states

Unnamed: 0,Area,Population
California,373,3738738
Florida,243,24314
Illoins,232,232121
New York,354,354354
Texas,234,234324


In [11]:
print(states.columns)
print(states.index)
states['Area']      # This shows column called 'Area' and not row! 

Index(['Area', 'Population'], dtype='object')
Index(['California', 'Florida', 'Illoins', 'New York', 'Texas'], dtype='object')


California    373
Florida       243
Illoins       232
New York      354
Texas         234
Name: Area, dtype: int64

In [12]:
# Different ways of constructing Data Frames
# 1)From single Series object
population = pd.Series({'California': 37323,
             'Texas': 23432,
             'New York': 35124,
             'Florida': 24323,
             'Illoins': 23232})
print(pd.DataFrame(population, columns=['Population']))
# 2)From a list of dict
data = [{'a': i, 'b': i+1, 'c': i+2} for i in range(3)]
print(pd.DataFrame(data))
print(pd.DataFrame([{'a': 1, 'b': 2}, 
                   {'a': 1, 'c': 3}, 
                   {'b': 4, 'c': 5}]))
# 3) From dictionary of objects
print(pd.DataFrame({'Population': population}))
# 4) From two-dimensional NumPy array
print(pd.DataFrame(np.random.rand(3, 2),
                   columns=['foo', 'bar'],
                   index=['a', 'b', 'c']))
# 5) From a NumPy structured Array
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
print(pd.DataFrame(A, index=['a', 'b', 'c']))

            Population
California       37323
Florida          24323
Illoins          23232
New York         35124
Texas            23432
   a  b  c
0  0  1  2
1  1  2  3
2  2  3  4
     a    b    c
0  1.0  2.0  NaN
1  1.0  NaN  3.0
2  NaN  4.0  5.0
            Population
California       37323
Florida          24323
Illoins          23232
New York         35124
Texas            23432
        foo       bar
a  0.885327  0.210353
b  0.081165  0.661114
c  0.549123  0.828564
   A    B
a  0  0.0
b  0  0.0
c  0  0.0


In [13]:
# Index object
ind = pd.Index([2, 3, 4, 5, 7, 11])
print(ind[1], ind[::2])         # you can index elements inside Index object
try:
    ind[1] = 2
except TypeError:
    print('Index is immutable')
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
print('Intersection: ', indA & indB)
print('Union: ', indA | indB)
print('Symmetric difference: ', indA & indB)


3 Int64Index([2, 4, 7], dtype='int64')
Index is immutable
Intersection:  Int64Index([3, 5, 7], dtype='int64')
Union:  Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')
Symmetric difference:  Int64Index([3, 5, 7], dtype='int64')


In [14]:
# Indexing of series
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data['b'], data['b':'c'], data[1:3], data[(data > 0.5) & (data < 1)],
      'a' in data, data.keys(), sep='\n')
data['a'] = 0.11
data['e'] = 1.11
print(data)

0.5
b    0.50
c    0.75
dtype: float64
b    0.50
c    0.75
dtype: float64
c    0.75
dtype: float64
True
Index(['a', 'b', 'c', 'd'], dtype='object')
a    0.11
b    0.50
c    0.75
d    1.00
e    1.11
dtype: float64


In [15]:
# Indexers
data = pd.Series(['a', 'b', 'c', 'd'], index=[1, 3, 7, 9])
# loc - are using indexers that were assigned to Series
print(data.loc[1], data.loc[1:3], sep='\n')
# iloc - are using indexes default by Python
print(data.iloc[1], data.iloc[1:3], sep='\n')

a
1    a
3    b
dtype: object
b
3    b
7    c
dtype: object


In [16]:
# Data Frame as dictionary
data = pd.DataFrame({'Population': population, 'Area': area})
print(data['Area'], data.Area, sep='\n')
data['Density'] = data['Population'] / data['Area']
data

California    373
Florida       243
Illoins       232
New York      354
Texas         234
Name: Area, dtype: int64
California    373
Florida       243
Illoins       232
New York      354
Texas         234
Name: Area, dtype: int64


Unnamed: 0,Area,Population,Density
California,373,37323,100.061662
Florida,243,24323,100.09465
Illoins,232,23232,100.137931
New York,354,35124,99.220339
Texas,234,23432,100.136752


In [17]:
data.T

Unnamed: 0,California,Florida,Illoins,New York,Texas
Area,373.0,243.0,232.0,354.0,234.0
Population,37323.0,24323.0,23232.0,35124.0,23432.0
Density,100.061662,100.09465,100.137931,99.220339,100.136752


In [18]:
# Explicit indexing
print(data.iloc[:2, :3], data.loc[:'Illoins', :'Population'], sep='\n')


            Area  Population     Density
California   373       37323  100.061662
Florida      243       24323  100.094650
            Area  Population
California   373       37323
Florida      243       24323
Illoins      232       23232


In [19]:
data.loc[data.Density < 100, ['Area']]

Unnamed: 0,Area
New York,354


In [20]:
data

Unnamed: 0,Area,Population,Density
California,373,37323,100.061662
Florida,243,24323,100.09465
Illoins,232,23232,100.137931
New York,354,35124,99.220339
Texas,234,23432,100.136752


In [21]:
data.loc['California', 'Population'] = 37399
data.loc['California', ['Population', 'Density']] = [37323, 100.06]
data.loc['California':'New York', 'Area'] = [245, 236, 345, 231]
data.Density = data.Population / data.Area
data['ID'] = [i + 1 for i in range(5)]
data

Unnamed: 0,Area,Population,Density,ID
California,245,37323,152.338776,1
Florida,236,24323,103.063559,2
Illoins,345,23232,67.33913,3
New York,231,35124,152.051948,4
Texas,234,23432,100.136752,5


In [23]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), 
                  columns=['A', 'B', 'C', 'D'])
np.exp(ser)
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [25]:
area = pd.Series({'Alaska': 12123, 'Texas': 232343, 'Cali': 34532}, name='Area')
population = pd.Series({'Cali': 34123, 'Texas': 34223, 'NewYork': 34211})
population / area

Alaska          NaN
Cali       0.988156
NewYork         NaN
Texas      0.147295
dtype: float64

In [51]:
# Simple example of index matching in those operations
A = pd.Series([1, 2, 3], index=[1, 2, 3])
B = pd.Series([4, 5, 6], index=[0, 2, 4])
A.add(B, fill_value=0)
# same as:
# A = pd.Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 4])
# B = pd.Series([4, 0, 5, 0, 6], index=[0, 1, 2, 3, 4])

0    4.0
1    1.0
2    7.0
3    3.0
4    6.0
dtype: float64

In [66]:
# Index alignment in Data Frame
A = pd.DataFrame(rng.randint(0, 10, (2, 2)), columns=list('AB'))
B = pd.DataFrame(rng.randint(0, 10, (4, 4)), columns=list('BCDE'))
print(A, B, A+B, sep='\n')
fill = B.stack().mean()
print(fill)
A.add(B, fill_value=fill)

   A  B
0  1  1
1  1  5
   B  C  D  E
0  2  8  3  0
1  3  0  4  3
2  7  7  6  2
3  0  0  2  5
    A    B   C   D   E
0 NaN  3.0 NaN NaN NaN
1 NaN  8.0 NaN NaN NaN
2 NaN  NaN NaN NaN NaN
3 NaN  NaN NaN NaN NaN
3.25


Unnamed: 0,A,B,C,D,E
0,4.25,3.0,11.25,6.25,3.25
1,4.25,8.0,3.25,7.25,6.25
2,,10.25,10.25,9.25,5.25
3,,3.25,3.25,5.25,8.25


0  B    2
   C    8
   D    3
   E    0
1  B    3
   C    0
   D    4
   E    3
2  B    7
   C    7
   D    6
   E    2
3  B    0
   C    0
   D    2
   E    5
dtype: int32

In [84]:
df = pd.DataFrame(rng.randint(10, size=(3, 4)), columns=list('QWER'))
print(df - df.iloc[0])
print(df.subtract(df['R'], axis=0))
print(df.subtract(df.iloc[0], axis=1))
df['T'] = (df.R - df.R.mean()) ** 2
df


   Q  W  E  R
0  0  0  0  0
1 -4  5  9  1
2 -2 -1  0  1
   Q  W  E  R
0  5  1 -3  0
1  0  5  5  0
2  2 -1 -4  0
   Q  W  E  R
0  0  0  0  0
1 -4  5  9  1
2 -2 -1  0  1


Unnamed: 0,Q,W,E,R,T
0,8,4,0,3,0.444444
1,4,9,9,4,0.111111
2,6,3,0,4,0.111111


MISSING DATA

In [99]:
# Pythonic missing values
vals1 = np.array([1, None, 3, 4])  # none is "object" so whole array will be Python "object" type
                                   # this means that it will calculated using python algoritghms 
                                   # and not optimized NumPy. Also it will show error if you sum it
                                   # with Null values
try:
    sum(vals1)
except TypeError:
    print('None cannot be summed')


None cannot be summed


In [104]:
# NumPy missing special Value NaN
vals2 = np.array([1, 2, np.nan, 4])

print(vals2, np.sum(vals2), np.nansum(vals2), sep='\n')

[ 1.  2. nan  4.]
nan
7.0


In [109]:
vals2 = pd.Series([1, np.nan, 2, None])
np.nansum(vals2)
vals2.sum()

3.0

In [118]:
vals2.isnull()
vals2.notnull()
vals2.dropna()
vals2.fillna(0)
vals2[vals2.notnull()]

0    1.0
2    2.0
dtype: float64

In [130]:
df = pd.DataFrame([[1, np.nan, 3],
                  [5, 3, 4],
                  [np.nan, 4, 3]])
df.dropna()
df.dropna(axis='columns')
df[3] = np.nan
df.dropna(axis='columns', how='all')
df.dropna(axis='rows', thresh=1)

Unnamed: 0,0,1,2,3
0,1.0,,3,
1,5.0,3.0,4,
2,,4.0,3,


In [150]:
# Filling null values
data = pd.Series([1, np.nan, 2.0, None, 3], index=list('abcde'))
data.fillna(0)                  # fill 0 for nan
data.fillna(method='ffill')     # forward fill
data.fillna(method='bfill')     # backward fill
df.fillna(axis=0, method='ffill')
df.fillna(axis=1, method='ffill')

Unnamed: 0,0,1,2,3
0,1.0,1.0,3.0,3.0
1,5.0,3.0,4.0,4.0
2,,4.0,3.0,3.0


In [156]:
# Multiply Index Series
index = [('California', 2000),
         ('California', 2001),
         ('Texas', 2000),
         ('Texas', 2001),
         ('New York', 2000),
         ('New York', 2001)]
pop = [3113231,
       3113211,
       3122121,
       3122331,
       31561331,
       31165331]
pop = pd.Series(pop, index=index)
index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)
pop['California']
pop[:, 2001]

California     3113211
Texas          3122331
New York      31165331
dtype: int64

In [165]:
data = pd.DataFrame([[123, 456],
                     [543, 321]],
                    index=['California', 'New York'],
                    columns=[2001, 2002])
data = pd.DataFrame({2001: [123, 543], 2002: [456, 321]}, 
                    index=['California', 'NewYork'])
data = data.stack()
data.unstack()

Unnamed: 0,2001,2002
California,123,456
NewYork,543,321


In [174]:
# Multiindex as another dimension
pop_df = pd.DataFrame({'Total': pop, 
                       'Under 18': [31200, 31300,
                                    31400, 31500,
                                    31600, 31700]})
f_u18 = pop_df['Under 18'] / pop_df['Total']
f_u18.unstack()

Unnamed: 0,2000,2001
California,0.010022,0.010054
New York,0.001001,0.001017
Texas,0.010057,0.010089


In [177]:
# Methods of constructing multiindex creation
df = pd.DataFrame(np.random.rand(4, 2),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=['data1', 'data2'])
df

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.278361,0.987233
a,2,0.096515,0.272009
b,1,0.281085,0.915196
b,2,0.213955,0.174926


In [197]:
# Dictionary with a tupples
data = {('California', 2010): [123, 120],
        ('California', 2011): [134, 130],
        ('Texas', 2010): [154, 150],
        ('Texas', 2011): [174, 170]}

data = pd.DataFrame(data, index=['Po1', 'Po2']).unstack()
data.index.names = ['State', 'Year', 'Pos']
data

State       Year  Pos
California  2010  Po1    123
                  Po2    120
            2011  Po1    134
                  Po2    130
Texas       2010  Po1    154
                  Po2    150
            2011  Po1    174
                  Po2    170
dtype: int64

In [201]:
# Multiindex example
index = pd.MultiIndex.from_product([[2012, 2013], [1, 2]], 
                                   names=['Year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob', 'Guide', 'Sue'], ['HR', 'Temp']], 
                                     names=['subject', 'Type'])
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37
health_data = pd.DataFrame(data, index=index, columns=columns)
health_data

Unnamed: 0_level_0,subject,Bob,Bob,Guide,Guide,Sue,Sue
Unnamed: 0_level_1,Type,HR,Temp,HR,Temp,HR,Temp
Year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2012,1,56.0,38.8,13.0,36.3,38.0,36.6
2012,2,41.0,36.7,40.0,36.3,33.0,37.2
2013,1,44.0,36.0,22.0,37.4,34.0,36.5
2013,2,37.0,37.2,51.0,38.4,26.0,37.5


In [219]:
health_data.loc[:, pd.IndexSlice[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guide,Sue
Unnamed: 0_level_1,Type,HR,HR,HR
Year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2012,1,56.0,13.0,38.0
2012,2,41.0,40.0,33.0
2013,1,44.0,22.0,34.0
2013,2,37.0,51.0,26.0


In [None]:
test
