# Pandas

https://pandas.pydata.org/

10 Minutes to pandas

http://pandas.pydata.org/pandas-docs/stable/10min.html

Tutorials

http://pandas.pydata.org/pandas-docs/stable/tutorials.html

In [6]:
import numpy as np
import pandas as pd

**Series**

In [12]:
# Series constructor signature: pd.Series(data=[...], index=[...])
# No index is passed here, so the default integer labels 0..n-1 are used.
ser1 = pd.Series(data=[10, 20, 30])
ser1

0    10
1    20
2    30
dtype: int64

In [13]:
# Replace the labels of the existing Series with string labels (values are untouched).
ser1.index = pd.Index(['a', 'b', 'c'])
ser1

a    10
b    20
c    30
dtype: int64

In [14]:
# Building a Series from a dict: the keys become the index labels.
ser2 = pd.Series(data={'a': 100, 'b': 200, 'c': 300})
ser2

a    100
b    200
c    300
dtype: int64

In [24]:
# Explicit labels: each value is paired with the label at the same position.
ser3 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Germany', 'France', 'Japan'],
)
ser3

USA        1
Germany    2
France     3
Japan      4
dtype: int64

In [25]:
# Same shape as ser3, but with 'Italy' in place of 'France' so the labels only partly overlap.
ser4 = pd.Series(
    data=[1, 2, 5, 4],
    index=['USA', 'Germany', 'Italy', 'Japan'],
)
ser4

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [27]:
# Addition aligns on index labels; labels present in only one Series
# (France, Italy) come out as NaN and the result is upcast to float.
ser3.add(ser4)

France     NaN
Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
dtype: float64

**DataFrame**

In [206]:
# DataFrame constructor signature: pd.DataFrame(data=[...], index=[...], columns=[...])
# Here the frame is read from disk instead; the CSV's unnamed first column is used as the row index.
df = pd.read_csv(
    'gapminder.csv',
    index_col='Unnamed: 0',
)
df.head(5)

Unnamed: 0,year,continent,country,income,life_exp,population
0,2014,asia,Philippines,6598.0,70.7,100102249.0
1,2014,americas,Paraguay,8038.0,74.3,6552584.0
2,2014,asia,Palau,14078.0,,21094.0
3,2014,asia,Pakistan,4619.0,65.6,185546257.0
4,2014,americas,St.-Pierre-et-Miquelon,,,6277.0


In [118]:
# Selecting one column gives a Series; show its first five entries.
df['income'].head(5)

0     6598.0
1     8038.0
2    14078.0
3     4619.0
4        NaN
Name: income, dtype: float64

In [119]:
# Derive a new column as the element-wise product of two existing columns.
# A row with NaN in either input (e.g. row 4) gets NaN in the result.
df['gross_income'] = df['income'].mul(df['population'])
df.head(5)

Unnamed: 0,year,continent,country,income,life_exp,population,gross_income
0,2014,asia,Philippines,6598.0,70.7,100102249.0,660474600000.0
1,2014,americas,Paraguay,8038.0,74.3,6552584.0,52669670000.0
2,2014,asia,Palau,14078.0,,21094.0,296961300.0
3,2014,asia,Pakistan,4619.0,65.6,185546257.0,857038200000.0
4,2014,americas,St.-Pierre-et-Miquelon,,,6277.0,


In [120]:
# Remove the derived column again. axis=1 targets columns (axis=0 would target rows).
# drop() returns a new frame, so rebind `df` to it rather than using inplace=True:
# inplace offers no performance benefit and prevents method chaining.
df = df.drop(labels='gross_income', axis=1)
df.head()

Unnamed: 0,year,continent,country,income,life_exp,population
0,2014,asia,Philippines,6598.0,70.7,100102249.0
1,2014,americas,Paraguay,8038.0,74.3,6552584.0
2,2014,asia,Palau,14078.0,,21094.0
3,2014,asia,Pakistan,4619.0,65.6,185546257.0
4,2014,americas,St.-Pierre-et-Miquelon,,,6277.0


Selecting rows and columns

In [121]:
# Label-based selection: the row whose index label is 3, with all columns, returned as a Series.
df.loc[3, :]

year                 2014
continent            asia
country          Pakistan
income               4619
life_exp             65.6
population    1.85546e+08
Name: 3, dtype: object

In [122]:
# Position-based selection (.iloc) shown on a small frame with string labels.
from numpy.random import randn

np.random.seed(101)  # fixed seed so the random values are repeatable
sample_df = pd.DataFrame(
    data=randn(5, 4),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['w', 'x', 'y', 'z'],
)
sample_df.iloc[2]   # third row by position; same row as sample_df.loc['c']

w   -2.018168
x    0.740122
y    0.528813
z   -0.589001
Name: c, dtype: float64

In [123]:
# Single cell lookup by (row label, column label).
df.at[5, 'country']

'Brazil'

In [128]:
# Lists of labels on both axes select a sub-frame: three rows by two columns.
df.loc[
    [10, 100, 1000],
    ['continent', 'country'],
]

Unnamed: 0,continent,country
10,asia,Papua New Guinea
100,africa,Namibia
1000,africa,Cape Verde


Conditional Selection

In [156]:
# Boolean-mask filtering: keep only rows whose income exceeds 50000.
# Rows with missing income compare as False and are dropped.
df.loc[df['income'].gt(50000)]

Unnamed: 0,year,continent,country,income,life_exp,population
6,2014,europe,Norway,64020.0,82.0,5140311.0
27,2014,asia,"Macao, China",142893.0,80.61,588781.0
53,2014,asia,"Hong Kong, China",52552.0,83.56,7194563.0
56,2014,europe,Luxembourg,88203.0,82.1,556316.0
77,2014,asia,Kuwait,83394.0,80.2,3782450.0


In [157]:
# Combine several conditions element-wise with & (and) or | (or);
# Python's `and` / `or` keywords do not work on boolean Series.
df.loc[df['income'].gt(50000) & df['life_exp'].gt(80)]

Unnamed: 0,year,continent,country,income,life_exp,population
6,2014,europe,Norway,64020.0,82.0,5140311.0
27,2014,asia,"Macao, China",142893.0,80.61,588781.0
53,2014,asia,"Hong Kong, China",52552.0,83.56,7194563.0
56,2014,europe,Luxembourg,88203.0,82.1,556316.0
77,2014,asia,Kuwait,83394.0,80.2,3782450.0


In [None]:
# Setting and resetting indices: set_index() / reset_index() return a new DataFrame; df itself is not modified

In [211]:
# Use the 'year' column as the row index. The result is only displayed, not assigned,
# so df keeps its original index.
df.set_index(keys='year')

Unnamed: 0_level_0,continent,country,income,life_exp,population
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014,asia,Philippines,6598.0,70.7,100102249.0
2014,americas,Paraguay,8038.0,74.3,6552584.0
2014,asia,Palau,14078.0,,21094.0
2014,asia,Pakistan,4619.0,65.6,185546257.0
2014,americas,St.-Pierre-et-Miquelon,,,6277.0


In [155]:
# Drop every row containing a missing value, then renumber the rows from 0.
# reset_index() keeps the old labels in a new 'index' column.
(
    df
    .dropna()
    .reset_index()
)

Unnamed: 0,index,year,continent,country,income,life_exp,population
0,0,2014,asia,Philippines,6598.0,70.7,100102249.0
1,1,2014,americas,Paraguay,8038.0,74.3,6552584.0
2,3,2014,asia,Pakistan,4619.0,65.6,185546257.0
3,5,2014,americas,Brazil,15412.0,74.3,204213133.0
4,6,2014,europe,Norway,64020.0,82.0,5140311.0


Missing Data

In [162]:
# Replace missing incomes with 0 in the returned Series; df itself is left unchanged.
df['income'].fillna(0)

0     6598.0
1     8038.0
2    14078.0
3     4619.0
4        0.0
Name: income, dtype: float64

Groupby

In [169]:
# Group the rows by year. The result is a GroupBy object, not a DataFrame;
# no aggregation is computed until a method such as mean() is called on it.
by_year = df.groupby(by='year')
by_year

<pandas.core.groupby.DataFrameGroupBy object at 0x0E5F07D0>

In [173]:
# Per-year mean of each numeric column, showing the last ten years.
# numeric_only=True is required on pandas >= 2.0, where mean() otherwise raises
# TypeError on the string columns (continent, country) instead of skipping them.
by_year.mean(numeric_only=True).tail(10)

Unnamed: 0_level_0,income,life_exp,population
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005,16677.280788,69.524019,28116360.0
2006,17291.788177,69.85019,28450330.0
2007,17871.660099,70.139712,28788120.0
2008,17965.694581,70.447163,29129170.0
2009,17202.374384,70.76774,29126010.0
2010,17559.950739,70.969904,29483600.0
2011,18019.333333,71.324375,30488710.0
2012,18127.674877,71.663077,30857230.0
2013,18305.502463,71.916106,31226100.0
2014,18628.310345,72.088125,31593990.0


In [176]:
# Summary statistics (count, mean, std, quartiles, min/max) per year for each
# numeric column; show the last five years.
by_year.describe().tail(5)

Unnamed: 0_level_0,income,income,income,income,income,income,income,income,life_exp,life_exp,life_exp,life_exp,life_exp,population,population,population,population,population,population,population,population
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010,203.0,17559.950739,19685.759131,614.0,3394.0,10515.0,24702.0,127984.0,208.0,70.969904,...,77.65,84.7,236.0,29483600.0,123694900.0,50.0,363591.25,4586104.0,17737639.25,1359755000.0
2011,204.0,18019.333333,20420.280357,614.0,3508.75,11049.0,25232.0,133734.0,208.0,71.324375,...,77.825,84.7,231.0,30488710.0,126114700.0,796.0,434907.0,5174061.0,20303992.0,1367480000.0
2012,203.0,18127.674877,20504.649685,616.0,3675.0,11046.0,24876.0,130990.0,208.0,71.663077,...,78.125,84.7,231.0,30857230.0,127259600.0,804.0,436195.5,5267839.0,20295978.0,1375199000.0
2013,203.0,18305.502463,20782.622473,584.0,3788.0,11405.0,25029.5,136540.0,208.0,71.916106,...,78.3,84.8,231.0,31226100.0,128387700.0,801.0,437292.0,5360837.0,19938671.0,1382793000.0
2014,203.0,18628.310345,21262.409355,578.0,3796.5,11514.0,25786.5,142893.0,208.0,72.088125,...,78.4,84.8,231.0,31593990.0,129497000.0,800.0,438227.0,5448342.0,19587913.0,1390110000.0


In [223]:
# Per-year summary statistics for income only.
# Select the column before describing so the statistics are computed for one
# column, instead of describing every column and discarding all but 'income'.
by_year['income'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1800,201.0,946.288557,502.228769,350.0,608.0,850.0,1097.0,4235.0
1801,201.0,946.661692,500.413744,350.0,608.0,852.0,1101.0,4161.0
1802,201.0,949.452736,510.208133,350.0,608.0,853.0,1105.0,4391.0
1803,201.0,949.19403,505.071586,350.0,609.0,854.0,1110.0,4297.0
1804,201.0,950.751244,512.245476,350.0,609.0,854.0,1114.0,4502.0


Multi-Index and Index Hierarchy

In [193]:
# Build a two-level index from (year, continent) pairs: `outside` supplies the
# outer level and `inside` the inner level.
outside = df['year']
inside = df['continent']
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [195]:
# Display the MultiIndex built from the (year, continent) pairs.
hier_index

MultiIndex(levels=[[1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819, 1820, 1821, 1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 19

In [214]:
# One filtered frame per continent, built with the same boolean mask pattern.
asia_df, amer_df, eur_df, afr_df = (
    df[df['continent'] == continent]
    for continent in ('asia', 'americas', 'europe', 'africa')
)

In [217]:
# Per-year mean of each numeric column for the Asian countries only.
# numeric_only=True is required on pandas >= 2.0, where mean() otherwise raises
# TypeError on the string columns (continent, country) instead of skipping them.
asia_df.groupby(by='year').mean(numeric_only=True)

Unnamed: 0_level_0,income,life_exp,population
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1800,853.183333,29.224576,8.571324e+06
1801,853.716667,29.224407,8.613720e+06
1802,854.366667,29.216780,8.656434e+06
1803,854.883333,29.199661,8.699470e+06
1804,855.450000,29.191017,8.742830e+06
1805,855.983333,29.223729,8.786517e+06
1806,856.566667,29.223559,8.830532e+06
1807,857.033333,29.223390,8.874880e+06
1808,857.666667,29.223220,8.919563e+06
1809,858.200000,29.223051,8.964583e+06
