### Pandas Idiom (Pandorable code)

In [12]:
import pandas as pd
import numpy as np
import timeit

# Read the census data from census.csv (path is relative to the working directory)
df = pd.read_csv('census.csv')
# Bare last expression so the notebook renders a preview of the first rows
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [13]:
# Let's query our dataframe using chained operations
# and see how long it takes to do its work
def first_approach():
    """Filter the module-level ``df`` with ``where`` + ``dropna``, then index and rename.

    Steps, in order:
      1. ``where`` keeps rows whose ``SUMLEV`` equals 50 and turns every other
         row into NaN.
      2. ``dropna`` discards every row that contains a NaN (the masked rows,
         plus any row that already had a missing value).
      3. The frame is indexed by ``STNAME`` and ``CTYNAME``.
      4. ``ESTIMATESBASE2010`` is renamed to ``Estimates Base 2010``.

    Returns:
        A new DataFrame; ``df`` itself is only read, never modified.
    """
    sumlev_is_50 = df["SUMLEV"] == 50
    masked = df.where(sumlev_is_50)
    surviving = masked.dropna()
    indexed = surviving.set_index(["STNAME", "CTYNAME"])
    return indexed.rename(columns={"ESTIMATESBASE2010": "Estimates Base 2010"})
# Re-read the CSV so the benchmark runs against a freshly loaded frame
df = pd.read_csv('census.csv')
# timeit returns the total seconds for all 10 calls, not a per-call average
timeit.timeit(first_approach, number=10)

0.4500243400000272

In [14]:
def second_approach():
    """Filter the module-level ``df`` with a boolean mask, then index and rename.

    Rows whose ``SUMLEV`` equals 50 are selected by boolean indexing, the
    result is indexed by ``STNAME`` and ``CTYNAME``, and ``ESTIMATESBASE2010``
    is renamed to ``Estimates Base 2010``.

    Unlike a ``where(...).dropna()`` pipeline, no ``dropna`` is applied here,
    so selected rows with missing values in other columns are kept.

    Returns:
        A new DataFrame; ``df`` itself is only read, never modified.
    """
    selected = df[df["SUMLEV"] == 50]
    # Do not use inplace=True here: ``selected`` is derived from ``df`` by
    # boolean indexing, and an in-place set_index on such a frame is exactly
    # the pattern that raises SettingWithCopyWarning on pandas without
    # Copy-on-Write. The non-inplace call returns the same result.
    indexed = selected.set_index(["STNAME", "CTYNAME"])
    return indexed.rename(columns={"ESTIMATESBASE2010": "Estimates Base 2010"})
# Re-read the CSV so the benchmark runs against a freshly loaded frame
df = pd.read_csv('census.csv')
# timeit returns the total seconds for all 10 calls, not a per-call average
timeit.timeit(second_approach, number=10)

0.09598739900002329

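The second approach is much faster! This is a particular example of the classic trade-off between speed and readability. (Note that the two approaches are not strictly equivalent: `where(...).dropna()` also discards any row that contains a missing value in any column, while the boolean-mask version keeps such rows.)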

---

Now we have a new task: we want to compute the min and max values for every row. We could do that with loops, but pandas has a great method named `apply` (similar to Python's `map`).

In [21]:
# Let's take a look
def min_max(row):
    """Summarise one row as its smallest and largest population estimate.

    Args:
        row: A Series containing the labels ``POPESTIMATE2010`` through
            ``POPESTIMATE2015``.

    Returns:
        A two-element Series with index ``["min", "max"]`` holding the
        minimum and maximum of those six entries.
    """
    estimate_labels = [
        "POPESTIMATE2010",
        "POPESTIMATE2011",
        "POPESTIMATE2012",
        "POPESTIMATE2013",
        "POPESTIMATE2014",
        "POPESTIMATE2015",
    ]
    estimates = row[estimate_labels]
    lowest = estimates.min()
    highest = estimates.max()
    return pd.Series({"min": lowest, "max": highest})
# axis="columns" hands each row to min_max; only the head of the result is displayed
df.apply(min_max, axis="columns").head()

Unnamed: 0,min,max
0,4785161,4858979
1,54660,55347
2,183193,203709
3,26489,27341
4,22512,22861


In [19]:
# We can also do this by adding the min and max values as new columns, like this.
def min_max(row):
    """Return a copy of ``row`` extended with ``min`` and ``max`` entries.

    Args:
        row: A Series containing the labels ``POPESTIMATE2010`` through
            ``POPESTIMATE2015``.

    Returns:
        A new Series with every original entry of ``row`` plus two extra
        labels, ``min`` and ``max``, holding the minimum and maximum of the
        six population-estimate entries. The ``row`` argument is left
        unmodified.
    """
    data = row[[
        "POPESTIMATE2010",
        "POPESTIMATE2011",
        "POPESTIMATE2012",
        "POPESTIMATE2013",
        "POPESTIMATE2014",
        "POPESTIMATE2015",
    ]]
    # Work on a copy: pandas documents that functions passed to
    # DataFrame.apply must not mutate the object they receive.
    extended = row.copy()
    extended["min"] = np.min(data)
    extended["max"] = np.max(data)
    return extended
# axis="columns" hands each row to min_max; only the head of the result is displayed
df.apply(min_max, axis="columns").head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,min,max
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594,4785161,4858979
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333,54660,55347
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499,183193,203709
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299,26489,27341
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861,22512,22861


In [23]:
# Same row-wise maximum, this time with an inline lambda instead of a named function.
# Note: despite its name, `rows` holds the POPESTIMATE2010..POPESTIMATE2015 column labels.
rows = [f"POPESTIMATE{year}" for year in range(2010, 2016)]
df.apply(lambda record: np.max(record[rows]), axis="columns")

0       4858979
1         55347
2        203709
3         27341
4         22861
         ...   
3188      45162
3189      23125
3190      21102
3191       8545
3192       7234
Length: 3193, dtype: int64

---

In [24]:
# Finally we want to divide the states into four categories: Northeast, Midwest, South, and West
def get_state_region(x):
    """Return the region label for a state name.

    Args:
        x: A state name such as ``"Alabama"``; matching is exact and
            case-sensitive.

    Returns:
        ``"Northeast"``, ``"Midwest"``, ``"South"`` or ``"West"`` when ``x``
        appears in the corresponding list below, otherwise ``None``.
        Previously every unrecognised value fell through to ``"West"``, and
        the ``west`` list was built but never checked.
    """
    states_by_region = {
        "Northeast": (
            'Connecticut', 'Maine', 'Massachusetts', 'New Hampshire',
            'Rhode Island', 'Vermont', 'New York', 'New Jersey', 'Pennsylvania',
        ),
        "Midwest": (
            'Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa',
            'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota',
            'South Dakota',
        ),
        "South": (
            'Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina',
            'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia',
            'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas',
            'Louisiana', 'Oklahoma', 'Texas',
        ),
        "West": (
            'Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah',
            'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington',
        ),
    }

    for region, states in states_by_region.items():
        if x in states:
            return region
    # Unknown name (territory, typo, empty string): report it as missing
    # instead of silently labelling it "West".
    return None

# Add a state_region column by passing every STNAME value through get_state_region
df['state_region'] = df['STNAME'].apply(get_state_region)

In [25]:
# Show each state name next to its derived state_region for the first rows
df[['STNAME','state_region']].head()

Unnamed: 0,STNAME,state_region
0,Alabama,South
1,Alabama,South
2,Alabama,South
3,Alabama,South
4,Alabama,South
