In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv

# Join and Merge on datasets

In [5]:
pop = pd.read_csv('state-population.csv')
pop.head()

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [18]:
pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2544 entries, 0 to 2543
Data columns (total 4 columns):
state/region    2544 non-null object
ages            2544 non-null object
year            2544 non-null int64
population      2524 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 79.6+ KB


In [8]:
pop.shape

(2544, 4)

In [23]:
abbrevs = pd.read_csv('state-abbrevs.csv')
abbrevs.head()


Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [24]:
abbrevs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
state           51 non-null object
abbreviation    51 non-null object
dtypes: object(2)
memory usage: 944.0+ bytes


In [14]:
pop['ages'].unique()

array(['under18', 'total'], dtype=object)

In [15]:
len(pop['state/region'].unique())

53

In [16]:
# pop[pop['state/region']=='AZ']['year'].max()
# len(pop['state/region'].unique())
pop.groupby('state/region')['year'].max()

state/region
AK     2013
AL     2013
AR     2013
AZ     2013
CA     2013
CO     2013
CT     2013
DC     2013
DE     2013
FL     2013
GA     2013
HI     2013
IA     2013
ID     2013
IL     2013
IN     2013
KS     2013
KY     2013
LA     2013
MA     2013
MD     2013
ME     2013
MI     2013
MN     2013
MO     2013
MS     2013
MT     2013
NC     2013
ND     2013
NE     2013
NH     2013
NJ     2013
NM     2013
NV     2013
NY     2013
OH     2013
OK     2013
OR     2013
PA     2013
PR     2013
RI     2013
SC     2013
SD     2013
TN     2013
TX     2013
USA    2013
UT     2013
VA     2013
VT     2013
WA     2013
WI     2013
WV     2013
WY     2013
Name: year, dtype: int64

In [17]:
# pop[pop.population.isnull()].head(20)

pop[pop.population.isnull()].head(20)

Unnamed: 0,state/region,ages,year,population
2448,PR,under18,1990,
2449,PR,total,1990,
2450,PR,total,1991,
2451,PR,under18,1991,
2452,PR,total,1993,
2453,PR,under18,1993,
2454,PR,under18,1992,
2455,PR,total,1992,
2456,PR,under18,1994,
2457,PR,total,1994,


In [None]:
areas = pd.read_csv('state-areas.csv')
areas.head()

In [None]:
len(areas['state'].unique())

In [25]:
abbrevs = pd.read_csv('state-abbrevs.csv')
abbrevs.head()

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [None]:
len(abbrevs['state'].unique())

# Task 01: Explore the datasets
1) Number of columns and data points in each dataset

2) Null values, if any, in each dataset

3) Data types of each column

4) How many years of data are in the population dataset for each state?

5) How many unique states are there in each dataframe?


In [27]:
pop.describe()

Unnamed: 0,year,population
count,2544.0,2524.0
mean,2001.5,6805558.0
std,6.923547,28550140.0
min,1990.0,101309.0
25%,1995.75,742380.5
50%,2001.5,1597005.0
75%,2007.25,4547104.0
max,2013.0,316128800.0


In [33]:
pop[pop['state/region']=='AL']['year'].unique()

array([2012, 2010, 2011, 2009, 2013, 2007, 2008, 2005, 2006, 2004, 2003,
       2001, 2002, 1999, 2000, 1998, 1997, 1996, 1995, 1994, 1993, 1992,
       1991, 1990], dtype=int64)

In [34]:
len(pop[pop['state/region']=='AL']['year'].unique())

24

# Now Merge population and abbreviation dataframes


In [35]:
merged = pd.merge(pop, abbrevs, how='outer', left_on='state/region', right_on='abbreviation')
merged.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AL,under18,2012,1117489.0,Alabama,AL
1,AL,total,2012,4817528.0,Alabama,AL
2,AL,under18,2010,1130966.0,Alabama,AL
3,AL,total,2010,4785570.0,Alabama,AL
4,AL,under18,2011,1125763.0,Alabama,AL


In [36]:
merged = merged.drop('abbreviation', axis=1) # drop duplicate info
merged.head()

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


In [37]:
merged.tail()

Unnamed: 0,state/region,ages,year,population,state
2539,USA,total,2010,309326295.0,
2540,USA,under18,2011,73902222.0,
2541,USA,total,2011,311582564.0,
2542,USA,under18,2012,73708179.0,
2543,USA,total,2012,313873685.0,


In [38]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 5 columns):
state/region    2544 non-null object
ages            2544 non-null object
year            2544 non-null int64
population      2524 non-null float64
state           2448 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 119.2+ KB


In [40]:
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

### check if there are any null entries

In [41]:
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

### There are null values in a few columns; let us extract the data points with null values

In [42]:
merged[merged['population'].isnull()].head()

Unnamed: 0,state/region,ages,year,population,state
2448,PR,under18,1990,,
2449,PR,total,1990,,
2450,PR,total,1991,,
2451,PR,under18,1991,,
2452,PR,total,1993,,


In [44]:
merged.loc[merged['state'].isnull(), 'state/region'].unique()

array(['PR', 'USA'], dtype=object)

### Replace the null entries

In [46]:
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'
merged.isnull().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

### Now the task is to merge the merged dataframe with areas dataframe!

In [None]:
merged.head()

In [None]:
areas.head()

In [None]:
len(merged.state.unique()), merged.state.unique()

In [None]:
len(areas.state.unique()), areas.state.unique()

In [None]:
# Left join: keep every row of `merged` and attach the matching area.
# A right join would silently discard any region that has no entry in `areas`
# (presumably the 'United States' aggregate -- confirm with the null check in
# the following cells), which makes that null-area check vacuous. With a left
# join such rows surface as NaN in 'area (sq. mi)' and are removed explicitly
# by the later dropna().
final = pd.merge(merged, areas, on='state', how='left')
final.describe()

In [None]:
final.isnull().any()

In [None]:
final['state'][final['area (sq. mi)'].isnull()].unique()

In [None]:
# Drop every row that still has a missing value in any column.
# Rebinding (instead of inplace=True) keeps the cell free of hidden in-place
# mutation and lets the call be chained.
final = final.dropna()
final.head()

# Task#02: Apply all the groupby methods to the final dataframe which we generated in today's class. Please also share the insights you extracted from the analysis.


### Writing queries for the merged dataset

In [None]:
# Two snapshots of the year 2010: whole-population rows and under-18 rows.
# Both selections use the same query() form so they read symmetrically.
data2010 = final.query("year == 2010 and ages == 'total'")
data20101 = final.query("year == 2010 and ages == 'under18'")
data2010.head()

In [None]:
data20101.describe()

In [None]:
data2010.describe()

### Creating a new column/feature by combining the existing features!

In [None]:
# Index the 2010 snapshot by state so the density Series is labelled by state.
# The guard makes the cell safe to re-run: once 'state' has become the index it
# is no longer a column, and an unconditional set_index('state') would raise
# KeyError on the second execution.
if data2010.index.name != 'state':
    data2010 = data2010.set_index('state')

# Population density = total population / area in square miles.
density = data2010['population'] / data2010['area (sq. mi)']

In [None]:
# Order from the densest entry to the sparsest. Rebinding rather than using
# inplace=True avoids mutating the Series computed in the previous cell.
density = density.sort_values(ascending=False)
density.head()

In [None]:
density.tail()

# Grouping and aggregation

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
import seaborn as sns
# Load seaborn's 'planets' example dataset and preview its first 10 rows.
planets=sns.load_dataset('planets')
planets.head(10)

In [None]:
planets.info()

In [None]:
planets.method.unique()

# Simple grouping returns a GroupBy object; aggregating one of its columns returns a Series

In [None]:
planets.groupby('method')['orbital_period'].std()

In [None]:
planets.describe()

# Aggregation
Apply different mathematical operations simultaneously on each column for different groups of data based on a key column

In [None]:
# Per-method min, max and sum of every column.
# All aggregations are given by name: passing NumPy callables such as np.sum to
# aggregate() is deprecated in recent pandas releases, while the string 'sum'
# produces the same column label and the same result.
planets.groupby('method').aggregate(['min', 'max', 'sum'])
# planets.groupby('method').describe()

### You can also define a different mathematical operation for each column by using a dictionary!

In [None]:
# A different aggregation per column, selected through a dict.
# String names replace the builtin min/max and np.median callables, whose use
# inside aggregate() is deprecated in recent pandas; labels and values are
# unchanged.
planets.groupby('method').aggregate({
    'number': 'min',
    'orbital_period': 'median',
    'mass': ['max', 'min'],
})

# Filtering
A filtering operation allows you to drop data based on the group properties.

In [None]:
def filter_func(x, threshold=1):
    """Group predicate for ``GroupBy.filter``: keep groups with a wide orbital-period spread.

    Parameters
    ----------
    x : DataFrame
        One group; must contain an ``'orbital_period'`` column.
    threshold : float, default 1
        The group is kept only if the standard deviation of
        ``'orbital_period'`` is strictly greater than this value. A different
        value can be supplied as ``groupby(...).filter(filter_func, threshold=...)``.

    Returns
    -------
    bool
        True when the group should be kept.
    """
    spread = x['orbital_period'].std()
    # A group with fewer than two valid values has a NaN standard deviation;
    # NaN > threshold is False, so such groups are dropped.
    return spread > threshold
planets.groupby('method').std()

In [None]:
temp=planets.groupby('method').filter(filter_func)
temp.head()

In [None]:
temp.method.unique()

# Transformation
While aggregation must return a reduced version of the data, transformation can return some transformed version of the full data to recombine.

In [None]:
planets.head()

In [None]:
planets.groupby('method').mean()

In [None]:
planets.groupby('method').transform(lambda x:(x-x.mean())/x.std())

# Apply 
The apply() method lets you apply an arbitrary function to the group results. The function should take a DataFrame, and return either a Pandas object (e.g., DataFrame, Series) or a scalar; the combine operation will be tailored to the type of output.

In [None]:
def norm_by_data2(x):
    """Scale a group's ``'orbital_period'`` by the group's total ``'distance'``.

    Parameters
    ----------
    x : DataFrame
        One group; must contain ``'orbital_period'`` and ``'distance'`` columns.

    Returns
    -------
    DataFrame
        A new frame with the same columns as ``x`` in which
        ``'orbital_period'`` has been divided by ``x['distance'].sum()``.

    Notes
    -----
    The input group is left untouched: functions passed to ``GroupBy.apply``
    should not mutate the object they receive, so the result is built with
    ``assign`` rather than an in-place ``/=``.
    """
    total_distance = x['distance'].sum()
    return x.assign(orbital_period=x['orbital_period'] / total_distance)
planets.groupby('method').apply(norm_by_data2)

In [None]:
planets.groupby('method').sum()

In [None]:
planets.head()

# Task#02: Apply all the groupby methods to the final dataframe which we generated in today's class. Please also share the insights you extracted from the analysis.
