# <center><div style="font-family: Times New Roman; border-radius : 10px; background-color: #000000; color: #00DDDE; padding: 12px; line-height: 1;">Imports</div></center>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# <center><div style="font-family: Times New Roman; border-radius : 10px; background-color: #000000; color: #00DDDE; padding: 12px; line-height: 1;">Exploring Train & Census Data</div></center>

**IMPORTANT** - the census data has a 2-year lag because census data comes out 2 years after it has been collected. Thus, the 2017 census data should be used for the 2019 train data, the 2018 census data should be used for the 2020 train data, etc.

In [3]:
census = pd.read_csv('data/census_starter.csv')
train = pd.read_csv('data/train.csv', parse_dates=['first_day_of_month'])

In [4]:
census.head()

Unnamed: 0,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,pct_bb_2021,cfips,pct_college_2017,pct_college_2018,pct_college_2019,pct_college_2020,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,76.6,78.9,80.6,82.7,85.5,1001,14.5,15.9,16.1,16.7,...,1.3,1.1,0.7,0.6,1.1,55317,58786.0,58731,57982.0,62660.0
1,74.5,78.1,81.8,85.1,87.9,1003,20.4,20.7,21.0,20.2,...,1.4,1.3,1.4,1.0,1.3,52562,55962.0,58320,61756.0,64346.0
2,57.2,60.4,60.5,64.6,64.6,1005,7.6,7.8,7.6,7.3,...,0.5,0.3,0.8,1.1,0.8,33368,34186.0,32525,34990.0,36422.0
3,62.0,66.1,69.2,76.1,74.6,1007,8.1,7.6,6.5,7.4,...,1.2,1.4,1.6,1.7,2.1,43404,45340.0,47542,51721.0,54277.0
4,65.8,68.5,73.0,79.6,81.0,1009,8.7,8.1,8.6,8.9,...,1.3,1.4,0.9,1.1,0.9,47412,48695.0,49358,48922.0,52830.0


In [5]:
train.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243


Removing redundant information from the training dataset and separating the `first_day_of_month` column

In [6]:
train['year'] = train['first_day_of_month'].dt.year
train['month'] = train['first_day_of_month'].dt.month

train.drop(['row_id', 'county', 'state', 'first_day_of_month', 'active'], axis=1, inplace=True)

In [7]:
train.head()

Unnamed: 0,cfips,microbusiness_density,year,month
0,1001,3.007682,2019,8
1,1001,2.88487,2019,9
2,1001,3.055843,2019,10
3,1001,2.993233,2019,11
4,1001,2.993233,2019,12


Let's actually start working on combining the two datasets by matching the data and the years the data was collected during

In [8]:
census.head()

Unnamed: 0,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,pct_bb_2021,cfips,pct_college_2017,pct_college_2018,pct_college_2019,pct_college_2020,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,76.6,78.9,80.6,82.7,85.5,1001,14.5,15.9,16.1,16.7,...,1.3,1.1,0.7,0.6,1.1,55317,58786.0,58731,57982.0,62660.0
1,74.5,78.1,81.8,85.1,87.9,1003,20.4,20.7,21.0,20.2,...,1.4,1.3,1.4,1.0,1.3,52562,55962.0,58320,61756.0,64346.0
2,57.2,60.4,60.5,64.6,64.6,1005,7.6,7.8,7.6,7.3,...,0.5,0.3,0.8,1.1,0.8,33368,34186.0,32525,34990.0,36422.0
3,62.0,66.1,69.2,76.1,74.6,1007,8.1,7.6,6.5,7.4,...,1.2,1.4,1.6,1.7,2.1,43404,45340.0,47542,51721.0,54277.0
4,65.8,68.5,73.0,79.6,81.0,1009,8.7,8.1,8.6,8.9,...,1.3,1.4,0.9,1.1,0.9,47412,48695.0,49358,48922.0,52830.0


In [48]:
def getStats(cfip, year):
  '''
  The function uses a `cfip` ID for a county and the `year` to be able to 
  find census statistics. 
  '''
  dif = year - 2019 # accounting for the 2-year lag
  index = [dif, dif + 6, dif + 11, dif + 16, dif + 21]
  return census[census['cfips'] == cfip].iloc[0][index]

In [56]:
import time
copy_train = train.copy()

# Adding the specific columns to train dataset
copy_train['pct_broadband_access'] = 0
copy_train['pct_college_degree'] = 0
copy_train['pct_foreign_born'] = 0
copy_train['pct_it_workers'] = 0
copy_train['median_household_income'] = 0

start = time.time()

# Running through all the data rows and imputing census data based on cfip ID and year
for i in range(len(copy_train)):
  cfip = copy_train['cfips'][i]
  year = copy_train['year'][i]
  stats = getStats(cfip, year)
    
  copy_train['pct_broadband_access'].iloc[i] = stats[0]
  copy_train['pct_college_degree'].iloc[i] = stats[1]
  copy_train['pct_foreign_born'].iloc[i] = stats[2]
  copy_train['pct_it_workers'].iloc[i] = stats[3]
  copy_train['median_household_income'].iloc[i] = stats[4]
  
  if (i % 10000 == 0):
    end = time.time()
    print(f'#{i} row processed in {(end - start):.4f} seconds')
    start = time.time()
    
print('Processing Completed!')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#0 row processed in 0.0103 seconds
#10000 row processed in 21.5451 seconds
#20000 row processed in 20.5724 seconds
#30000 row processed in 23.9757 seconds
#40000 row processed in 86.0192 seconds
#50000 row processed in 19.3896 seconds
#60000 row processed in 19.2440 seconds
#70000 row processed in 19.8399 seconds
#80000 row processed in 17.8517 seconds
#90000 row processed in 17.7706 seconds
#100000 row processed in 19.4139 seconds
#110000 row processed in 17.3618 seconds
#120000 row processed in 17.4652 seconds
Processing Completed!


In [62]:
copy_train[0:30]

Unnamed: 0,cfips,microbusiness_density,year,month,pct_broadband_access,pct_college_degree,pct_foreign_born,pct_it_workers,median_household_income
0,1001,3.007682,2019,8,76.6,14.5,2.1,1.3,55317.0
1,1001,2.88487,2019,9,76.6,14.5,2.1,1.3,55317.0
2,1001,3.055843,2019,10,76.6,14.5,2.1,1.3,55317.0
3,1001,2.993233,2019,11,76.6,14.5,2.1,1.3,55317.0
4,1001,2.993233,2019,12,76.6,14.5,2.1,1.3,55317.0
5,1001,2.96909,2020,1,78.9,15.9,2.0,1.1,58786.0
6,1001,2.909326,2020,2,78.9,15.9,2.0,1.1,58786.0
7,1001,2.933231,2020,3,78.9,15.9,2.0,1.1,58786.0
8,1001,3.000167,2020,4,78.9,15.9,2.0,1.1,58786.0
9,1001,3.004948,2020,5,78.9,15.9,2.0,1.1,58786.0
