# Data exploration and assessment

## Airport data

In [4]:
import pandas as pd

airports_file = 'airport-codes_csv.csv'
airports_df = pd.read_csv(airports_file)

print(airports_df.shape)

(55075, 12)


In [7]:
# pandas interprets continent = NA (North America) as nulls
# we manually add these back in for this stage, but note that it will not be necessary in the ETL pipeline as PySpark 
# does not make the same mistake
airports_df.loc[airports_df['continent'].isna(), 'continent'] = 'NA'
airports_df.isna().sum()/airports_df.shape[0]

ident           0.000000
type            0.000000
name            0.000000
elevation_ft    0.127208
continent       0.000000
iso_country     0.004485
iso_region      0.000000
municipality    0.103059
gps_code        0.255016
iata_code       0.833155
local_code      0.479147
coordinates     0.000000
dtype: float64

#### Around 83% of entries have no iata_code value. This will be an intesive task to update without providing much value. We will drop this column. In addition, we will not need to include gps_code or local_code, so these will also be removed. 

In [8]:
# the iso_region column contains the US state code
# we will have to manuualy extract the two-letter code from the column
airports_df = airports_df.replace({'US-': ''}, regex=True)
airports_df.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,AR,Newport,,,,"-91.254898, 35.6087"


## US cities demographics data

In [9]:
demographics_file = 'us-cities-demographics.csv'
demo_df = pd.read_csv(demographics_file, delimiter=';')

print(demo_df.shape)

(2891, 12)


#### There are only 16 rows with any nulls, covering 8 cities, all of which are for relatively small. We disclude these. 

In [13]:
demo_df[demo_df.isna().any(axis=1)]

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
111,San Juan,Puerto Rico,41.4,155408.0,186829.0,342237,,,,PR,Hispanic or Latino,335559
155,Caguas,Puerto Rico,40.4,34743.0,42265.0,77008,,,,PR,Hispanic or Latino,76349
258,Carolina,Puerto Rico,42.0,64758.0,77308.0,142066,,,,PR,American Indian and Alaska Native,12143
333,The Villages,Florida,70.5,,,72590,15231.0,4034.0,,FL,Hispanic or Latino,1066
449,The Villages,Florida,70.5,,,72590,15231.0,4034.0,,FL,Black or African-American,331
637,Carolina,Puerto Rico,42.0,64758.0,77308.0,142066,,,,PR,Hispanic or Latino,139967
1437,The Villages,Florida,70.5,,,72590,15231.0,4034.0,,FL,White,72211
1747,San Juan,Puerto Rico,41.4,155408.0,186829.0,342237,,,,PR,American Indian and Alaska Native,4031
1748,Mayagüez,Puerto Rico,38.1,30799.0,35782.0,66581,,,,PR,Asian,235
1995,Ponce,Puerto Rico,40.5,56968.0,64615.0,121583,,,,PR,Hispanic or Latino,120705


#### Each city has counts for between 1 and 5 races, which creates duplicate rows. For around 79% of the rows, all but two of the columns (Race and Count) are redundant. We'll create a new table which contains the race data, and remove it from the demographics table to reduce storage space required. 

In [14]:
min_races = min(demo_df.groupby(['City', 'State']).size())
max_races = max(demo_df.groupby(['City', 'State']).size())
print(min_races, max_races)

1 5


In [15]:
# Proportion of duplicated rows when Race and Count columns are removed
1-demo_df[['City', 'State']].drop_duplicates().shape[0]/demo_df.shape[0]

0.7938429609131789

## Temperature data

In [16]:
temperature_file = '../../data2/GlobalLandTemperaturesByCity.csv'
temp_df = pd.read_csv(temperature_file)

temp_df.head()

           dt  AverageTemperature  AverageTemperatureUncertainty   City  \
0  1743-11-01               6.068                          1.737  Århus   
1  1743-12-01                 NaN                            NaN  Århus   
2  1744-01-01                 NaN                            NaN  Århus   
3  1744-02-01                 NaN                            NaN  Århus   
4  1744-03-01                 NaN                            NaN  Århus   

   Country Latitude Longitude  
0  Denmark   57.05N    10.33E  
1  Denmark   57.05N    10.33E  
2  Denmark   57.05N    10.33E  
3  Denmark   57.05N    10.33E  
4  Denmark   57.05N    10.33E  
               dt  AverageTemperature  AverageTemperatureUncertainty     City  \
47555  1820-01-01               2.101                          3.217  Abilene   
47556  1820-02-01               6.926                          2.853  Abilene   
47557  1820-03-01              10.767                          2.395  Abilene   
47558  1820-04-01              17

In [17]:
print(min(temp_df['dt']), max(temp_df['dt']))

1743-11-01 2013-09-01


#### The temperature data we have covers 1743 to 2013, while the immigration dataset covers 2016 only. As there is no date overlap between these two, we cannot join on date. We instead take the latest 10 year's of temperature for each city going forward, averaged for each month. 

In [5]:
temp_df['dt'] = pd.to_datetime(temp_df['dt'], format='%Y/%m/%d')

In [6]:
temp_10_years_df = temp_df[temp_df['dt'] >= max(temp_df['dt']) - pd.offsets.DateOffset(years=10)]

In [7]:
temp_10_years_df['month'] = pd.DatetimeIndex(temp_10_years_df['dt']).month

temp_10_years_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude,month
3118,2003-09-01,14.425,0.172,Århus,Denmark,57.05N,10.33E,9
3119,2003-10-01,6.621,0.467,Århus,Denmark,57.05N,10.33E,10
3120,2003-11-01,6.016,0.295,Århus,Denmark,57.05N,10.33E,11
3121,2003-12-01,3.697,0.284,Århus,Denmark,57.05N,10.33E,12
3122,2004-01-01,-0.66,0.204,Århus,Denmark,57.05N,10.33E,1


In [8]:
temp_10_years_df.groupby(['City','Country','Latitude','Longitude','month']).mean().reset_index()

Unnamed: 0,City,Country,Latitude,Longitude,month,AverageTemperature,AverageTemperatureUncertainty
0,A Coruña,Spain,42.59N,8.73W,1,9.1581,0.3537
1,A Coruña,Spain,42.59N,8.73W,2,9.2842,0.3760
2,A Coruña,Spain,42.59N,8.73W,3,11.2129,0.3913
3,A Coruña,Spain,42.59N,8.73W,4,12.6073,0.4009
4,A Coruña,Spain,42.59N,8.73W,5,14.8401,0.3503
5,A Coruña,Spain,42.59N,8.73W,6,17.8176,0.4524
6,A Coruña,Spain,42.59N,8.73W,7,19.1891,0.6247
7,A Coruña,Spain,42.59N,8.73W,8,19.5752,0.4898
8,A Coruña,Spain,42.59N,8.73W,9,18.4079,0.4114
9,A Coruña,Spain,42.59N,8.73W,10,15.2289,0.4015


## Immigration data

In [17]:
fname = '../../data/18-83510-I94-Data-2016/i94_jan16_sub.sas7bdat'
df = pd.read_sas(fname, 'sas7bdat', encoding="ISO-8859-1")

df.head()

(2847924, 28) (3742, 28)


#### Some columns in the immigrant data table are mostly (>95%) nulls, including 'occup', 'entdepu', and 'insnum'. We will disclude these

In [3]:
df.isna().sum()/df.shape[0]*100

cicid        0.000000
i94yr        0.000000
i94mon       0.000000
i94cit       0.000000
i94res       0.000000
i94port      0.000000
arrdate      0.000000
i94mode      0.004941
i94addr      5.171279
depdate      4.414819
i94bir       0.028001
i94visa      0.000000
count        0.000000
dtadfile     0.000063
visapost    60.509611
occup       99.556836
entdepa      0.004878
entdepd      4.333889
entdepu     99.988122
matflag      4.333889
biryear      0.028001
dtaddto      0.018340
gender      11.824817
insnum      95.661360
airline      3.217317
admnum       0.000000
fltno        0.388841
visatype     0.000000
dtype: float64

In [5]:
df.drop(['occup', 'entdepu', 'insnum'], axis=1)

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,...,entdepa,entdepd,matflag,biryear,dtaddto,gender,airline,admnum,fltno,visatype
0,2.0,2016.0,3.0,213.0,213.0,XXX,20543.0,,,,...,T,,,1952.0,09292016,,,1.140200e+09,,B2
1,3.0,2016.0,3.0,245.0,245.0,XXX,20524.0,,,,...,T,,,1956.0,09102016,,,1.697295e+09,,B2
2,7.0,2016.0,3.0,260.0,260.0,SFR,20514.0,1.0,CA,20640.0,...,G,O,M,1988.0,08312016,F,PR,7.391007e+10,00104,B2
3,9.0,2016.0,3.0,691.0,691.0,FTL,20533.0,1.0,FL,,...,G,,,1934.0,D/S,M,*GA,8.539319e+10,N900J,F1
4,11.0,2016.0,3.0,258.0,258.0,XXX,20514.0,1.0,NY,20533.0,...,G,O,M,1980.0,08282016,M,AA,8.937570e+10,45,B1
5,12.0,2016.0,3.0,101.0,101.0,NYC,20514.0,1.0,NY,20598.0,...,O,I,M,1979.0,08312016,,TK,8.944788e+10,00003,B1
6,14.0,2016.0,3.0,101.0,101.0,MIA,20514.0,1.0,FL,20544.0,...,G,O,M,1987.0,08312016,M,LH,8.946287e+10,00462,B2
7,15.0,2016.0,3.0,101.0,101.0,NYC,20514.0,1.0,NY,20524.0,...,G,O,M,1993.0,08312016,M,VS,8.948222e+10,00009,B2
8,17.0,2016.0,3.0,101.0,101.0,NYC,20514.0,1.0,NY,,...,G,,,1990.0,08312016,M,TK,8.948743e+10,00011,B2
9,18.0,2016.0,3.0,101.0,101.0,NYC,20514.0,1.0,NY,20525.0,...,G,O,M,1963.0,08312016,M,TK,8.944831e+10,00003,B2


In [4]:
x_df = df[df['i94port'] == 'XXX']
x_df.shape

(1725, 28)