# Exploratory Data Analysis and Project Scope

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [25]:
# Read the three raw CSV sources from the local data/ directory.
airports = pd.read_csv('data/airport_codes.csv')
tourists = pd.read_csv('data/immigration_data_sample.csv')
# The demographics file uses ';' as its field separator instead of ','.
cities = pd.read_csv('data/us_cities_demographics.csv', sep=';')

---
## Airports

#### Notes
- Use iata_code as the unique identifier. IATA = International Air Transport Association


#### Extract Conditions
- iso_country = "US"
- type = "large_airport" or "medium_airport"


#### Errors
- Duplicate values
- Missing values - Do not accept

In [20]:
# List the distinct airport categories present in the `type` column.
airports['type'].unique()

array(['heliport', 'small_airport', 'closed', 'seaplane_base',
       'balloonport', 'medium_airport', 'large_airport'], dtype=object)

In [17]:
# Select rows that are both large airports and located in the US; .copy()
# gives an independent frame rather than a slice of `airports`.
usa_large = airports.loc[
    airports['type'].eq('large_airport') & airports['iso_country'].eq('US')
].copy()
usa_large.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
26007,KABQ,large_airport,Albuquerque International Sunport,5355.0,,US,US-NM,Albuquerque,KABQ,ABQ,ABQ,"-106.609001, 35.040199"
26026,KADW,large_airport,Joint Base Andrews,280.0,,US,US-MD,Camp Springs,KADW,ADW,ADW,"-76.866997, 38.810799"
26039,KAFW,large_airport,Fort Worth Alliance Airport,722.0,,US,US-TX,Fort Worth,KAFW,AFW,AFW,"-97.31880187990001, 32.9875984192"
26043,KAGS,large_airport,Augusta Regional At Bush Field,144.0,,US,US-GA,Augusta,KAGS,AGS,AGS,"-81.9645004272461, 33.36989974975586"
26076,KAMA,large_airport,Rick Husband Amarillo International Airport,3607.0,,US,US-TX,Amarillo,KAMA,AMA,AMA,"-101.706001, 35.219398"


In [18]:
# Print row count, dtypes and per-column non-null counts for the large-airport subset.
usa_large.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 26007 to 50032
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ident         170 non-null    object 
 1   type          170 non-null    object 
 2   name          170 non-null    object 
 3   elevation_ft  167 non-null    float64
 4   continent     0 non-null      object 
 5   iso_country   170 non-null    object 
 6   iso_region    170 non-null    object 
 7   municipality  169 non-null    object 
 8   gps_code      167 non-null    object 
 9   iata_code     167 non-null    object 
 10  local_code    167 non-null    object 
 11  coordinates   170 non-null    object 
dtypes: float64(1), object(11)
memory usage: 17.3+ KB


#### USA Medium Airports

In [21]:
# Select rows that are both medium airports and located in the US; .copy()
# gives an independent frame rather than a slice of `airports`.
usa_medium = airports.loc[
    airports['type'].eq('medium_airport') & airports['iso_country'].eq('US')
].copy()
usa_medium.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
6188,5A8,medium_airport,Aleknagik / New Airport,66.0,,US,US-AK,Aleknagik,5A8,WKK,5A8,"-158.617996216, 59.2826004028"
25825,K79J,medium_airport,South Alabama Regional At Bill Benton Field Ai...,310.0,,US,US-AL,Andalusia/Opp,K79J,,79J,"-86.393799, 31.3088"
26005,KABE,medium_airport,Lehigh Valley International Airport,393.0,,US,US-PA,Allentown,KABE,ABE,ABE,"-75.44080352783203, 40.652099609375"
26006,KABI,medium_airport,Abilene Regional Airport,1791.0,,US,US-TX,Abilene,KABI,ABI,ABI,"-99.68190002440001, 32.4113006592"
26008,KABR,medium_airport,Aberdeen Regional Airport,1302.0,,US,US-SD,Aberdeen,KABR,ABR,ABR,"-98.42179870605469, 45.449100494384766"


In [22]:
# Print row count, dtypes and per-column non-null counts for the medium-airport subset.
usa_medium.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 692 entries, 6188 to 49895
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ident         692 non-null    object 
 1   type          692 non-null    object 
 2   name          692 non-null    object 
 3   elevation_ft  689 non-null    float64
 4   continent     0 non-null      object 
 5   iso_country   692 non-null    object 
 6   iso_region    692 non-null    object 
 7   municipality  688 non-null    object 
 8   gps_code      686 non-null    object 
 9   iata_code     653 non-null    object 
 10  local_code    686 non-null    object 
 11  coordinates   692 non-null    object 
dtypes: float64(1), object(11)
memory usage: 70.3+ KB


---
## USA Cities Demographics

#### Notes
- Can I join the `cities` dataset to the `airports` dataset by matching the `City` feature to the `municipality` feature?

In [13]:
# Preview the first rows of the city demographics table.
cities.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [14]:
# Print dtypes and per-column non-null counts for the demographics table.
cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2891 entries, 0 to 2890
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   City                    2891 non-null   object 
 1   State                   2891 non-null   object 
 2   Median Age              2891 non-null   float64
 3   Male Population         2888 non-null   float64
 4   Female Population       2888 non-null   float64
 5   Total Population        2891 non-null   int64  
 6   Number of Veterans      2878 non-null   float64
 7   Foreign-born            2878 non-null   float64
 8   Average Household Size  2875 non-null   float64
 9   State Code              2891 non-null   object 
 10  Race                    2891 non-null   object 
 11  Count                   2891 non-null   int64  
dtypes: float64(6), int64(2), object(4)
memory usage: 271.2+ KB


---
## Tourists

In [26]:
# Preview the first rows of the immigration sample.
tourists.head()

Unnamed: 0.1,Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,...,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,...,,M,1955.0,7202016,F,,JL,56582670000.0,00782,WT
1,2171295,4422636.0,2016.0,4.0,582.0,582.0,MCA,20567.0,1.0,TX,...,,M,1990.0,10222016,M,,*GA,94362000000.0,XBLNG,B2
2,589494,1195600.0,2016.0,4.0,148.0,112.0,OGG,20551.0,1.0,FL,...,,M,1940.0,7052016,M,,LH,55780470000.0,00464,WT
3,2631158,5291768.0,2016.0,4.0,297.0,297.0,LOS,20572.0,1.0,CA,...,,M,1991.0,10272016,M,,QR,94789700000.0,00739,B2
4,3032257,985523.0,2016.0,4.0,111.0,111.0,CHM,20550.0,3.0,NY,...,,M,1997.0,7042016,F,,,42322570000.0,LAND,WT


In [27]:
# Print dtypes and per-column non-null counts for the immigration sample.
tourists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 29 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1000 non-null   int64  
 1   cicid       1000 non-null   float64
 2   i94yr       1000 non-null   float64
 3   i94mon      1000 non-null   float64
 4   i94cit      1000 non-null   float64
 5   i94res      1000 non-null   float64
 6   i94port     1000 non-null   object 
 7   arrdate     1000 non-null   float64
 8   i94mode     1000 non-null   float64
 9   i94addr     941 non-null    object 
 10  depdate     951 non-null    float64
 11  i94bir      1000 non-null   float64
 12  i94visa     1000 non-null   float64
 13  count       1000 non-null   float64
 14  dtadfile    1000 non-null   int64  
 15  visapost    382 non-null    object 
 16  occup       4 non-null      object 
 17  entdepa     1000 non-null   object 
 18  entdepd     954 non-null    object 
 19  entdepu     0 non-null      

In [28]:
# Show the first five rows of the columns at positions 8 through 19
# (the wide head() display elides several of these).
tourists.iloc[:5, 8:20]

Unnamed: 0,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu
0,1.0,HI,20573.0,61.0,2.0,1.0,20160422,,,G,O,
1,1.0,TX,20568.0,26.0,2.0,1.0,20160423,MTR,,G,R,
2,1.0,FL,20571.0,76.0,2.0,1.0,20160407,,,G,O,
3,1.0,CA,20581.0,25.0,2.0,1.0,20160428,DOH,,G,O,
4,3.0,NY,20553.0,19.0,2.0,1.0,20160406,,,Z,K,
