# Exploratory Data Analysis and Project Scope

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import utils

from datetime import datetime

from sql_queries import *

### General Reference

In [None]:
# Load the airport codes CSV into a DataFrame used as general reference data.
airports = pd.read_csv('data/airport_codes.csv')

In [None]:
# Reference date that day-offset columns are added to further down.
start_epoch = pd.to_datetime('1960-1-1')

In [None]:
# Read the SAS label descriptions file as text and strip every tab character.
with open('./data/I94_SAS_Labels_Descriptions.sas') as f:
    f_content = f.read().replace("\t", "")

In [None]:
# Show the Python type of f_content (notebook display only).
type(f_content)

---
## Airports

#### Notes
- Use iata_code as the unique identifier. IATA = International Air Transport Association


#### Extract Conditions
- iso_country = "US"
- type = "large_airport" or "medium_airport"


#### Errors
- Duplicate values
- Missing values - Do not accept

In [None]:
# List the distinct airport categories present in the `type` column.
airports["type"].unique()

In [None]:
# Keep only US airports that carry an IATA code, then preview the result.
is_us_airport = airports["iso_country"] == "US"
usa_airports = airports.loc[is_us_airport].dropna(subset=["iata_code"])
usa_airports.head()

In [None]:
# Plot null values for the US airports frame via the project's utils helper.
# NOTE(review): the last argument looks like a (width, height) figure size — confirm in utils.
utils.plot_null_val_heatmap(usa_airports, "USA Airports - Null Value", (15,6))

---
## USA Cities Demographics

#### Notes
- Can I join the `cities` dataset to the `airports` dataset by matching the `City` feature to the `municipality` feature?
- There are 3 missing states - Vermont, West Virginia, and Wyoming.
- The dataset contains information about the demographics of all US cities and census-designated places with a population greater or equal to 65,000.
- Should I scrape the census for additional data?


In [None]:
# Load the semicolon-separated demographics file and peek at its last rows.
cities = pd.read_csv("data/us_cities_demographics.csv", sep=";")
cities.tail()

In [None]:
# Replace the original headers with short snake_case names (order matters:
# the list is assigned positionally to the existing columns).
cities.columns = [
    "city",
    "state",
    "median_age",
    "male_pop",
    "female_pop",
    "total_pop",
    "num_veterans",
    "num_foreigners",
    "avg_hh_size",
    "state_code",
    "race",
    "count",
]
cities.tail()

In [None]:
# Order rows by state, then city, then count so that the largest count
# for each city ends up last within its group.
cities.sort_values(by=["state", "city", "count"], inplace=True)

In [None]:
# One row per (state, city): keep the last row of each pair as currently ordered.
city_key = ["state", "city"]
cities2 = cities.drop_duplicates(subset=city_key, keep="last")

In [None]:
# Check whether total_pop summed over the de-duplicated cities equals
# count summed over every row of cities.
sum(cities2['total_pop']) == sum(cities['count']) 

In [None]:
# Sum of total_pop over the de-duplicated cities.
sum(cities2['total_pop'])

In [None]:
# Sum of count over all rows of cities.
sum(cities['count']) 

---
## Visits

Need to convert countries to names.

In [None]:
# Load the immigration sample and give the unnamed leading column a proper name.
visits = pd.read_csv("data/immigration_data_sample.csv").rename(
    columns={'Unnamed: 0': 'visit_id'}
)

In [None]:
# visits.iloc[:, :30]

In [None]:
# Print the column summary (dtypes and non-null counts) for visits.
visits.info()

In [None]:
# arrdate/depdate hold day offsets; add them to start_epoch to get timestamps.
visits["arrdate"] = start_epoch + pd.to_timedelta(visits["arrdate"], unit="D")
visits["depdate"] = start_epoch + pd.to_timedelta(visits["depdate"], unit="D")
# Cast every birth year to a Python int.
visits["biryear"] = visits["biryear"].apply(int)

In [None]:
#feats_to_drop =["insnum", "dtadfile", "fltno", 'i94bir', "occup", "admnum", "entdepu", "visapost"]
#visits.drop(feats_to_drop, axis=1, inplace=True)

In [None]:
# Preview the first rows of visits.
visits.head()

In [None]:
# Plot null values for the visits frame via the project's utils helper.
# NOTE(review): the last argument looks like a (width, height) figure size — confirm in utils.
utils.plot_null_val_heatmap(visits, "Tourist Visits - Null Value", (20,8))

In [None]:
# Preview rows whose matflag is anything other than "M".
visits.loc[visits["matflag"] != "M"].head()

In [None]:
# Collect the distinct entdepa and entdepd values and show every value
# that appears in either column.
set1 = set(visits["entdepa"].unique())
set2 = set(visits["entdepd"].unique())

set1 | set2

In [None]:
# Distinct values found in the entdepd column.
set(visits.entdepd)

In [None]:
# Distinct values found in the entdepa column.
set(visits.entdepa)

In [None]:
# Report how many distinct non-null values each visits column holds.
for col, n_unique in visits.nunique().items():
    print(f"{col}: {n_unique}")

In [None]:
# List the distinct matflag values.
visits.matflag.unique()

---
## Global Temperatures

In [None]:
# Load the city-level temperature records and keep only United States rows.
weather = pd.read_csv("data/GlobalLandTemperaturesByCity.csv")
weather = weather.loc[weather["Country"] == "United States"].copy()
weather.tail()

In [None]:
# Count distinct City values; dropna=False keeps a missing value in the tally.
num_unique_usa_cities = weather["City"].nunique(dropna=False)
print(f"# of unique US Cities: {num_unique_usa_cities}")

In [None]:
# Show the smallest and largest values of the `dt` column.
print("Earliest date: ", weather["dt"].min())
print("Latest date: ", weather["dt"].max())

In [None]:
# Reload temperatures from a second CSV and repeat the US filter and date-range check.
# NOTE(review): unlike the other reads, this path has no `data/` prefix — confirm
# the file location is intentional.
weather = pd.read_csv('temperatures_by_city.csv')
weather = weather.loc[weather["Country"] == "United States"].copy()

print("Earliest date: ", weather["dt"].min())
print("Latest date: ", weather["dt"].max())

---
## Aggregation

In [None]:
# Load the I94 label descriptions through pandas' SAS reader.
# `format` is passed by keyword: recent pandas versions accept only the path
# positionally, so the previous positional 'sas7bdat' argument raises TypeError.
# NOTE(review): this is the same `.sas` file that is opened as plain text earlier
# in the notebook; if it is a SAS program rather than a sas7bdat dataset,
# read_sas cannot parse it and the text-based read should be used instead.
labels = pd.read_sas(
    'data/I94_SAS_Labels_Descriptions.sas',
    format='sas7bdat',
    encoding="ISO-8859-1",
)

In [None]:
# Number of cities that exist in BOTH "cities" and "usa_med_large" dfs.
# "cities" has a total of 2,891 cities.
#
# usa_med_large is not defined anywhere else in this notebook, so it is built
# here from usa_airports using the extract conditions listed in the Airports
# notes (type is "large_airport" or "medium_airport").
usa_med_large = usa_airports[
    usa_airports["type"].isin(["large_airport", "medium_airport"])
]

# The demographics columns were renamed to snake_case above, so the city name
# lives in `city` (the original `City` attribute no longer exists).
len(set(cities["city"]).intersection(set(usa_med_large["municipality"])))