The Scope of this project is to clean and upload data for further analysis of US immigration with US Demographic and US Temperature.

#### Importing the libraries for carrying out the scope of poject work

In [1]:
import pandas as pd
import numpy as np
import psycopg2
import os
import glob
import sys
import datetime
import matplotlib

from datetime import datetime,timedelta
from sql_queries import *
from sql_queries import airport_insert, demographic_insert, immigration_insert, temperature_insert

#### Setting the display parameters for the dataframe columns and rows.

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

pd.set_option('display.max_rows', None) 
pd.set_option('display.max_rows', 100)

#### Setting the Decimal values to Zero digits.

In [3]:
pd.set_option('display.float_format', lambda x:'%.f'% x)

## Gathering the data for the project.

In [4]:
# The under said datasets were provided by Udacity. And I used the dataset as itis for my project.

"""This is the sample dataset of US Immigration provided and also in parquet file as 'sas-data', 
which will be used later in this project."""
df_immigration = pd.read_parquet("sas_data")

# This is the additional dataset of US Demographics provided by Udacity
df_demographics = pd.read_csv("us-cities-demographics.csv", delimiter=";")

# This is the dataset with respect to Airport Codes used world wide and is provided by Udacity. 
df_airportcodes = pd.read_csv("airport-codes_csv.csv")

In [5]:
"""The below said Dataset was downloaded from Kaggle.The download URL is 
https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data. 
This Dataset will be used for the project."""
df_temperature = pd.read_csv('GlobalLandTemperaturesByCity.csv')

## Describing the Data gathered for the project.

#### Checking the Shape of the above taken Dataset, so that we can get the outline feel of the data we are going to work.

In [6]:
# Will create dataframe of collected data and check the shape of data gathered.

print('Shape of immigration -',df_immigration.shape)
print('Shape of demographics -',df_demographics.shape)
print('Shape of airportcodes -', df_airportcodes.shape)
print('Shape of temperature -', df_temperature.shape)

Shape of immigration - (3096313, 28)
Shape of demographics - (2891, 12)
Shape of airportcodes - (55075, 12)
Shape of temperature - (8599212, 7)


## Explore and Assess the Data

The two immigration dataset provided by Udacity one csv sample file and other parquet file.We are checking the columns matching. As we checked the shape of data. We are using sas_data parquet file.

In [7]:
df_immigration.columns

Index(['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port', 'arrdate',
       'i94mode', 'i94addr', 'depdate', 'i94bir', 'i94visa', 'count',
       'dtadfile', 'visapost', 'occup', 'entdepa', 'entdepd', 'entdepu',
       'matflag', 'biryear', 'dtaddto', 'gender', 'insnum', 'airline',
       'admnum', 'fltno', 'visatype'],
      dtype='object')

In [8]:
df_immigration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3096313 entries, 0 to 3096312
Data columns (total 28 columns):
 #   Column    Dtype  
---  ------    -----  
 0   cicid     float64
 1   i94yr     float64
 2   i94mon    float64
 3   i94cit    float64
 4   i94res    float64
 5   i94port   object 
 6   arrdate   float64
 7   i94mode   float64
 8   i94addr   object 
 9   depdate   float64
 10  i94bir    float64
 11  i94visa   float64
 12  count     float64
 13  dtadfile  object 
 14  visapost  object 
 15  occup     object 
 16  entdepa   object 
 17  entdepd   object 
 18  entdepu   object 
 19  matflag   object 
 20  biryear   float64
 21  dtaddto   object 
 22  gender    object 
 23  insnum    object 
 24  airline   object 
 25  admnum    float64
 26  fltno     object 
 27  visatype  object 
dtypes: float64(13), object(15)
memory usage: 661.4+ MB


In [9]:
df_immigration.corr()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,arrdate,i94mode,depdate,i94bir,i94visa,count,biryear,admnum
cicid,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,-0.0,0.0
i94yr,,,,,,,,,,,,,
i94mon,,,,,,,,,,,,,
i94cit,0.0,,,1.0,1.0,0.0,-0.0,0.0,-0.0,0.0,,0.0,0.0
i94res,0.0,,,1.0,1.0,0.0,-0.0,0.0,0.0,0.0,,-0.0,0.0
arrdate,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,-0.0,0.0
i94mode,0.0,,,-0.0,-0.0,0.0,1.0,0.0,0.0,0.0,,-0.0,-0.0
depdate,0.0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,-0.0,0.0
i94bir,0.0,,,-0.0,0.0,0.0,0.0,0.0,1.0,-0.0,,-1.0,0.0
i94visa,0.0,,,0.0,0.0,0.0,0.0,0.0,-0.0,1.0,,0.0,-0.0


In [10]:
df_immigration.describe()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,arrdate,i94mode,depdate,i94bir,i94visa,count,biryear,admnum
count,3096313,3096313,3096313,3096313,3096313,3096313,3096074,2953856,3095511,3096313,3096313,3095511,3096313
mean,3078652,2016,4,305,303,20560,1,20574,42,2,1,1974,70828850112
std,1763278,0,0,210,209,9,1,29,17,0,0,17,22154415948
min,6,2016,4,101,101,20545,1,15176,-3,1,1,1902,0
25%,1577790,2016,4,135,131,20552,1,20561,30,2,1,1962,56035228433
50%,3103507,2016,4,213,213,20560,1,20570,41,2,1,1975,59360939033
75%,4654341,2016,4,512,504,20567,1,20579,54,2,1,1986,93509869930
max,6102785,2016,4,999,760,20574,9,45427,114,3,1,2019,99915565930


In [11]:
# We can checking the missing data from the data set in percentage.
(df_immigration.isnull().sum() / len(df_immigration))*100 

cicid        0
i94yr        0
i94mon       0
i94cit       0
i94res       0
i94port      0
arrdate      0
i94mode      0
i94addr      5
depdate      5
i94bir       0
i94visa      0
count        0
dtadfile     0
visapost    61
occup      100
entdepa      0
entdepd      4
entdepu    100
matflag      4
biryear      0
dtaddto      0
gender      13
insnum      96
airline      3
admnum       0
fltno        1
visatype     0
dtype: float64

In [12]:
df_immigration.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,6,2016,4,692,692,XXX,20573,,,,37,2,1,,,,T,,U,,1979,10282016,,,,1897628485,,B2
1,7,2016,4,254,276,ATL,20551,1.0,AL,,25,3,1,20130811.0,SEO,,G,,Y,,1991,D/S,M,,,3736796330,296.0,F1
2,15,2016,4,101,101,WAS,20545,1.0,MI,20691.0,55,2,1,20160401.0,,,T,O,,M,1961,09302016,M,,OS,666643185,93.0,B2
3,16,2016,4,101,101,NYC,20545,1.0,MA,20567.0,28,2,1,20160401.0,,,O,O,,M,1988,09302016,,,AA,92468461330,199.0,B2
4,17,2016,4,101,101,NYC,20545,1.0,MA,20567.0,4,2,1,20160401.0,,,O,O,,M,2012,09302016,,,AA,92468463130,199.0,B2


In [13]:
df_immigration.tail()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
3096308,625229,2016,4,745,745,SYS,20547,3,CA,,36,2,1,20160403,,,Z,,,,1980,5082016,,,,78934563730,00066,B2
3096309,1972204,2016,4,745,745,SYS,20554,3,CA,20555.0,36,2,1,20160410,BLG,,Z,Q,,M,1980,9102016,F,,,90300538230,00066,B2
3096310,4249448,2016,4,745,745,TEC,20566,3,VA,20588.0,23,2,1,20160422,BLG,,Z,O,,M,1993,9202016,F,,,91416719230,00651,B2
3096311,5658953,2016,4,748,748,NEW,20573,3,MN,,57,2,1,20160429,CLG,,Z,,,,1959,10282016,M,,,94887095530,LAND,B2
3096312,3106671,2016,4,123,749,NOG,20561,3,AZ,20567.0,58,1,1,20160417,,,Z,O,,M,1958,7102016,M,,,56056868133,00866,WB


In [14]:
df_immigration.sample(5)

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
2233871,4526415,2016,4,209,209,LOS,20568,1,CA,20571,37,1,1,20160424,,,O,O,,M,1979,7222016,,,AA,59215333033,26,WB
441203,880358,2016,4,268,268,SFR,20549,1,CA,20554,35,1,1,20160405,,,G,O,,M,1981,7032016,M,,BR,55684688733,18,WB
2039246,4099664,2016,4,245,245,LOS,20566,1,CA,20575,34,2,1,20160422,SHG,,G,O,,M,1982,10212016,M,,CA,94279807530,983,B2
2713048,5468788,2016,4,148,112,PHI,20573,1,FL,20594,67,2,1,20160429,MUN,,G,O,,M,1949,10282016,F,,AA,94871713930,717,B2
2991540,6016762,2016,4,254,276,CHI,20559,1,,20563,2,2,1,20160617,,,A,D,,M,2014,5292016,F,3974.0,7C,49239230433,3402,GMT


US Demoraphic dataset provided by Udacity one csv sample file.We are checking the columns matching. As we checked the shape of data.

In [15]:
df_demographics.columns

Index(['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Total Population', 'Number of Veterans', 'Foreign-born',
       'Average Household Size', 'State Code', 'Race', 'Count'],
      dtype='object')

In [16]:
df_demographics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2891 entries, 0 to 2890
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   City                    2891 non-null   object 
 1   State                   2891 non-null   object 
 2   Median Age              2891 non-null   float64
 3   Male Population         2888 non-null   float64
 4   Female Population       2888 non-null   float64
 5   Total Population        2891 non-null   int64  
 6   Number of Veterans      2878 non-null   float64
 7   Foreign-born            2878 non-null   float64
 8   Average Household Size  2875 non-null   float64
 9   State Code              2891 non-null   object 
 10  Race                    2891 non-null   object 
 11  Count                   2891 non-null   int64  
dtypes: float64(6), int64(2), object(4)
memory usage: 271.2+ KB


In [17]:
df_demographics.corr()

Unnamed: 0,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,Count
Median Age,1,0,0,0,0,0,0,0
Male Population,0,1,1,1,1,1,0,1
Female Population,0,1,1,1,1,1,0,1
Total Population,0,1,1,1,1,1,0,1
Number of Veterans,0,1,1,1,1,1,0,1
Foreign-born,0,1,1,1,1,1,0,1
Average Household Size,0,0,0,0,0,0,1,0
Count,0,1,1,1,1,1,0,1


In [18]:
df_demographics.describe()

Unnamed: 0,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,Count
count,2891,2888,2888,2891,2878,2878,2875,2891
mean,35,97328,101770,198967,9368,40654,3,48964
std,4,216300,231565,447556,13211,155749,0,144386
min,23,29281,27348,63215,416,861,2,98
25%,33,39289,41227,80429,3739,9224,2,3435
50%,35,52341,53809,106782,5397,18822,3,13780
75%,38,86642,89604,175232,9368,33972,3,54447
max,70,4081698,4468707,8550405,156961,3212500,5,3835726


In [19]:
(df_demographics.isnull().sum()/len(df_demographics))*100

City                     0
State                    0
Median Age               0
Male Population          0
Female Population        0
Total Population         0
Number of Veterans       0
Foreign-born             0
Average Household Size   1
State Code               0
Race                     0
Count                    0
dtype: float64

In [20]:
df_demographics.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,34,40601,41862,82463,1562,30908,3,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41,44129,49500,93629,4147,32935,2,MA,White,58723
2,Hoover,Alabama,38,38040,46799,84839,4819,8229,3,AL,Asian,4759
3,Rancho Cucamonga,California,34,88127,87105,175232,5821,33878,3,CA,Black or African-American,24437
4,Newark,New Jersey,35,138040,143873,281913,5829,86253,3,NJ,White,76402


In [21]:
df_demographics.tail()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
2886,Stockton,California,32,150976,154674,305650,12822,79583,3,CA,American Indian and Alaska Native,19834
2887,Southfield,Michigan,42,31369,41808,73177,4035,4011,2,MI,American Indian and Alaska Native,983
2888,Indianapolis,Indiana,34,410615,437808,848423,42186,72456,3,IN,White,553665
2889,Somerville,Massachusetts,31,41028,39306,80334,2103,22292,2,MA,American Indian and Alaska Native,374
2890,Coral Springs,Florida,37,63316,66186,129502,4724,38552,3,FL,White,90896


In [22]:
df_demographics.sample(5)

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
320,Union City,California,38,38599,35911,74510,1440,32752,3,CA,White,16845
1449,Medford,Oregon,39,39606,40189,79795,6632,6185,3,OR,American Indian and Alaska Native,1239
1868,Alafaya,Florida,34,39504,45760,85264,4176,15842,3,FL,Black or African-American,6577
2709,Arvada,Colorado,41,54870,60165,115035,8930,4921,2,CO,Asian,2922
2819,Aurora,Colorado,34,177899,180971,358870,25158,65816,3,CO,Black or African-American,61894


Airport Codes dataset provided by Udacity one csv sample file.We are checking the columns matching. As we checked the shape of data.

In [23]:
df_airportcodes.columns

Index(['ident', 'type', 'name', 'elevation_ft', 'continent', 'iso_country',
       'iso_region', 'municipality', 'gps_code', 'iata_code', 'local_code',
       'coordinates'],
      dtype='object')

In [24]:
df_airportcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55075 entries, 0 to 55074
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ident         55075 non-null  object 
 1   type          55075 non-null  object 
 2   name          55075 non-null  object 
 3   elevation_ft  48069 non-null  float64
 4   continent     27356 non-null  object 
 5   iso_country   54828 non-null  object 
 6   iso_region    55075 non-null  object 
 7   municipality  49399 non-null  object 
 8   gps_code      41030 non-null  object 
 9   iata_code     9189 non-null   object 
 10  local_code    28686 non-null  object 
 11  coordinates   55075 non-null  object 
dtypes: float64(1), object(11)
memory usage: 5.0+ MB


In [25]:
df_airportcodes.corr()

Unnamed: 0,elevation_ft
elevation_ft,1


In [26]:
df_airportcodes.describe()

Unnamed: 0,elevation_ft
count,48069
mean,1241
std,1602
min,-1266
25%,205
50%,718
75%,1497
max,22000


In [27]:
(df_airportcodes.isnull().sum()/len(df_airportcodes))*100

ident           0
type            0
name            0
elevation_ft   13
continent      50
iso_country     0
iso_region      0
municipality   10
gps_code       26
iata_code      83
local_code     48
coordinates     0
dtype: float64

In [28]:
df_airportcodes.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [29]:
df_airportcodes.tail()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
55070,ZYYK,medium_airport,Yingkou Lanqi Airport,0.0,AS,CN,CN-21,Yingkou,ZYYK,YKH,,"122.3586, 40.542524"
55071,ZYYY,medium_airport,Shenyang Dongta Airport,,AS,CN,CN-21,Shenyang,ZYYY,,,"123.49600219726562, 41.784400939941406"
55072,ZZ-0001,heliport,Sealand Helipad,40.0,EU,GB,GB-ENG,Sealand,,,,"1.4825, 51.894444"
55073,ZZ-0002,small_airport,Glorioso Islands Airstrip,11.0,AF,TF,TF-U-A,Grande Glorieuse,,,,"47.296388888900005, -11.584277777799999"
55074,ZZZZ,small_airport,Satsuma IÅjima Airport,338.0,AS,JP,JP-46,Mishima-Mura,RJX7,,,"130.270556, 30.784722"


In [30]:
df_airportcodes.sample(5)

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
3649,30AL,heliport,Med Flight 2 Heliport,650,,US,US-AL,Danville,30AL,,30AL,"-87.0805, 34.373001"
30760,LDSM,closed,Lumbarda Seaplane Terminal,1,EU,HR,HR-19,Lumbarda,LDSM,,,"17.17049, 42.923159"
31410,LFYR,small_airport,Romorantin Pruniers Airfield,289,EU,FR,FR-F,,LFYR,,,"1.691036, 47.317543"
8586,7Y7,closed,A.R.S. Sport Strip,955,,US,US-MN,Belle Plaine,,,,"-93.783602, 44.666598"
11080,AR-0317,heliport,Estancia Rio Ewan Heliport,112,SA,AR,AR-V,RÃ­o Grande,,,HRW,"-67.213246, -54.233548"


Global City Temperature dataset from Kaggle site in csv file formate.We are checking the columns matching. As we checked the shape of data.

In [31]:
df_temperature.columns

Index(['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City',
       'Country', 'Latitude', 'Longitude'],
      dtype='object')

In [32]:
df_temperature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   dt                             object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [33]:
df_temperature.corr()

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty
AverageTemperature,1,0
AverageTemperatureUncertainty,0,1


In [34]:
df_temperature.describe()

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty
count,8235082,8235082
mean,17,1
std,10,1
min,-43,0
25%,10,0
50%,19,1
75%,25,1
max,40,15


In [35]:
(df_temperature.isnull().sum()/len(df_temperature))*100

dt                              0
AverageTemperature              4
AverageTemperatureUncertainty   4
City                            0
Country                         0
Latitude                        0
Longitude                       0
dtype: float64

In [36]:
df_temperature.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.0,2.0,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [37]:
df_temperature.tail()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
8599207,2013-05-01,11.0,0.0,Zwolle,Netherlands,52.24N,5.26E
8599208,2013-06-01,15.0,0.0,Zwolle,Netherlands,52.24N,5.26E
8599209,2013-07-01,19.0,0.0,Zwolle,Netherlands,52.24N,5.26E
8599210,2013-08-01,18.0,0.0,Zwolle,Netherlands,52.24N,5.26E
8599211,2013-09-01,,,Zwolle,Netherlands,52.24N,5.26E


In [38]:
df_temperature.sample(5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
7170669,1959-04-01,8.0,0.0,Stavropol,Russia,44.20N,42.48E
385109,1845-07-01,,,Araruama,Brazil,23.31S,42.82W
8450953,1985-10-01,12.0,0.0,Yuncheng,China,34.56N,110.92E
5739831,2011-05-01,17.0,0.0,Paterson,United States,40.99N,74.56W
152145,1964-02-01,23.0,0.0,Akyab,Burma,20.09N,92.13E


## Cleaning the Data

Cleaning immigration data - After cleaning we will make new dataframe for immigration data as us_immigration.

In [39]:
us_immigration = pd.DataFrame(df_immigration, columns = ['cicid','admnum','arrdate','i94port','i94addr',
                                                         'i94mode','visatype','i94visa','airline','biryear',
                                                         'i94bir', 'gender','visapost','occup','depdate'])

In [40]:
us_immigration.shape

(3096313, 15)

In [41]:
us_immigration.head()

Unnamed: 0,cicid,admnum,arrdate,i94port,i94addr,i94mode,visatype,i94visa,airline,biryear,i94bir,gender,visapost,occup,depdate
0,6,1897628485,20573,XXX,,,B2,2,,1979,37,,,,
1,7,3736796330,20551,ATL,AL,1.0,F1,3,,1991,25,M,SEO,,
2,15,666643185,20545,WAS,MI,1.0,B2,2,OS,1961,55,M,,,20691.0
3,16,92468461330,20545,NYC,MA,1.0,B2,2,AA,1988,28,,,,20567.0
4,17,92468463130,20545,NYC,MA,1.0,B2,2,AA,2012,4,,,,20567.0


In [42]:
us_immigration['arrdate'] = pd.to_datetime(us_immigration['arrdate'],origin='1960-1-1',unit='D')
us_immigration['depdate'] = pd.to_datetime(us_immigration['depdate'],origin='1960-1-1',unit='D')

In [43]:
us_immigration.tail()

Unnamed: 0,cicid,admnum,arrdate,i94port,i94addr,i94mode,visatype,i94visa,airline,biryear,i94bir,gender,visapost,occup,depdate
3096308,625229,78934563730,2016-04-03,SYS,CA,3,B2,2,,1980,36,,,,NaT
3096309,1972204,90300538230,2016-04-10,SYS,CA,3,B2,2,,1980,36,F,BLG,,2016-04-11
3096310,4249448,91416719230,2016-04-22,TEC,VA,3,B2,2,,1993,23,F,BLG,,2016-05-14
3096311,5658953,94887095530,2016-04-29,NEW,MN,3,B2,2,,1959,57,M,CLG,,NaT
3096312,3106671,56056868133,2016-04-17,NOG,AZ,3,WB,1,,1958,58,M,,,2016-04-23


In [44]:
us_immigration[us_immigration.visatype.isnull()] # Checking the data of column visatype
us_immigration['visatype'].value_counts().head()

WT     1309059
B2     1117897
WB      282983
B1      212410
GMT      89133
Name: visatype, dtype: int64

In [45]:
us_immigration[us_immigration.i94visa.isnull()] # Checking the data of column visatype
us_immigration['i94visa'].value_counts().head(15)

2    2530868
1     522079
3      43366
Name: i94visa, dtype: int64

In [46]:
us_immigration['i94visa']=us_immigration['i94visa'].astype(str)

In [47]:
us_immigration['i94visa'] = us_immigration['i94visa'].str.replace('1.0','Business')
us_immigration['i94visa'] = us_immigration['i94visa'].str.replace('2.0','Pleasure')
us_immigration['i94visa'] = us_immigration['i94visa'].str.replace('3.0','Student')

  us_immigration['i94visa'] = us_immigration['i94visa'].str.replace('1.0','Business')
  us_immigration['i94visa'] = us_immigration['i94visa'].str.replace('2.0','Pleasure')
  us_immigration['i94visa'] = us_immigration['i94visa'].str.replace('3.0','Student')


In [48]:
us_immigration['i94mode'].value_counts().head()

1    2994505
3      66660
2      26349
9       8560
Name: i94mode, dtype: int64

In [49]:
us_immigration['i94mode']=us_immigration['i94mode'].astype(str)

In [50]:
us_immigration['i94mode'] = us_immigration['i94mode'].str.replace('1.0','Air')
us_immigration['i94mode'] = us_immigration['i94mode'].str.replace('2.0','Sea')
us_immigration['i94mode'] = us_immigration['i94mode'].str.replace('3.0','Land')
us_immigration['i94mode'] = us_immigration['i94mode'].str.replace('9.0','NotReported')

  us_immigration['i94mode'] = us_immigration['i94mode'].str.replace('1.0','Air')
  us_immigration['i94mode'] = us_immigration['i94mode'].str.replace('2.0','Sea')
  us_immigration['i94mode'] = us_immigration['i94mode'].str.replace('3.0','Land')
  us_immigration['i94mode'] = us_immigration['i94mode'].str.replace('9.0','NotReported')


In [51]:
# Checking the data of column visatype
us_immigration['gender'].value_counts().head()

M    1377224
F    1302743
X       1610
U        467
Name: gender, dtype: int64

In [52]:
us_immigration['gender'] = us_immigration['gender'].str.replace('X','O')
us_immigration['gender'] = us_immigration['gender'].str.replace('U','NotReported')

In [53]:
# Checking the data of column visatype
us_immigration['occup'].value_counts().head()

STU    4719
OTH     661
NRR     345
MKT     280
EXA     196
Name: occup, dtype: int64

In [54]:
us_immigration.drop_duplicates(inplace=True)

In [55]:
us_immigration.shape

(3096313, 15)

In [56]:
us_immigration = us_immigration.rename(columns={'cicid':'TravellerID','admnum':'AdmissionNo','arrdate':'ArrivalDate',
                                                'i94port':'PortOfEntry','i94addr':'StateCode',
                                                'i94res':'CountryCode','i94mode':'ModeOfTravel',
                                                'visatype':'TypeOfVisa','i94visa':'Reason',
                                                'airline':'Airline','biryear':'BirthYear',
                                                'i94bir':'Age', 'gender':'Gender','visapost':'IataCode',
                                                'occup':'Occupation','depdate':'DateOfDepature'})

In [57]:
us_immigration.head()

Unnamed: 0,TravellerID,AdmissionNo,ArrivalDate,PortOfEntry,StateCode,ModeOfTravel,TypeOfVisa,Reason,Airline,BirthYear,Age,Gender,IataCode,Occupation,DateOfDepature
0,6,1897628485,2016-04-29,XXX,,,B2,Pleasure,,1979,37,,,,NaT
1,7,3736796330,2016-04-07,ATL,AL,Air,F1,Student,,1991,25,M,SEO,,NaT
2,15,666643185,2016-04-01,WAS,MI,Air,B2,Pleasure,OS,1961,55,M,,,2016-08-25
3,16,92468461330,2016-04-01,NYC,MA,Air,B2,Pleasure,AA,1988,28,,,,2016-04-23
4,17,92468463130,2016-04-01,NYC,MA,Air,B2,Pleasure,AA,2012,4,,,,2016-04-23


Cleaning Demographics data - After cleaning we will make new dataframe for demographic data as us_demographics.

In [58]:
us_demographics = pd.DataFrame(df_demographics, columns = ['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Number of Veterans', 'Foreign-born','Average Household Size', 'State Code', 'Race', 'Count'])

In [59]:
us_demographics = us_demographics[['State Code','State', 'City', 'Race',
                                  'Median Age', 'Male Population', 'Female Population',
                                  'Number of Veterans', 'Foreign-born','Average Household Size','Count']]

In [60]:
us_demographics.sample(5)

Unnamed: 0,State Code,State,City,Race,Median Age,Male Population,Female Population,Number of Veterans,Foreign-born,Average Household Size,Count
2098,LA,Louisiana,Shreveport,White,35,93138,103856,14287,5658,3,79319
1329,IL,Illinois,Decatur,Asian,40,34646,38210,5291,1224,2,1343
1773,CA,California,Chino Hills,American Indian and Alaska Native,42,39639,38674,3358,24764,3,1773
1643,MA,Massachusetts,New Bedford,American Indian and Alaska Native,39,43793,51166,4185,19024,2,346
119,MA,Massachusetts,Newton,Hispanic or Latino,42,41985,46824,1814,21692,3,4790


In [61]:
us_demographics.dropna()

Unnamed: 0,State Code,State,City,Race,Median Age,Male Population,Female Population,Number of Veterans,Foreign-born,Average Household Size,Count
0,MD,Maryland,Silver Spring,Hispanic or Latino,34,40601,41862,1562,30908,3,25924
1,MA,Massachusetts,Quincy,White,41,44129,49500,4147,32935,2,58723
2,AL,Alabama,Hoover,Asian,38,38040,46799,4819,8229,3,4759
3,CA,California,Rancho Cucamonga,Black or African-American,34,88127,87105,5821,33878,3,24437
4,NJ,New Jersey,Newark,White,35,138040,143873,5829,86253,3,76402
...,...,...,...,...,...,...,...,...,...,...,...
2886,CA,California,Stockton,American Indian and Alaska Native,32,150976,154674,12822,79583,3,19834
2887,MI,Michigan,Southfield,American Indian and Alaska Native,42,31369,41808,4035,4011,2,983
2888,IN,Indiana,Indianapolis,White,34,410615,437808,42186,72456,3,553665
2889,MA,Massachusetts,Somerville,American Indian and Alaska Native,31,41028,39306,2103,22292,2,374


In [62]:
us_demographics[us_demographics[['City','State', 'State Code', 'Race']].duplicated()]

Unnamed: 0,State Code,State,City,Race,Median Age,Male Population,Female Population,Number of Veterans,Foreign-born,Average Household Size,Count


In [63]:
us_demographics[us_demographics.City=='Los Angeles']

Unnamed: 0,State Code,State,City,Race,Median Age,Male Population,Female Population,Number of Veterans,Foreign-born,Average Household Size,Count
40,CA,California,Los Angeles,White,35,1958998,2012898,85417,1485425,3,2177650
535,CA,California,Los Angeles,Asian,35,1958998,2012898,85417,1485425,3,512999
1714,CA,California,Los Angeles,American Indian and Alaska Native,35,1958998,2012898,85417,1485425,3,63758
2392,CA,California,Los Angeles,Hispanic or Latino,35,1958998,2012898,85417,1485425,3,1936732
2883,CA,California,Los Angeles,Black or African-American,35,1958998,2012898,85417,1485425,3,404868


In [64]:
us_demographics.shape

(2891, 11)

Cleaning Airport Codes data

In [65]:
df_airportcodes.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [66]:
df_airportcodes[df_airportcodes.type.isnull()] # Checking the data of column type of Airports
df_airportcodes['type'].value_counts().head(10)

small_airport     33965
heliport          11287
medium_airport     4550
closed             3606
seaplane_base      1016
large_airport       627
balloonport          24
Name: type, dtype: int64

In [67]:
df_airportcodes['stateCode'] = df_airportcodes.iso_region.str.slice(start=3)
df_airportcodes.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates,stateCode
0,00A,heliport,Total Rf Heliport,11,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125",PA
1,00AA,small_airport,Aero B Ranch Airport,3435,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022",KS
2,00AK,small_airport,Lowell Field,450,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968",AK
3,00AL,small_airport,Epps Airpark,820,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172",AL
4,00AR,closed,Newport Hospital & Clinic Heliport,237,,US,US-AR,Newport,,,,"-91.254898, 35.6087",AR


In [68]:
latitude = []
longitude = []

for row in df_airportcodes['coordinates']:
    
    try:
        latitude.append(row.split(',')[0])
        longitude.append(row.split(',')[1])
    except:
        latitude.append(np.NaN)
        longitude.append(np.NaN)


df_airportcodes['latitude'] = latitude
df_airportcodes['longitude'] = longitude

In [69]:
df_airportcodes.drop(columns = ['iso_region','coordinates'], inplace=True)

In [70]:
df_airportcodes.sample()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,municipality,gps_code,iata_code,local_code,stateCode,latitude,longitude
36569,NTGC,medium_airport,Tikehau Airport,6,OC,PF,,NTGC,TIH,,U-A,-148.2310028076172,-15.119600296020508


In [71]:
df_airportcodes = df_airportcodes[['ident','iso_country','stateCode','continent','municipality',
                                  'iata_code','name','type','elevation_ft','latitude','longitude',
                                  'gps_code','local_code']]

In [72]:
df_airportcodes.head()

Unnamed: 0,ident,iso_country,stateCode,continent,municipality,iata_code,name,type,elevation_ft,latitude,longitude,gps_code,local_code
0,00A,US,PA,,Bensalem,,Total Rf Heliport,heliport,11,-74.93360137939453,40.07080078125,00A,00A
1,00AA,US,KS,,Leoti,,Aero B Ranch Airport,small_airport,3435,-101.473911,38.704022,00AA,00AA
2,00AK,US,AK,,Anchor Point,,Lowell Field,small_airport,450,-151.695999146,59.94919968,00AK,00AK
3,00AL,US,AL,,Harvest,,Epps Airpark,small_airport,820,-86.77030181884766,34.86479949951172,00AL,00AL
4,00AR,US,AR,,Newport,,Newport Hospital & Clinic Heliport,closed,237,-91.254898,35.6087,,


In [73]:
df_airportcodes.shape

(55075, 13)

In [74]:
df_airportcodes.iso_country.value_counts().shape[0]

243

In [75]:
df_airportcodes.continent.value_counts()

EU    7840
SA    7709
AS    5350
AF    3362
OC    3067
AN      28
Name: continent, dtype: int64

Analyasing US Airport Details

In [76]:
us_airports = df_airportcodes[df_airportcodes.iso_country=='US']

In [77]:
us_airports.shape

(22757, 13)

In [78]:
us_airports.stateCode.value_counts().shape[0]

52

In [79]:
us_airports.sample(5)

Unnamed: 0,ident,iso_country,stateCode,continent,municipality,iata_code,name,type,elevation_ft,latitude,longitude,gps_code,local_code
47536,TA10,US,TX,,Mansfield,,Flying W Heliport,closed,615,-97.134697,32.521801,,
62,00XS,US,TX,,O'Donnell,,L P Askew Farms Airport,small_airport,3110,-101.93399810791016,33.03340148925781,00XS,00XS
28282,KNZX,US,FL,,Harold,,Harold Nolf Heliport,heliport,150,-86.8871994019,30.6807003021,KNZX,NZX
49773,US-0758,US,VA,,Onancock,,Riverside Shore Memorial Hospital Heliport,heliport,42,-75.723176,37.696297,3VG6,3VG6
10120,9NY4,US,NY,,Kennedy,,Kennedy Airfield,small_airport,1760,-79.08180236816406,42.10079956054688,9NY4,9NY4


In [80]:
us_airports[us_airports['iata_code'].isna()].head()

Unnamed: 0,ident,iso_country,stateCode,continent,municipality,iata_code,name,type,elevation_ft,latitude,longitude,gps_code,local_code
0,00A,US,PA,,Bensalem,,Total Rf Heliport,heliport,11,-74.93360137939453,40.07080078125,00A,00A
1,00AA,US,KS,,Leoti,,Aero B Ranch Airport,small_airport,3435,-101.473911,38.704022,00AA,00AA
2,00AK,US,AK,,Anchor Point,,Lowell Field,small_airport,450,-151.695999146,59.94919968,00AK,00AK
3,00AL,US,AL,,Harvest,,Epps Airpark,small_airport,820,-86.77030181884766,34.86479949951172,00AL,00AL
4,00AR,US,AR,,Newport,,Newport Hospital & Clinic Heliport,closed,237,-91.254898,35.6087,,


In [81]:
us_airports.dropna(subset=["iata_code"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_airports.dropna(subset=["iata_code"], inplace=True)


In [82]:
us_airports.head()

Unnamed: 0,ident,iso_country,stateCode,continent,municipality,iata_code,name,type,elevation_ft,latitude,longitude,gps_code,local_code
440,07FA,US,FL,,Key Largo,OCA,Ocean Reef Club Airport,small_airport,8,-80.274803161621,25.325399398804,07FA,07FA
594,0AK,US,AK,,Pilot Station,PQS,Pilot Station Airport,small_airport,305,-162.899994,61.934601,,0AK
673,0CO2,US,CO,,Crested Butte,CSE,Crested Butte Airpark,small_airport,8980,-106.928341,38.851918,0CO2,0CO2
1088,0TE7,US,TX,,Johnson City,JCY,LBJ Ranch Airport,small_airport,1515,-98.6224975586,30.251800537100003,0TE7,0TE7
1402,13MA,US,MA,,Palmer,PMX,Metropolitan Airport,small_airport,418,-72.31140136719999,42.2233009338,13MA,13MA


Back to df_airportcodes

In [83]:
df_airportcodes.dropna(subset=["iata_code"], inplace=True)

In [84]:
df_airportcodes.shape

(9189, 13)

In [85]:
df_airportcodes = df_airportcodes.rename(columns={'ident':'AirportID','iso_country':'CountryCode',
                                                   'stateCode':'StateCode','continent':'ContinentCode',
                                                   'municipality':'Municipality','iata_code':'IATACode',
                                                   'name':'AirportName','type':'TypeOfAirport',
                                                   'elevation_ft':'ElevationInFeet','latitude':'Latitude',
                                                   'longitude':'Longitude','gps_code':'GPSCode',
                                                   'local_code':'LocalCode'})

In [86]:
df_airportcodes = df_airportcodes[['IATACode','AirportID','CountryCode', 'StateCode', 'ContinentCode',
                                   'Municipality', 'AirportName', 'TypeOfAirport','ElevationInFeet',
                                   'Latitude', 'Longitude', 'GPSCode', 'LocalCode']]

In [87]:
# df_airportcodes = df_airportcodes.astype(str)

In [88]:
df_airportcodes.drop_duplicates(subset=['IATACode'], inplace=True)
df_airportcodes.head(5)

Unnamed: 0,IATACode,AirportID,CountryCode,StateCode,ContinentCode,Municipality,AirportName,TypeOfAirport,ElevationInFeet,Latitude,Longitude,GPSCode,LocalCode
223,UTK,03N,MH,UTI,OC,Utirik Island,Utirik Airport,small_airport,4,169.852005,11.222,K03N,03N
440,OCA,07FA,US,FL,,Key Largo,Ocean Reef Club Airport,small_airport,8,-80.274803161621,25.325399398804,07FA,07FA
594,PQS,0AK,US,AK,,Pilot Station,Pilot Station Airport,small_airport,305,-162.899994,61.934601,,0AK
673,CSE,0CO2,US,CO,,Crested Butte,Crested Butte Airpark,small_airport,8980,-106.928341,38.851918,0CO2,0CO2
1088,JCY,0TE7,US,TX,,Johnson City,LBJ Ranch Airport,small_airport,1515,-98.6224975586,30.251800537100003,0TE7,0TE7


In [89]:
df_airportcodes.sample(5)

Unnamed: 0,IATACode,AirportID,CountryCode,StateCode,ContinentCode,Municipality,AirportName,TypeOfAirport,ElevationInFeet,Latitude,Longitude,GPSCode,LocalCode
26275,BTR,KBTR,US,LA,,Baton Rouge,Baton Rouge Metropolitan Airport,medium_airport,70.0,-91.149597,30.533199,KBTR,BTR
18600,KOI,EGPA,GB,SCT,EU,Orkney Islands,Kirkwall Airport,medium_airport,50.0,-2.9049999713897705,58.957801818847656,EGPA,
26027,KAE,KAE,US,AK,,Kake,Kake Seaplane Base,seaplane_base,,-133.945999,56.973,,KAE
17543,ILR,DNIL,NG,KW,AF,Ilorin,Ilorin International Airport,medium_airport,1126.0,4.493919849395752,8.440210342407227,DNIL,
31015,RYN,LFCY,FR,T,EU,Royan/MÃ©dis,Royan-MÃ©dis Airport,medium_airport,72.0,-0.9725000262260436,45.62810134887695,LFCY,


Cleaning Global Temperature data

In [90]:
df_temperature.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.0,2.0,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [91]:
df_temperature = df_temperature.rename(columns={'dt':'Date','City':'City','Country':'Country',
                                                'AverageTemperature':'AvgTemperature',
                                                'AverageTemperatureUncertainty':'AvgTempUncertainty',
                                                'Latitude':'Latitude','Longitude':'Longitude'})
                                                

In [92]:
df_temperature = pd.DataFrame(df_temperature, columns = ['Date','City','Country','AvgTemperature',
                                                         'AvgTempUncertainty','Latitude','Longitude'])

In [93]:
# find all unique country codes in temperature data to find used name for United States 
set(df_temperature["Country"].values)

{'Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Belarus',
 'Belgium',
 'Benin',
 'Bolivia',
 'Bosnia And Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Burma',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Congo',
 'Congo (Democratic Republic Of The)',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 "Côte D'Ivoire",
 'Denmark',
 'Djibouti',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Guinea Bissau',
 'Guyana',
 'Haiti',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jorda

In [94]:
df_temperature['AvgTemperature'] = df_temperature['AvgTemperature'].round(decimals=3)

In [95]:
df_temperature[(df_temperature.AvgTemperature.isnull()) & (df_temperature.AvgTempUncertainty.isnull())].shape[0]
df_temperature = df_temperature[~((df_temperature.AvgTemperature.isnull()) & (df_temperature.AvgTempUncertainty.isnull()))]

In [96]:
df_temperature.shape[0]

8235082

In [97]:
df_temperature.Country.unique().shape[0]

159

In [98]:
df_temperature.City.unique().shape[0]

3448

In [99]:
df_temperature.Country.unique()

array(['Denmark', 'Turkey', 'Kazakhstan', 'China', 'Spain', 'Germany',
       'Nigeria', 'Iran', 'Russia', 'Canada', "Côte D'Ivoire",
       'United Kingdom', 'Saudi Arabia', 'Japan', 'United States',
       'India', 'Benin', 'United Arab Emirates', 'Mexico', 'Venezuela',
       'Ghana', 'Ethiopia', 'Australia', 'Yemen', 'Indonesia', 'Morocco',
       'Pakistan', 'France', 'Libya', 'Burma', 'Brazil', 'South Africa',
       'Syria', 'Egypt', 'Algeria', 'Netherlands', 'Malaysia', 'Portugal',
       'Ecuador', 'Italy', 'Uzbekistan', 'Philippines', 'Madagascar',
       'Chile', 'Belgium', 'El Salvador', 'Romania', 'Peru', 'Colombia',
       'Tanzania', 'Tunisia', 'Turkmenistan', 'Israel', 'Eritrea',
       'Paraguay', 'Greece', 'New Zealand', 'Vietnam', 'Cameroon', 'Iraq',
       'Afghanistan', 'Argentina', 'Azerbaijan', 'Moldova', 'Mali',
       'Congo (Democratic Republic Of The)', 'Thailand',
       'Central African Republic', 'Bosnia And Herzegovina', 'Bangladesh',
       'Switzerland'

In [100]:
df_temperature.sample(5)

Unnamed: 0,Date,City,Country,AvgTemperature,AvgTempUncertainty,Latitude,Longitude
6845662,1973-09-01,Semnan,Iran,22,1,36.17N,53.70E
3269987,1923-03-01,Itajaí,Brazil,24,1,26.52S,48.36W
8585202,1994-05-01,Zonguldak,Turkey,16,1,40.99N,31.95E
2222984,2005-01-01,Encheng,China,15,0,21.70N,111.63E
3709114,1890-08-01,Kawagoe,Japan,25,1,36.17N,139.23E


In [101]:
us_temperature = df_temperature[df_temperature.Country=='United States']
us_temperature.shape[0]

661524

In [102]:
us_temperature.City.unique().shape[0]

248

In [103]:
us_temperature[((us_temperature[['Date', 'City', 'Country']].duplicated()) & (~us_temperature.City.isin(['Arlington'])) ) ].head()

Unnamed: 0,Date,City,Country,AvgTemperature,AvgTempUncertainty,Latitude,Longitude
485395,1775-04-01,Aurora,United States,9,2,40.99N,87.34W
485396,1775-05-01,Aurora,United States,17,2,40.99N,87.34W
485400,1775-09-01,Aurora,United States,17,2,40.99N,87.34W
485401,1775-10-01,Aurora,United States,12,2,40.99N,87.34W
485402,1775-11-01,Aurora,United States,7,3,40.99N,87.34W


In [104]:
us_temperature[us_temperature[['Date', 'City', 'Country']].duplicated()]['City'].unique()

array(['Arlington', 'Aurora', 'Columbus', 'Glendale', 'Pasadena',
       'Peoria', 'Richmond', 'Springfield'], dtype=object)

In [105]:
us_temperature = us_temperature.drop_duplicates(['Date', 'City', 'Country'],keep= 'first')
us_temperature.shape

(639649, 7)

In [106]:
us_temperature.query("Date=='1820-01-01' and City=='Arlington' and Country=='United States'")

Unnamed: 0,Date,City,Country,AvgTemperature,AvgTempUncertainty,Latitude,Longitude
401036,1820-01-01,Arlington,United States,3,3,32.95N,96.70W


In [107]:
us_temperature.query("Date=='1893-01-01' and City=='Yonkers' and Country=='United States'")

Unnamed: 0,Date,City,Country,AvgTemperature,AvgTempUncertainty,Latitude,Longitude
8437798,1893-01-01,Yonkers,United States,-8,1,40.99N,74.56W


In [108]:
us_temperature = us_temperature.sort_values(by=['Country', 'City', 'AvgTemperature'],  ascending=False)

In [109]:
us_temperature = us_temperature.groupby(['Country', 'City']).tail(5).reset_index(drop=True)

In [110]:
us_temperature.query("City=='Arlington' and Country=='United States'")

Unnamed: 0,Date,City,Country,AvgTemperature,AvgTempUncertainty,Latitude,Longitude
1185,1774-01-01,Arlington,United States,-3,7,39.38N,76.99W
1186,1763-01-01,Arlington,United States,-3,2,39.38N,76.99W
1187,1761-01-01,Arlington,United States,-4,6,39.38N,76.99W
1188,1792-01-01,Arlington,United States,-4,5,39.38N,76.99W
1189,1752-01-01,Arlington,United States,-4,2,39.38N,76.99W


In [111]:
us_temperature.query("City=='Yonkers' and Country=='United States'")

Unnamed: 0,Date,City,Country,AvgTemperature,AvgTempUncertainty,Latitude,Longitude
0,1893-01-01,Yonkers,United States,-8,1,40.99N,74.56W
1,1977-01-01,Yonkers,United States,-8,0,40.99N,74.56W
2,1918-01-01,Yonkers,United States,-9,0,40.99N,74.56W
3,1934-02-01,Yonkers,United States,-9,0,40.99N,74.56W
4,1857-01-01,Yonkers,United States,-9,1,40.99N,74.56W


In [112]:
df_temperature = df_temperature[['City']]
lst = df_temperature.City.str.lower().unique()
us_temperature = us_temperature[~us_temperature.City.str.lower().isin(lst)]

In [113]:
# clear missing temperature values
us_temperature.dropna(inplace=True)

## Define the Data Model

### Conceptual Data Model

## Run Pipelines to Model the Data

### Mapping Out Data Pipelines

### Create the data model

#### Building the data pipelines to create the data model.

In [114]:
%run create_tables.py

In [115]:
conn = psycopg2.connect("host=localhost dbname=nagamohan user=postgres password=Mudu#1977")
cur = conn.cursor()

In [116]:
for index, row in df_demographics.iterrows():
    cur.execute(demographic_insert, list(row.values))
    conn.commit()

In [117]:
for index, row in us_temperature.iterrows():
    cur.execute(temperature_insert, list(row.values))
    conn.commit()

In [118]:
for index, row in df_airportcodes.iterrows():
    cur.execute(airport_insert, list(row.values))
    conn.commit()

TypeError: not all arguments converted during string formatting

In [119]:
for index, row in us_immigration.iterrows():
    cur.execute(immigration_insert, list(row.values))
    conn.commit()

IndexError: list index out of range