### EDA on I94 Immigration Data
- I94YR - 4 digit year
- I94MON - Numeric month
- I94CIT & I94RES - This format shows all the valid and invalid codes for processing
- I94PORT - This format shows all the valid and invalid codes for processing
- ARRDATE is the Arrival Date in the USA. It is a SAS date numeric field to which a permanent format has not been applied. Please apply whichever date format works for you.
- I94MODE - There are missing values as well as not reported (9)	
- I94ADDR - There are lots of invalid codes in this variable, and the list below shows what we have found to be valid; everything else goes into 'other'
- DEPDATE is the Departure Date from the USA. It is a SAS date numeric field to which a permanent format has not been applied. Please apply whichever date format works for you.
- I94BIR - Age of Respondent in Years
- I94VISA - Visa codes collapsed into three categories
- COUNT - Used for summary statistics
- DTADFILE - Character Date Field - Date added to I-94 Files - CIC does not use
- VISAPOST - Department of State where Visa was issued - CIC does not use
- OCCUP - Occupation that will be performed in U.S. - CIC does not use
- ENTDEPA - Arrival Flag - admitted or paroled into the U.S. - CIC does not use
- ENTDEPD - Departure Flag - Departed, lost I-94 or is deceased - CIC does not use
- ENTDEPU - Update Flag - Either apprehended, overstayed, adjusted to perm residence - CIC does not use
- MATFLAG - Match flag - Match of arrival and departure records
- BIRYEAR - 4 digit year of birth
- DTADDTO - Character Date Field - Date to which admitted to U.S. (allowed to stay until) - CIC does not use
- GENDER - Non-immigrant sex
- INSNUM - INS number
- AIRLINE - Airline used to arrive in U.S.
- ADMNUM - Admission Number
- FLTNO - Flight number of Airline used to arrive in U.S.
- VISATYPE - Class of admission legally admitting the non-immigrant to temporarily stay in U.S.

In [1]:
import os
import datetime
import pandas as pd
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 1028)
import configparser

import boto3

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, LongType, MapType

In [2]:
# Parse the capstone settings file. `read` returns the list of files it managed
# to parse, which is the value this cell displays.
config = configparser.ConfigParser()
config.read('/home/workspace/capstone.cfg')

['/home/workspace/capstone.cfg']

In [3]:
# Read the AWS credentials and the S3 input location from the capstone settings
# file. The file is parsed in this cell so that the cell defines everything it
# needs: `input_data_dir` is used by every load further down, and without it a
# fresh "Restart & Run All" stops with a NameError.
config = configparser.ConfigParser()
config.read('/home/workspace/capstone.cfg')

# Credentials are exported through the environment rather than hard-coded here.
os.environ["AWS_ACCESS_KEY_ID"] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['AWS']['AWS_SECRET_ACCESS_KEY']

# Root location of the dictionary CSVs and the raw immigration data.
input_data_dir = config['S3']['INPUT_S3_BUCKET']

In [4]:
# Create (or reuse) the Hive-enabled SparkSession used by every cell below.
# The extra packages are the spark-sas7bdat data source and hadoop-aws.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11,org.apache.hadoop:hadoop-aws:2.7.0")
    .enableHiveSupport()
    .getOrCreate()
)

In [5]:
# Lookup table for i94addr codes (columns used below: key, value).
i94addr_dictionary_spark_df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', encoding="ISO-8859-1")
    .load(f'{input_data_dir}/dictionary_data/i94addr_dictionary.csv')
)

# Raw I94 immigration records, read through the spark-sas7bdat data source.
i94_immigration_spark_df = (
    spark.read
    .format('com.github.saurfang.sas.spark')
    .load(
        f'{input_data_dir}/i94_immigration_2016_raw/i94_apr16_sub.sas7bdat',
        forceLowercaseNames=True,
        inferLong=True,
    )
)

# cicid is turned into a string identifier by way of an integer cast.
i94_immigration_spark_df = i94_immigration_spark_df.withColumn(
    'cicid', F.col('cicid').cast('integer').cast('string')
)

# The remaining year / month / code / age columns become plain integers.
for integer_column in ('i94yr', 'i94mon', 'i94cit', 'i94res', 'i94mode', 'i94bir', 'i94visa'):
    i94_immigration_spark_df = i94_immigration_spark_df.withColumn(
        integer_column, F.col(integer_column).cast('integer')
    )

# Replace each i94addr code with its dictionary text: left-join on the code,
# drop the code columns, and let the dictionary text take over the i94addr name.
i94_immigration_spark_df = (
    i94_immigration_spark_df
    .join(
        i94addr_dictionary_spark_df,
        i94_immigration_spark_df['i94addr'] == i94addr_dictionary_spark_df['key'],
        'left',
    )
    .drop('key', 'i94addr')
    .withColumnRenamed('value', 'i94addr')
)

# Codes with no dictionary match were measured at around 5% of the records, so
# those rows are kept and labelled instead of being dropped.
i94_immigration_spark_df = i94_immigration_spark_df.fillna(
    'NOT_IN_i94addr_DICTIONARY', subset=['i94addr']
)

In [6]:
# Preview the first 10 rows after the i94addr mapping.
i94_immigration_spark_df.limit(10).toPandas()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,i94addr
0,6,2016,4,692,692,XXX,20573.0,,,37,2,1.0,,,,T,,U,,1979.0,10282016,,,,1897628000.0,,B2,NOT_IN_i94addr_DICTIONARY
1,7,2016,4,254,276,ATL,20551.0,1.0,,25,3,1.0,20130811.0,SEO,,G,,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1,ALABAMA
2,15,2016,4,101,101,WAS,20545.0,1.0,20691.0,55,2,1.0,20160401.0,,,T,O,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2,MICHIGAN
3,16,2016,4,101,101,NYC,20545.0,1.0,20567.0,28,2,1.0,20160401.0,,,O,O,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS
4,17,2016,4,101,101,NYC,20545.0,1.0,20567.0,4,2,1.0,20160401.0,,,O,O,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS
5,18,2016,4,101,101,NYC,20545.0,1.0,20555.0,57,1,1.0,20160401.0,,,O,O,,M,1959.0,09302016,,,AZ,92471040000.0,602.0,B1,MICHIGAN
6,19,2016,4,101,101,NYC,20545.0,1.0,20558.0,63,2,1.0,20160401.0,,,O,K,,M,1953.0,09302016,,,AZ,92471400000.0,602.0,B2,NEW JERSEY
7,20,2016,4,101,101,NYC,20545.0,1.0,20558.0,57,2,1.0,20160401.0,,,O,K,,M,1959.0,09302016,,,AZ,92471610000.0,602.0,B2,NEW JERSEY
8,21,2016,4,101,101,NYC,20545.0,1.0,20553.0,46,2,1.0,20160401.0,,,O,O,,M,1970.0,09302016,,,AZ,92470800000.0,602.0,B2,NEW YORK
9,22,2016,4,101,101,NYC,20545.0,1.0,20562.0,48,1,1.0,20160401.0,,,O,O,,M,1968.0,09302016,,,AZ,92478490000.0,608.0,B1,NEW YORK


In [7]:
# Lookup table for i94cit codes (columns used below: key, value).
i94cit_dictionary_spark_df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', encoding="ISO-8859-1")
    .load(f'{input_data_dir}/dictionary_data/i94cit_dictionary.csv')
)

# Dictionary entries whose text contains "INVALID: ", "No Country Code " or
# "should not show" are collapsed into a single placeholder label.
i94cit_placeholder_condition = (
    F.col('value').rlike('INVALID: ')
    | F.col('value').rlike('No Country Code ')
    | F.col('value').rlike('should not show')
)
i94cit_dictionary_spark_df = i94cit_dictionary_spark_df.withColumn(
    'value',
    F.when(i94cit_placeholder_condition, 'IN_i94cit_DICTIONARY_BUT_INVALID_UNKNOWN_NOTSHOW')
    .otherwise(F.col('value')),
)

# Replace each i94cit code with its dictionary text via a left join.
i94_immigration_spark_df = (
    i94_immigration_spark_df
    .join(
        i94cit_dictionary_spark_df,
        i94_immigration_spark_df['i94cit'] == i94cit_dictionary_spark_df['key'],
        'left',
    )
    .drop('key', 'i94cit')
    .withColumnRenamed('value', 'i94cit')
)

# Rows whose code has no dictionary entry are labelled rather than left null.
i94_immigration_spark_df = i94_immigration_spark_df.fillna(
    'NOT_IN_i94cit_DICTIONARY', subset=['i94cit']
)

In [8]:
# Preview the first 10 rows after the i94cit mapping.
i94_immigration_spark_df.limit(10).toPandas()

Unnamed: 0,cicid,i94yr,i94mon,i94res,i94port,arrdate,i94mode,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,i94addr,i94cit
0,6,2016,4,692,XXX,20573.0,,,37,2,1.0,,,,T,,U,,1979.0,10282016,,,,1897628000.0,,B2,NOT_IN_i94addr_DICTIONARY,ECUADOR
1,7,2016,4,276,ATL,20551.0,1.0,,25,3,1.0,20130811.0,SEO,,G,,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1,ALABAMA,NOT_IN_i94cit_DICTIONARY
2,15,2016,4,101,WAS,20545.0,1.0,20691.0,55,2,1.0,20160401.0,,,T,O,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2,MICHIGAN,ALBANIA
3,16,2016,4,101,NYC,20545.0,1.0,20567.0,28,2,1.0,20160401.0,,,O,O,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA
4,17,2016,4,101,NYC,20545.0,1.0,20567.0,4,2,1.0,20160401.0,,,O,O,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA
5,18,2016,4,101,NYC,20545.0,1.0,20555.0,57,1,1.0,20160401.0,,,O,O,,M,1959.0,09302016,,,AZ,92471040000.0,602.0,B1,MICHIGAN,ALBANIA
6,19,2016,4,101,NYC,20545.0,1.0,20558.0,63,2,1.0,20160401.0,,,O,K,,M,1953.0,09302016,,,AZ,92471400000.0,602.0,B2,NEW JERSEY,ALBANIA
7,20,2016,4,101,NYC,20545.0,1.0,20558.0,57,2,1.0,20160401.0,,,O,K,,M,1959.0,09302016,,,AZ,92471610000.0,602.0,B2,NEW JERSEY,ALBANIA
8,21,2016,4,101,NYC,20545.0,1.0,20553.0,46,2,1.0,20160401.0,,,O,O,,M,1970.0,09302016,,,AZ,92470800000.0,602.0,B2,NEW YORK,ALBANIA
9,22,2016,4,101,NYC,20545.0,1.0,20562.0,48,1,1.0,20160401.0,,,O,O,,M,1968.0,09302016,,,AZ,92478490000.0,608.0,B1,NEW YORK,ALBANIA


In [9]:
# Lookup table for i94mode codes (columns used below: key, value).
i94mode_dictionary_spark_df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', encoding="ISO-8859-1")
    .load(f'{input_data_dir}/dictionary_data/i94mode_dictionary.csv')
)

# Replace each i94mode code with its dictionary text via a left join.
i94_immigration_spark_df = (
    i94_immigration_spark_df
    .join(
        i94mode_dictionary_spark_df,
        i94_immigration_spark_df['i94mode'] == i94mode_dictionary_spark_df['key'],
        'left',
    )
    .drop('key', 'i94mode')
    .withColumnRenamed('value', 'i94mode')
)

# Rows without a matching dictionary entry are labelled rather than left null.
i94_immigration_spark_df = i94_immigration_spark_df.fillna(
    'NOT_IN_i94mode_DICTIONARY', subset=['i94mode']
)

In [10]:
# Preview the first 10 rows after the i94mode mapping.
i94_immigration_spark_df.limit(10).toPandas()

Unnamed: 0,cicid,i94yr,i94mon,i94res,i94port,arrdate,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,i94addr,i94cit,i94mode
0,6,2016,4,692,XXX,20573.0,,37,2,1.0,,,,T,,U,,1979.0,10282016,,,,1897628000.0,,B2,NOT_IN_i94addr_DICTIONARY,ECUADOR,NOT_IN_i94mode_DICTIONARY
1,7,2016,4,276,ATL,20551.0,,25,3,1.0,20130811.0,SEO,,G,,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1,ALABAMA,NOT_IN_i94cit_DICTIONARY,Air
2,15,2016,4,101,WAS,20545.0,20691.0,55,2,1.0,20160401.0,,,T,O,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2,MICHIGAN,ALBANIA,Air
3,16,2016,4,101,NYC,20545.0,20567.0,28,2,1.0,20160401.0,,,O,O,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA,Air
4,17,2016,4,101,NYC,20545.0,20567.0,4,2,1.0,20160401.0,,,O,O,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA,Air
5,18,2016,4,101,NYC,20545.0,20555.0,57,1,1.0,20160401.0,,,O,O,,M,1959.0,09302016,,,AZ,92471040000.0,602.0,B1,MICHIGAN,ALBANIA,Air
6,19,2016,4,101,NYC,20545.0,20558.0,63,2,1.0,20160401.0,,,O,K,,M,1953.0,09302016,,,AZ,92471400000.0,602.0,B2,NEW JERSEY,ALBANIA,Air
7,20,2016,4,101,NYC,20545.0,20558.0,57,2,1.0,20160401.0,,,O,K,,M,1959.0,09302016,,,AZ,92471610000.0,602.0,B2,NEW JERSEY,ALBANIA,Air
8,21,2016,4,101,NYC,20545.0,20553.0,46,2,1.0,20160401.0,,,O,O,,M,1970.0,09302016,,,AZ,92470800000.0,602.0,B2,NEW YORK,ALBANIA,Air
9,22,2016,4,101,NYC,20545.0,20562.0,48,1,1.0,20160401.0,,,O,O,,M,1968.0,09302016,,,AZ,92478490000.0,608.0,B1,NEW YORK,ALBANIA,Air


In [11]:
# Lookup table for i94port codes (columns used below: key, value).
i94port_dictionary_spark_df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', encoding="ISO-8859-1")
    .load(f'{input_data_dir}/dictionary_data/i94port_dictionary.csv')
)

# Relabel (not drop) dictionary entries whose text contains "No PORT Code ".
# The label names the i94port dictionary, following the
# IN_<column>_DICTIONARY_BUT_... convention of the i94cit and i94res lookups.
i94port_dictionary_spark_df = i94port_dictionary_spark_df.withColumn(
    'value',
    F.when(F.col('value').rlike('No PORT Code '), 'IN_i94port_DICTIONARY_BUT_NO_PORT_CODE')
    .otherwise(F.col('value')),
)

# Replace each i94port code with its dictionary text via a left join.
i94_immigration_spark_df = (
    i94_immigration_spark_df
    .join(
        i94port_dictionary_spark_df,
        i94_immigration_spark_df['i94port'] == i94port_dictionary_spark_df['key'],
        'left',
    )
    .drop('key', 'i94port')
    .withColumnRenamed('value', 'i94port')
)

# Rows whose code has no dictionary entry are labelled rather than left null.
i94_immigration_spark_df = i94_immigration_spark_df.fillna(
    'NOT_IN_i94port_DICTIONARY', subset=['i94port']
)

In [12]:
# Preview the first 10 rows after the i94port mapping.
i94_immigration_spark_df.limit(10).toPandas()

Unnamed: 0,cicid,i94yr,i94mon,i94res,arrdate,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,i94addr,i94cit,i94mode,i94port
0,6,2016,4,692,20573.0,,37,2,1.0,,,,T,,U,,1979.0,10282016,,,,1897628000.0,,B2,NOT_IN_i94addr_DICTIONARY,ECUADOR,NOT_IN_i94mode_DICTIONARY,NOT REPORTED/UNKNOWN
1,7,2016,4,276,20551.0,,25,3,1.0,20130811.0,SEO,,G,,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1,ALABAMA,NOT_IN_i94cit_DICTIONARY,Air,"ATLANTA, GA"
2,15,2016,4,101,20545.0,20691.0,55,2,1.0,20160401.0,,,T,O,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2,MICHIGAN,ALBANIA,Air,WASHINGTON DC
3,16,2016,4,101,20545.0,20567.0,28,2,1.0,20160401.0,,,O,O,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA,Air,"NEW YORK, NY"
4,17,2016,4,101,20545.0,20567.0,4,2,1.0,20160401.0,,,O,O,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA,Air,"NEW YORK, NY"
5,18,2016,4,101,20545.0,20555.0,57,1,1.0,20160401.0,,,O,O,,M,1959.0,09302016,,,AZ,92471040000.0,602.0,B1,MICHIGAN,ALBANIA,Air,"NEW YORK, NY"
6,19,2016,4,101,20545.0,20558.0,63,2,1.0,20160401.0,,,O,K,,M,1953.0,09302016,,,AZ,92471400000.0,602.0,B2,NEW JERSEY,ALBANIA,Air,"NEW YORK, NY"
7,20,2016,4,101,20545.0,20558.0,57,2,1.0,20160401.0,,,O,K,,M,1959.0,09302016,,,AZ,92471610000.0,602.0,B2,NEW JERSEY,ALBANIA,Air,"NEW YORK, NY"
8,21,2016,4,101,20545.0,20553.0,46,2,1.0,20160401.0,,,O,O,,M,1970.0,09302016,,,AZ,92470800000.0,602.0,B2,NEW YORK,ALBANIA,Air,"NEW YORK, NY"
9,22,2016,4,101,20545.0,20562.0,48,1,1.0,20160401.0,,,O,O,,M,1968.0,09302016,,,AZ,92478490000.0,608.0,B1,NEW YORK,ALBANIA,Air,"NEW YORK, NY"


In [13]:
# Lookup table for i94res codes (columns used below: key, value).
i94res_dictionary_spark_df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', encoding="ISO-8859-1")
    .load(f'{input_data_dir}/dictionary_data/i94res_dictionary.csv')
)

# Dictionary entries whose text contains "INVALID: ", "No Country Code " or
# "should not show" are collapsed into a single placeholder label.
i94res_placeholder_condition = (
    F.col('value').rlike('INVALID: ')
    | F.col('value').rlike('No Country Code ')
    | F.col('value').rlike('should not show')
)
i94res_dictionary_spark_df = i94res_dictionary_spark_df.withColumn(
    'value',
    F.when(i94res_placeholder_condition, 'IN_i94res_DICTIONARY_BUT_INVALID_UNKNOWN_NOTSHOW')
    .otherwise(F.col('value')),
)

# Replace each i94res code with its dictionary text via a left join.
i94_immigration_spark_df = (
    i94_immigration_spark_df
    .join(
        i94res_dictionary_spark_df,
        i94_immigration_spark_df['i94res'] == i94res_dictionary_spark_df['key'],
        'left',
    )
    .drop('key', 'i94res')
    .withColumnRenamed('value', 'i94res')
)

# Rows whose code has no dictionary entry are labelled rather than left null.
i94_immigration_spark_df = i94_immigration_spark_df.fillna(
    'NOT_IN_i94res_DICTIONARY', subset=['i94res']
)

In [14]:
# Preview the first 10 rows after the i94res mapping.
i94_immigration_spark_df.limit(10).toPandas()

Unnamed: 0,cicid,i94yr,i94mon,arrdate,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,i94addr,i94cit,i94mode,i94port,i94res
0,6,2016,4,20573.0,,37,2,1.0,,,,T,,U,,1979.0,10282016,,,,1897628000.0,,B2,NOT_IN_i94addr_DICTIONARY,ECUADOR,NOT_IN_i94mode_DICTIONARY,NOT REPORTED/UNKNOWN,ECUADOR
1,7,2016,4,20551.0,,25,3,1.0,20130811.0,SEO,,G,,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1,ALABAMA,NOT_IN_i94cit_DICTIONARY,Air,"ATLANTA, GA",SOUTH KOREA
2,15,2016,4,20545.0,20691.0,55,2,1.0,20160401.0,,,T,O,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2,MICHIGAN,ALBANIA,Air,WASHINGTON DC,ALBANIA
3,16,2016,4,20545.0,20567.0,28,2,1.0,20160401.0,,,O,O,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA,Air,"NEW YORK, NY",ALBANIA
4,17,2016,4,20545.0,20567.0,4,2,1.0,20160401.0,,,O,O,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA,Air,"NEW YORK, NY",ALBANIA
5,18,2016,4,20545.0,20555.0,57,1,1.0,20160401.0,,,O,O,,M,1959.0,09302016,,,AZ,92471040000.0,602.0,B1,MICHIGAN,ALBANIA,Air,"NEW YORK, NY",ALBANIA
6,19,2016,4,20545.0,20558.0,63,2,1.0,20160401.0,,,O,K,,M,1953.0,09302016,,,AZ,92471400000.0,602.0,B2,NEW JERSEY,ALBANIA,Air,"NEW YORK, NY",ALBANIA
7,20,2016,4,20545.0,20558.0,57,2,1.0,20160401.0,,,O,K,,M,1959.0,09302016,,,AZ,92471610000.0,602.0,B2,NEW JERSEY,ALBANIA,Air,"NEW YORK, NY",ALBANIA
8,21,2016,4,20545.0,20553.0,46,2,1.0,20160401.0,,,O,O,,M,1970.0,09302016,,,AZ,92470800000.0,602.0,B2,NEW YORK,ALBANIA,Air,"NEW YORK, NY",ALBANIA
9,22,2016,4,20545.0,20562.0,48,1,1.0,20160401.0,,,O,O,,M,1968.0,09302016,,,AZ,92478490000.0,608.0,B1,NEW YORK,ALBANIA,Air,"NEW YORK, NY",ALBANIA


In [15]:
# Lookup table for i94visa codes (columns used below: key, value).
i94visa_dictionary_spark_df = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', encoding="ISO-8859-1")
    .load(f'{input_data_dir}/dictionary_data/i94visa_dictionary.csv')
)

# Replace each i94visa code with its dictionary text via a left join.
i94_immigration_spark_df = (
    i94_immigration_spark_df
    .join(
        i94visa_dictionary_spark_df,
        i94_immigration_spark_df['i94visa'] == i94visa_dictionary_spark_df['key'],
        'left',
    )
    .drop('key', 'i94visa')
    .withColumnRenamed('value', 'i94visa')
)

# Rows whose code has no dictionary entry are labelled rather than left null.
i94_immigration_spark_df = i94_immigration_spark_df.fillna(
    'NOT_IN_i94visa_DICTIONARY', subset=['i94visa']
)

In [16]:
# Preview the first 10 rows after the i94visa mapping.
i94_immigration_spark_df.limit(10).toPandas()

Unnamed: 0,cicid,i94yr,i94mon,arrdate,depdate,i94bir,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype,i94addr,i94cit,i94mode,i94port,i94res,i94visa
0,6,2016,4,20573.0,,37,1.0,,,,T,,U,,1979.0,10282016,,,,1897628000.0,,B2,NOT_IN_i94addr_DICTIONARY,ECUADOR,NOT_IN_i94mode_DICTIONARY,NOT REPORTED/UNKNOWN,ECUADOR,Pleasure
1,7,2016,4,20551.0,,25,1.0,20130811.0,SEO,,G,,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1,ALABAMA,NOT_IN_i94cit_DICTIONARY,Air,"ATLANTA, GA",SOUTH KOREA,Student
2,15,2016,4,20545.0,20691.0,55,1.0,20160401.0,,,T,O,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2,MICHIGAN,ALBANIA,Air,WASHINGTON DC,ALBANIA,Pleasure
3,16,2016,4,20545.0,20567.0,28,1.0,20160401.0,,,O,O,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA,Air,"NEW YORK, NY",ALBANIA,Pleasure
4,17,2016,4,20545.0,20567.0,4,1.0,20160401.0,,,O,O,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2,MASSACHUSETTS,ALBANIA,Air,"NEW YORK, NY",ALBANIA,Pleasure
5,18,2016,4,20545.0,20555.0,57,1.0,20160401.0,,,O,O,,M,1959.0,09302016,,,AZ,92471040000.0,602.0,B1,MICHIGAN,ALBANIA,Air,"NEW YORK, NY",ALBANIA,Business
6,19,2016,4,20545.0,20558.0,63,1.0,20160401.0,,,O,K,,M,1953.0,09302016,,,AZ,92471400000.0,602.0,B2,NEW JERSEY,ALBANIA,Air,"NEW YORK, NY",ALBANIA,Pleasure
7,20,2016,4,20545.0,20558.0,57,1.0,20160401.0,,,O,K,,M,1959.0,09302016,,,AZ,92471610000.0,602.0,B2,NEW JERSEY,ALBANIA,Air,"NEW YORK, NY",ALBANIA,Pleasure
8,21,2016,4,20545.0,20553.0,46,1.0,20160401.0,,,O,O,,M,1970.0,09302016,,,AZ,92470800000.0,602.0,B2,NEW YORK,ALBANIA,Air,"NEW YORK, NY",ALBANIA,Pleasure
9,22,2016,4,20545.0,20562.0,48,1.0,20160401.0,,,O,O,,M,1968.0,09302016,,,AZ,92478490000.0,608.0,B1,NEW YORK,ALBANIA,Air,"NEW YORK, NY",ALBANIA,Business


In [17]:
# Each lookup join appended its mapped column at the end of the frame; select
# the columns explicitly to put them back into a fixed order.
ordered_i94_columns = [
    'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port', 'arrdate',
    'i94mode', 'i94addr', 'depdate', 'i94bir', 'i94visa', 'count', 'dtadfile',
    'visapost', 'occup', 'entdepa', 'entdepd', 'entdepu', 'matflag', 'biryear',
    'dtaddto', 'gender', 'insnum', 'airline', 'admnum', 'fltno', 'visatype',
]
i94_immigration_spark_df = i94_immigration_spark_df.select(*ordered_i94_columns)

In [18]:
# Preview the first 10 rows after the columns have been reordered.
i94_immigration_spark_df.limit(10).toPandas()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,i94bir,i94visa,count,dtadfile,visapost,occup,entdepa,entdepd,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,6,2016,4,ECUADOR,ECUADOR,NOT REPORTED/UNKNOWN,20573.0,NOT_IN_i94mode_DICTIONARY,NOT_IN_i94addr_DICTIONARY,,37,Pleasure,1.0,,,,T,,U,,1979.0,10282016,,,,1897628000.0,,B2
1,7,2016,4,NOT_IN_i94cit_DICTIONARY,SOUTH KOREA,"ATLANTA, GA",20551.0,Air,ALABAMA,,25,Student,1.0,20130811.0,SEO,,G,,Y,,1991.0,D/S,M,,,3736796000.0,296.0,F1
2,15,2016,4,ALBANIA,ALBANIA,WASHINGTON DC,20545.0,Air,MICHIGAN,20691.0,55,Pleasure,1.0,20160401.0,,,T,O,,M,1961.0,09302016,M,,OS,666643200.0,93.0,B2
3,16,2016,4,ALBANIA,ALBANIA,"NEW YORK, NY",20545.0,Air,MASSACHUSETTS,20567.0,28,Pleasure,1.0,20160401.0,,,O,O,,M,1988.0,09302016,,,AA,92468460000.0,199.0,B2
4,17,2016,4,ALBANIA,ALBANIA,"NEW YORK, NY",20545.0,Air,MASSACHUSETTS,20567.0,4,Pleasure,1.0,20160401.0,,,O,O,,M,2012.0,09302016,,,AA,92468460000.0,199.0,B2
5,18,2016,4,ALBANIA,ALBANIA,"NEW YORK, NY",20545.0,Air,MICHIGAN,20555.0,57,Business,1.0,20160401.0,,,O,O,,M,1959.0,09302016,,,AZ,92471040000.0,602.0,B1
6,19,2016,4,ALBANIA,ALBANIA,"NEW YORK, NY",20545.0,Air,NEW JERSEY,20558.0,63,Pleasure,1.0,20160401.0,,,O,K,,M,1953.0,09302016,,,AZ,92471400000.0,602.0,B2
7,20,2016,4,ALBANIA,ALBANIA,"NEW YORK, NY",20545.0,Air,NEW JERSEY,20558.0,57,Pleasure,1.0,20160401.0,,,O,K,,M,1959.0,09302016,,,AZ,92471610000.0,602.0,B2
8,21,2016,4,ALBANIA,ALBANIA,"NEW YORK, NY",20545.0,Air,NEW YORK,20553.0,46,Pleasure,1.0,20160401.0,,,O,O,,M,1970.0,09302016,,,AZ,92470800000.0,602.0,B2
9,22,2016,4,ALBANIA,ALBANIA,"NEW YORK, NY",20545.0,Air,NEW YORK,20562.0,48,Business,1.0,20160401.0,,,O,O,,M,1968.0,09302016,,,AZ,92478490000.0,608.0,B1


In [19]:
# Stop the SparkSession now that the EDA is finished.
spark.stop()