# Data Setup and Exploration

### **Setup**

**Import Required Libraries**

In [1]:
import glob
import gzip
import os
import pandas as pd
import requests
import zipfile

from google.colab import drive, files

**Load Datasets into Local Colab Storage**

The data is sourced from CareerBuilder.com's [Job Recommendation Challenge](https://www.kaggle.com/c/job-recommendation/data), hosted on Kaggle in 2012.

*Technical Notes:*
 
*   Because the files are large, this cell takes a few minutes to run (< 5 minutes).

*   Unfortunately, data does not persist in Colab's local storage between sessions, so you have to rerun this cell at the start of each new session.

In [2]:
# If data files are not already in local storage
if not os.path.isdir("data"):

  # Read/write in 1 MiB pieces so the large files never sit fully in memory
  chunk_size = 1024 * 1024

  # Retrieve zip file from Dropbox and stream it to the base/default folder.
  # raise_for_status() aborts on an HTTP error instead of saving the error
  # page as "data.zip"; the timeout keeps a stalled connection from hanging.
  with requests.get(
      "https://www.dropbox.com/s/v2fdobitjrjieku/data.zip?dl=1",
      stream=True,
      timeout=60,
  ) as r:
    r.raise_for_status()
    with open("data.zip", 'wb') as f:
      for chunk in r.iter_content(chunk_size=chunk_size):
        f.write(chunk)

  # Extract zip file contents to create local data folder with .tsv.gz files
  with zipfile.ZipFile("data.zip", 'r') as zip_ref:
    zip_ref.extractall(".")

  # For each extracted .tsv.gz file path
  for path in glob.glob("data/*.tsv.gz"):

    # Create destination file path by dropping the trailing ".gz"
    dest_path = f'data/{os.path.basename(path)[:-3]}'

    # Decompress the file chunk by chunk and write it to the destination
    with gzip.open(path, 'rb') as f, open(dest_path, 'wb') as g:
      for chunk in iter(lambda: f.read(chunk_size), b""):
        g.write(chunk)

    # Delete original compressed file
    os.remove(path)

  # Delete zip file
  os.remove("data.zip")

### **Exploration**

**Users**

Potential disadvantaged groups to examine:


*   Users who have a high-school diploma or less
*   Users based in zip codes associated with lower incomes/mobility
*   Users whose graduation date would put them in an older age bracket

*users.tsv - Holds all users and their metadata*

In [None]:
# File Preview
# ZipCode is read as text: parsed as a number, zip codes lose their leading
# zeros (e.g. a New Jersey zip shows up as 8094), which would break any later
# lookup of zip-level income/mobility data. Reading it as a string keeps any
# leading zeros that are present in the file.
users = pd.read_csv("data/users.tsv", sep="\t", dtype={"ZipCode": str})
users.head(5)

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,47,1,Train,Paramount,CA,US,90723,High School,,1999-06-01 00:00:00,3,10.0,Yes,No,0
1,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0
2,80,1,Train,Williamstown,NJ,US,8094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5
3,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0
4,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0


In [None]:
# How many users fall under each DegreeType value, shown as a one-column table
pd.DataFrame(users["DegreeType"].value_counts())

Unnamed: 0,DegreeType
Bachelor's,104210
,100153
High School,93305
Associate's,45786
Master's,35330
Vocational,6981
PhD,3943


In [None]:
# Size of the users table, i.e. the total number of users in the dataset
users.shape[0]

389708

*user_history.tsv - Holds users' past job title(s)*

In [None]:
# Load the tab-separated job-history file and show its first five rows
user_history = pd.read_csv("data/user_history.tsv", delimiter="\t")
user_history.head()

Unnamed: 0,UserID,WindowID,Split,Sequence,JobTitle
0,47,1,Train,1,National Space Communication Programs-Special ...
1,47,1,Train,2,Detention Officer
2,47,1,Train,3,"Passenger Screener, TSA"
3,72,1,Train,1,"Lecturer, Department of Anthropology"
4,72,1,Train,2,Student Assistant


In [None]:
# Job titles held by one example user (UserID 47), as a plain Python list
user_history.loc[user_history["UserID"] == 47, "JobTitle"].tolist()

['National Space Communication Programs-Special Program Supervisor',
 'Detention Officer',
 'Passenger Screener, TSA']

**Jobs**

*jobs.tsv: Holds the jobs available on CareerBuilder.com during a 13-day window*

In [None]:
# File preview for jobs listed in the first of the seven windows
# Note: One line of this file is malformed (12 fields instead of 11). It is
# skipped with a warning here and should eventually be corrected at the source.
# on_bad_lines="warn" replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in pandas 2.0 (where it raises a TypeError).
jobs1 = pd.read_csv("data/jobs1.tsv", sep="\t", on_bad_lines="warn")
jobs1.head(5)

b'Skipping line 122433: expected 11 fields, saw 12\n'
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
0,1,1,Security Engineer/Technical Lead,<p>Security Clearance Required:&nbsp; Top Secr...,<p>SKILL SET</p>\r<p>&nbsp;</p>\r<p>Network Se...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59
1,4,1,SAP Business Analyst / WM,<strong>NO Corp. to Corp resumes&nbsp;are bein...,<p><b>WHAT YOU NEED: </b></p>\r<p>Four year co...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59
2,7,1,P/T HUMAN RESOURCES ASSISTANT,<b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59
3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59
4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59


In [None]:
# Row count of the first-window jobs table, i.e. the number of jobs loaded
jobs1.shape[0]

285091

**Apps**

*apps.tsv: Holds the applications users submitted*

In [None]:
# Load the tab-separated applications file and show its first five rows
apps = pd.read_csv("data/apps.tsv", delimiter="\t")
apps.head()

Unnamed: 0,UserID,WindowID,Split,ApplicationDate,JobID
0,47,1,Train,2012-04-04 15:56:23.537,169528
1,47,1,Train,2012-04-06 01:03:00.003,284009
2,47,1,Train,2012-04-05 02:40:27.753,2121
3,47,1,Train,2012-04-05 02:37:02.673,848187
4,47,1,Train,2012-04-05 22:44:06.653,733748


In [None]:
# Distribution of the number of applications submitted per user
# The column is named directly in to_frame() instead of renaming "UserID":
# since pandas 2.0, value_counts() names its result "count", so a rename keyed
# on "UserID" silently leaves the column unlabelled as intended.
apps["UserID"].value_counts().describe().to_frame(name="App Submissions")

Unnamed: 0,App Submissions
count,321235.0
mean,4.990462
std,11.418487
min,1.0
25%,1.0
50%,2.0
75%,5.0
max,2473.0


**Window Dates**

*window_dates.tsv: Holds the application window dates*

In [None]:
# Load the tab-separated window-dates file and show its first five rows
window_dates = pd.read_csv("data/window_dates.tsv", delimiter="\t")
window_dates.head()

Unnamed: 0,Window,Train Start,Train End / Test Start,Test End
0,1,2012-04-01 00:00:00,2012-04-10 00:00:00,2012-04-14 00:00:00
1,2,2012-04-14 00:00:00,2012-04-23 00:00:00,2012-04-27 00:00:00
2,3,2012-04-27 00:00:00,2012-05-06 00:00:00,2012-05-10 00:00:00
3,4,2012-05-10 00:00:00,2012-05-19 00:00:00,2012-05-23 00:00:00
4,5,2012-05-23 00:00:00,2012-06-01 00:00:00,2012-06-05 00:00:00
