In [7]:
import csv, copy
from collections import defaultdict
import numpy, re, math
from sklearn.linear_model import Ridge, LinearRegression
from textstat.textstat import textstat
from string import punctuation
from scipy import stats

In [30]:
# read data from csv file
data = []
with open("h1b_kaggle1.csv", encoding='utf8') as csvfile:
    csvreader = csv.reader(csvfile)
    for line in csvreader:
        data.append(line)

In [31]:
print(len(data), data[0], data[10])

3002459 ['', 'CASE_STATUS', 'EMPLOYER_NAME', 'SOC_NAME', 'JOB_TITLE', 'FULL_TIME_POSITION', 'PREVAILING_WAGE', 'YEAR', 'WORKSITE', 'lon', 'lat'] ['10', 'WITHDRAWN', 'LESSARD INTERNATIONAL LLC', 'CHIEF EXECUTIVES', 'PRESIDENT', 'Y', '154648', '2016', 'VIENNA, VIRGINIA', '-77.2652604', '38.9012225']


In [37]:
print(data[10])

['10', 'WITHDRAWN', 'LESSARD INTERNATIONAL LLC', 'CHIEF EXECUTIVES', 'PRESIDENT', 'Y', '154648', '2016', 'VIENNA, VIRGINIA', '-77.2652604', '38.9012225']


In [39]:
# getting statistics of data in terms of the features

case_status = defaultdict(int)
employer_name = defaultdict(int)
soc_name = defaultdict(int)
job_title = defaultdict(int)
full_time = defaultdict(int)
year = defaultdict(int)
worksite = defaultdict(int)

for d in data[1:]:
    if d[0] == '':
        continue
    case_status[d[1]] += 1
    employer_name[d[2]] += 1
    soc_name[d[3]] += 1
    #job_title[d[4]] += 1
    full_time[d[5]] += 1
    year[d[7]] += 1
    #worksite[d[8]] += 1

In [41]:
print(soc_name.values())

dict_values([76, 10, 370, 25, 96, 3, 12, 1, 1, 35, 11, 18, 74, 1, 6, 1, 86, 68, 68, 3, 1, 6, 784, 87, 513, 97, 1035, 1, 6, 2350, 1949, 1585, 223, 1, 27, 6, 7, 2, 124, 10, 4, 6, 164, 1047, 105, 3, 7, 7752, 2, 1, 39, 1576, 97, 1145, 26219, 1, 4, 1, 1, 11, 228, 3, 4, 20, 41, 70, 116, 3, 239, 246, 2, 1, 523, 4, 210, 111, 1, 930, 6, 1, 2, 10, 1, 8, 1, 1502, 2, 1, 2, 16, 2, 22, 1, 7, 3, 29, 81, 13, 1, 1509, 1, 2101, 1, 16, 695, 608, 197, 6, 10, 1289, 32, 62, 2, 914, 37, 30, 1, 4, 365, 71, 1, 45, 8, 11, 1, 143, 14, 479, 5, 2243, 14, 2463, 1, 4901, 307, 1, 1, 1, 1, 14, 4, 20, 4, 3, 1, 822, 1, 1, 423, 2, 342, 10, 1, 1, 67, 18, 34, 44, 421, 45, 1, 119, 21642, 3, 1, 1911, 3, 7032, 1, 2, 10, 1, 22, 47, 1, 11589, 20, 2, 9, 1, 5, 6, 12, 8, 2, 5, 6, 4050, 1, 1, 1, 7, 442, 122, 19, 1, 1, 5, 2, 126, 61, 1, 838, 1, 18, 3, 1, 60, 9, 2, 5, 1144, 1, 1, 394, 2, 24, 4, 469, 3, 1, 549, 1, 938, 1, 2, 1, 698, 388, 1, 17426, 1, 47, 3, 2, 1, 2970, 10, 5, 10, 1, 5380, 2, 1, 42, 13, 19, 35, 1, 199, 41, 223, 1, 14, 

In [12]:
categories_case_status = ['CERTIFIED', 'WITHDRAWN', 'CERTIFIED-WITHDRAWN', 'DENIED']

# remove outliers from dictionaries and maintain the new values in '_filt' notation

case_status_filt = copy.deepcopy(case_status)
#employer_name_filt = employer_name
soc_name_filt = soc_name
#job_title_filt = job_title
full_time_filt = full_time
year_filt = copy.deepcopy(year)
#worksite_filt = copy.deepcopy(worksite)

# only 4 classes to be retained
for i in case_status.copy().keys():
    print(i)
    if i not in categories_case_status:
        case_status_filt.pop(i)

year_filt.pop('NA')
#full_time_filt.pop('NA')

WITHDRAWN
INVALIDATED
CERTIFIED
CERTIFIED-WITHDRAWN
PENDING QUALITY AND COMPLIANCE REVIEW - UNASSIGNED
REJECTED
DENIED
NA


13

In [13]:
print(case_status_filt)
print(year_filt)
print(full_time_filt)

defaultdict(<class 'int'>, {'WITHDRAWN': 89799, 'CERTIFIED': 2615623, 'CERTIFIED-WITHDRAWN': 202659, 'DENIED': 94346})
defaultdict(<class 'int'>, {'2012': 415607, '2014': 519427, '2015': 618727, '2011': 358767, '2013': 442114, '2016': 647803})
defaultdict(<class 'int'>, {'Y': 2576111, 'NA': 15, 'N': 426332})


In [14]:
# initialising global variables

numSocNames = len(soc_name_filt.keys())
numYears = len(year_filt.keys())
numClasses = 4

# to be used in 1-hot encoding of features (this is a mapping of each key of a feature to )
class_id = dict(zip(case_status_filt, range(numClasses)))
soc_name_id = dict(zip(soc_name_filt, range(numSocNames)))
year_id = dict(zip(year_filt, range(numYears)))
full_time_id = dict(zip(full_time_filt, range(2)))

In [15]:
# removing outliers and generating filtered data - more analysis needs to be done with this part!

data_filt = []
for d in data[1:]:
    # remove if case status is not out of 4 classes
    if d[1] not in categories_case_status or d[3] not in soc_name.keys() or \
        d[5] not in full_time.keys() or d[7] not in year.keys() or d[9] == 'NA' or \
        d[10] == 'NA' or d[6] == 'NA':
        continue
    data_filt.append(d)

In [16]:
numTrain = 2500000
numTrain = 10000

In [17]:
def feature(d):
    feat = [0]*(numSocNames+numYears)
    #print(len(feat))
    offset = 0
    #print(offset+soc_name_id[d[3]])
    feat[offset+soc_name_id[d[3]]] = 1   # soc_name
    offset += numSocNames
    #print(offset+year_id[d[7]])
    feat[offset+year_id[d[7]]] = 1   # year
    offset += numYears
    feat.append(1 if d[5] == 'Y' else -1)   # full time
    feat.append(float(d[6]))   # wage
    feat.append(float(d[9]))   # latitude
    feat.append(float(d[10]))  # longitude
    feat.append(1) # bias
    return feat
#feature(data[10])

In [18]:
X_train = [feature(d) for d in data_filt[:numTrain]]
y_train = [class_id[d[1]] for d in data_filt[:numTrain]]

In [19]:
# simple logistic regression, various models need to be looked at
from sklearn import linear_model
clf = linear_model.LogisticRegression()

In [20]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [21]:
X_valid = [feature(d) for d in data_filt[-numTrain:]]
y_valid = [class_id[d[1]] for d in data_filt[-numTrain:]]

In [22]:
y_pred = clf.predict(X_valid)

In [23]:
numpy.mean((y_pred-y_valid)**2)

0.45839999999999997