In [29]:
import mlxtend
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
df = pd.read_csv("filtered_h1b.csv")

In [3]:
rule_mining_1 = ['CASE_STATUS','SOC_TITLE','FULL_TIME_POSITION','H_1B_DEPENDENT','PUBLIC_DISCLOSURE']

In [4]:
df1 = df[rule_mining_1]

In [5]:
df1

Unnamed: 0,CASE_STATUS,SOC_TITLE,FULL_TIME_POSITION,H_1B_DEPENDENT,PUBLIC_DISCLOSURE
0,Certified - Withdrawn,Operations Research Analysts,Y,No,Disclose Business
1,Certified - Withdrawn,Management Analysts,Y,No,Disclose Business
2,Certified - Withdrawn,Computer and Information Systems Managers,Y,,Disclose Business
3,Certified - Withdrawn,Computer Systems Analysts,Y,Yes,Disclose Business
4,Withdrawn,General and Operations Managers,Y,,Disclose Business
...,...,...,...,...,...
80044,Certified - Withdrawn,Electrical Engineers,Y,No,Disclose Business
80045,Certified - Withdrawn,Electrical Engineers,Y,No,Disclose Business
80046,Withdrawn,"Business Operations Specialists, All Other",Y,,Disclose Business and Employment
80047,Certified - Withdrawn,"Software Developers, Applications",Y,No,Disclose Business


In [9]:
cert = df1[(df1['CASE_STATUS']=='Certified - Withdrawn') | (df1['CASE_STATUS']=='Certified') ]

In [10]:
cert

Unnamed: 0,CASE_STATUS,SOC_TITLE,FULL_TIME_POSITION,H_1B_DEPENDENT,PUBLIC_DISCLOSURE
0,Certified - Withdrawn,Operations Research Analysts,Y,No,Disclose Business
1,Certified - Withdrawn,Management Analysts,Y,No,Disclose Business
2,Certified - Withdrawn,Computer and Information Systems Managers,Y,,Disclose Business
3,Certified - Withdrawn,Computer Systems Analysts,Y,Yes,Disclose Business
6,Certified - Withdrawn,Computer Programmers,Y,Yes,Disclose Business
...,...,...,...,...,...
80043,Certified,Computer Systems Analysts,Y,Yes,Disclose Business
80044,Certified - Withdrawn,Electrical Engineers,Y,No,Disclose Business
80045,Certified - Withdrawn,Electrical Engineers,Y,No,Disclose Business
80047,Certified - Withdrawn,"Software Developers, Applications",Y,No,Disclose Business


In [19]:
transactions = df1.groupby(['SOC_TITLE','FULL_TIME_POSITION','H_1B_DEPENDENT',
       'PUBLIC_DISCLOSURE']).size().reset_index(name='count')

In [20]:
transactions

Unnamed: 0,SOC_TITLE,FULL_TIME_POSITION,H_1B_DEPENDENT,PUBLIC_DISCLOSURE,count
0,Accountants and Auditors,N,No,Disclose Business,62
1,Accountants and Auditors,N,No,Disclose Employment,6
2,Accountants and Auditors,N,Yes,Disclose Employment,1
3,Accountants and Auditors,Y,No,Disclose Business,1734
4,Accountants and Auditors,Y,No,Disclose Business and Employment,56
...,...,...,...,...,...
661,Web and Digital Interface Designers,Y,No,Disclose Business,138
662,Web and Digital Interface Designers,Y,No,Disclose Business and Employment,6
663,Web and Digital Interface Designers,Y,No,Disclose Employment,14
664,Web and Digital Interface Designers,Y,Yes,Disclose Business,121


In [23]:
transaction_list = []

for index, row in transactions.iterrows():
    transaction_list.append([row['SOC_TITLE'], row['FULL_TIME_POSITION'],row['H_1B_DEPENDENT'],row['PUBLIC_DISCLOSURE']])

In [25]:
te = TransactionEncoder()

In [26]:
te_ary = te.fit(transaction_list).transform(transaction_list)

In [27]:
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

In [46]:
# Get frequent itemsets
frequent_itemsets1 = apriori(df_encoded, min_support=0.5, use_colnames=True)

In [47]:
frequent_itemsets1

Unnamed: 0,support,itemsets
0,0.504505,(Disclose Business)
1,0.732733,(No)
2,0.684685,(Y)


In [67]:
rules1 = association_rules(frequent_itemsets1, metric="confidence", min_threshold=0.1)

In [65]:
rules1

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


In [48]:
# Get frequent itemsets
frequent_itemsets2 = apriori(df_encoded, min_support=0.3, use_colnames=True)

In [49]:
frequent_itemsets2

Unnamed: 0,support,itemsets
0,0.504505,(Disclose Business)
1,0.315315,(N)
2,0.732733,(No)
3,0.684685,(Y)
4,0.315315,"(No, Disclose Business)"
5,0.48048,"(No, Y)"


In [68]:
rules2 = association_rules(frequent_itemsets2, metric="confidence", min_threshold=0.1)

In [69]:
rules2

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(No),(Disclose Business),0.732733,0.504505,0.315315,0.430328,0.852971,-0.054352,0.869791,-0.392077
1,(Disclose Business),(No),0.504505,0.732733,0.315315,0.625,0.852971,-0.054352,0.712713,-0.258094
2,(No),(Y),0.732733,0.684685,0.48048,0.655738,0.957722,-0.02121,0.915916,-0.141755
3,(Y),(No),0.684685,0.732733,0.48048,0.701754,0.957722,-0.02121,0.896131,-0.122807


In [50]:
# Get frequent itemsets
frequent_itemsets3 = apriori(df_encoded, min_support=0.1, use_colnames=True)

In [51]:
frequent_itemsets3

Unnamed: 0,support,itemsets
0,0.504505,(Disclose Business)
1,0.249249,(Disclose Business and Employment)
2,0.246246,(Disclose Employment)
3,0.315315,(N)
4,0.732733,(No)
5,0.684685,(Y)
6,0.267267,(Yes)
7,0.207207,"(Disclose Business, N)"
8,0.315315,"(No, Disclose Business)"
9,0.297297,"(Disclose Business, Y)"


In [70]:
rules3 = association_rules(frequent_itemsets3, metric="confidence", min_threshold=0.1)

In [71]:
rules3

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Disclose Business),(N),0.504505,0.315315,0.207207,0.410714,1.302551,0.048129,1.161889,0.468775
1,(N),(Disclose Business),0.315315,0.504505,0.207207,0.657143,1.302551,0.048129,1.445195,0.339245
2,(No),(Disclose Business),0.732733,0.504505,0.315315,0.430328,0.852971,-0.054352,0.869791,-0.392077
3,(Disclose Business),(No),0.504505,0.732733,0.315315,0.625,0.852971,-0.054352,0.712713,-0.258094
4,(Disclose Business),(Y),0.504505,0.684685,0.297297,0.589286,0.860667,-0.048129,0.767724,-0.246262
5,(Y),(Disclose Business),0.684685,0.504505,0.297297,0.434211,0.860667,-0.048129,0.875759,-0.339245
6,(Disclose Business),(Yes),0.504505,0.267267,0.189189,0.375,1.40309,0.054352,1.172372,0.579798
7,(Yes),(Disclose Business),0.267267,0.504505,0.189189,0.707865,1.40309,0.054352,1.696119,0.392077
8,(No),(Disclose Business and Employment),0.732733,0.249249,0.204204,0.278689,1.118112,0.021571,1.040814,0.395241
9,(Disclose Business and Employment),(No),0.249249,0.732733,0.204204,0.819277,1.118112,0.021571,1.478879,0.140706


In [56]:
# Get frequent itemsets
frequent_itemsets4 = apriori(df_encoded, min_support=0.05, use_colnames=True)

In [57]:
frequent_itemsets4

Unnamed: 0,support,itemsets
0,0.504505,(Disclose Business)
1,0.249249,(Disclose Business and Employment)
2,0.246246,(Disclose Employment)
3,0.315315,(N)
4,0.732733,(No)
5,0.684685,(Y)
6,0.267267,(Yes)
7,0.207207,"(Disclose Business, N)"
8,0.315315,"(No, Disclose Business)"
9,0.297297,"(Disclose Business, Y)"


In [72]:
rules4 = association_rules(frequent_itemsets4, metric="confidence", min_threshold=0.1)

In [73]:
rules4

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Disclose Business),(N),0.504505,0.315315,0.207207,0.410714,1.302551,0.048129,1.161889,0.468775
1,(N),(Disclose Business),0.315315,0.504505,0.207207,0.657143,1.302551,0.048129,1.445195,0.339245
2,(No),(Disclose Business),0.732733,0.504505,0.315315,0.430328,0.852971,-0.054352,0.869791,-0.392077
3,(Disclose Business),(No),0.504505,0.732733,0.315315,0.625000,0.852971,-0.054352,0.712713,-0.258094
4,(Disclose Business),(Y),0.504505,0.684685,0.297297,0.589286,0.860667,-0.048129,0.767724,-0.246262
...,...,...,...,...,...,...,...,...,...,...
64,"(No, Y)",(Disclose Employment),0.480480,0.246246,0.159159,0.331250,1.345198,0.040843,1.127108,0.493947
65,"(Y, Disclose Employment)",(No),0.190691,0.732733,0.159159,0.834646,1.139086,0.019434,1.616331,0.150873
66,(No),"(Y, Disclose Employment)",0.732733,0.190691,0.159159,0.217213,1.139086,0.019434,1.033882,0.456858
67,(Disclose Employment),"(No, Y)",0.246246,0.480480,0.159159,0.646341,1.345198,0.040843,1.468986,0.340450


In [58]:
frequent_itemsets5 = apriori(df_encoded, min_support=0.01, use_colnames=True)

In [59]:
frequent_itemsets5

Unnamed: 0,support,itemsets
0,0.012012,(Accountants and Auditors)
1,0.015015,(Business Intelligence Analysts)
2,0.010511,(Chief Executives)
3,0.010511,(Computer Network Architects)
4,0.012012,"(Computer Occupations, All Other)"
...,...,...
64,0.154655,"(No, Y, Disclose Business and Employment)"
65,0.042042,"(Y, Yes, Disclose Business and Employment)"
66,0.054054,"(No, N, Disclose Employment)"
67,0.159159,"(No, Disclose Employment, Y)"


In [74]:
rules5 = association_rules(frequent_itemsets5, metric="confidence", min_threshold=0.1)

In [75]:
rules5

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Disclose Business),(N),0.504505,0.315315,0.207207,0.410714,1.302551,0.048129,1.161889,0.468775
1,(N),(Disclose Business),0.315315,0.504505,0.207207,0.657143,1.302551,0.048129,1.445195,0.339245
2,(No),(Disclose Business),0.732733,0.504505,0.315315,0.430328,0.852971,-0.054352,0.869791,-0.392077
3,(Disclose Business),(No),0.504505,0.732733,0.315315,0.625000,0.852971,-0.054352,0.712713,-0.258094
4,(Disclose Business),(Y),0.504505,0.684685,0.297297,0.589286,0.860667,-0.048129,0.767724,-0.246262
...,...,...,...,...,...,...,...,...,...,...
83,"(Disclose Employment, Yes)",(Y),0.033033,0.684685,0.031532,0.954545,1.394139,0.008914,6.936937,0.292369
84,"(Y, Yes)",(Disclose Employment),0.204204,0.246246,0.031532,0.154412,0.627062,-0.018753,0.891396,-0.427705
85,"(Y, Disclose Employment)",(Yes),0.190691,0.267267,0.031532,0.165354,0.618685,-0.019434,0.877897,-0.432319
86,(Yes),"(Y, Disclose Employment)",0.267267,0.190691,0.031532,0.117978,0.618685,-0.019434,0.917561,-0.456858
