In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn import preprocessing
import scipy
from sklearn.model_selection import GridSearchCV
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
import time
import pickle
import copy

from sklearn.model_selection import KFold

In [3]:
def recall(confusion_mat):
    """Return the recall of class 1 from a 2x2 confusion matrix.

    ``confusion_mat`` must support ``[row, col]`` indexing (e.g. a NumPy
    array). Row 1 is read as the actual-positive row, matching how this
    notebook builds the matrix with ``confusion_matrix(y_true, y_pred)``.

    Returns true positives divided by (false negatives + true positives).
    """
    false_negatives, true_positives = confusion_mat[1, 0], confusion_mat[1, 1]
    actual_positives = false_negatives + true_positives
    return true_positives / actual_positives

def accuracy(confusion_mat): # overall accuracy
    """Return the overall accuracy from a 2x2 confusion matrix.

    ``confusion_mat`` must support ``[row, col]`` indexing (e.g. a NumPy
    array). The result is the sum of the two diagonal cells divided by the
    sum of all four cells.
    """
    on_diagonal = confusion_mat[0, 0] + confusion_mat[1, 1]
    all_cells = (
        confusion_mat[0, 0]
        + confusion_mat[0, 1]
        + confusion_mat[1, 0]
        + confusion_mat[1, 1]
    )
    return on_diagonal / all_cells

In [4]:

def find_threshold(X, y, accuracy, K=10, random_state=None):
    """Find, per cross-validation fold, the threshold with the best recall.

    For each of the ``K`` folds a random forest is fit on the training part
    and every candidate threshold is tried on the held-out part: a row is
    classified as fraudulent (1) when its predicted probability for class 1
    is strictly greater than the threshold. Among the thresholds whose
    overall accuracy on the held-out part is at least ``accuracy``, the one
    with the highest recall is kept. Candidates are tried in ascending order
    and only a strictly better recall replaces the current best, so ties keep
    the smallest threshold.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature rows. Rows are selected by position for each fold.
    y : pandas.Series or array-like
        Response vector with labels 0 / 1, aligned row-for-row with ``X``.
    accuracy : float
        Minimum overall accuracy a threshold must reach to be eligible.
    K : int, default 10
        Number of folds.
    random_state : int or None, default None
        Forwarded to the random forest so a run can be made reproducible.

    Returns
    -------
    thresholds : list
        Best threshold of each fold (0.5 when no candidate was eligible).
    recalls : list
        Best recall of each fold (``-inf`` when no candidate was eligible,
        including folds whose held-out part contains no positive rows).
    """
    # thresholds to be tried
    candidate_thresholds = np.array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
                                     0.1, 0.11, 0.13, 0.15, 0.2, 0.3, 0.4, 0.5])

    def take_rows(obj, positions):
        # pandas objects must be sliced positionally with .iloc; plain
        # obj[positions] on a DataFrame would look up column labels instead.
        return obj.iloc[positions] if hasattr(obj, "iloc") else obj[positions]

    thresholds = []
    recalls = []

    kf = KFold(n_splits=K)
    for train_index, test_index in kf.split(X):
        X_train = take_rows(X, train_index)
        X_test = take_rows(X, test_index)
        y_train = take_rows(y, train_index)
        y_test = take_rows(y, test_index)

        rf = RandomForestClassifier(n_estimators=200, max_features=None, n_jobs=6,
                                    random_state=random_state)
        rf.fit(X_train, y_train)

        # The probabilities do not depend on the threshold: compute them once per fold.
        fraud_proba = rf.predict_proba(X_test)[:, 1]

        best_recall = -np.inf
        best_threshold = .5
        for th in candidate_thresholds:
            # classify as fraudulent if prob of fraud > th
            pred = (fraud_proba > th).astype(int)
            # labels=[0, 1] keeps the matrix 2x2 even if a class is absent from this fold
            confu = confusion_matrix(y_test, pred, labels=[0, 1])

            actual_positives = confu[1, 0] + confu[1, 1]
            if actual_positives == 0:
                # recall is undefined without positive rows; nothing to select
                break

            # The `accuracy` argument shadows the module-level accuracy() helper
            # inside this function, so both metrics are computed inline.
            fold_accuracy = (confu[0, 0] + confu[1, 1]) / confu.sum()
            fold_recall = confu[1, 1] / actual_positives

            if fold_accuracy >= accuracy and fold_recall > best_recall:
                best_recall = fold_recall
                best_threshold = th

        thresholds.append(best_threshold)
        recalls.append(best_recall)

    return thresholds, recalls

        
        
        
        

In [5]:
# Sean's parsing function

def parsing(dat,train_dat):
    """Turn raw job-posting rows into a numeric feature frame.

    All one-hot encoders are fit on ``train_dat`` and then applied to ``dat``.
    The encoders use ``handle_unknown='ignore'``, so a category that was not
    seen while fitting encodes as all zeros.

    Parameters
    ----------
    dat : pandas.DataFrame
        Rows to transform. Must contain the columns listed in ``na_cols``,
        ``text_cols``, ``categorical_cols`` and ``binary_cols`` below.
    train_dat : pandas.DataFrame
        Rows used to fit the encoders. Must contain ``salary_range`` and the
        columns in ``categorical_cols``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``dat`` (same index). Column blocks, in order:
        missing-value indicators, one-hot lower salary bound, one-hot upper
        salary bound, character lengths of the text columns, one-hot encoded
        categorical columns, and the raw binary columns. The one-hot blocks
        are labeled positionally (0, 1, ...), so labels repeat across blocks,
        and the three binary columns appear twice (indicator and raw value).
    """
    na_cols = ['title', 'location', 'department', 'salary_range','description', 'requirements', 'benefits',
               'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
               'required_experience', 'required_education', 'industry', 'function.']
    text_cols = ["company_profile","description","requirements","benefits"]
    categorical_cols = ['employment_type', 'required_experience', 'required_education',
                        'industry', 'function.']
    binary_cols = ['telecommuting', 'has_company_logo', 'has_questions']

    # Cut points for the lower / upper salary bound (called percentiles by the original code).
    first_percentile = [60.0, 14000.0, 20000.0, 30000.0, 35000.0, 44374.4, 55000.0, 70000.0, 90000.0]
    second_percentile = [120, 20000.0, 30000.0, 40000.0, 50000.0, 65000.0, 80000.0, 100000.0, 130000.0]

    ### SALARY PROCESSING
    def salary_extract(text, position):
        """Return one bound of a 'low-high' salary string as a float.

        ``position`` 0 selects the lower bound, 1 the upper bound.
        Returns -2 when the text contains a letter, and -1 when the value is
        missing or is not exactly two parts separated by a single dash with
        the requested part made of digits only.
        """
        if pd.isna(text):
            return -1
        if any(c.isalpha() for c in text):
            return -2
        parts = text.split('-')
        if len(parts) == 2 and parts[position].isdigit():
            return float(parts[position])
        return -1

    def salary_category(number, percentile):
        """Map an extracted salary value to a category label string.

        '1' and '2' are reserved for the sentinel codes -1 and -2. Real values
        get '3' (at or below the first cut point) up to '11' (above the
        second-to-last cut point); every value falls in exactly one bucket.
        Previously a value equal to the first cut point, or strictly between
        the last two cut points, matched no bucket and produced None.
        """
        if number == -1:
            return str(1)
        if number == -2:
            return str(2)
        if number <= percentile[0]:
            return str(3)
        # buckets (percentile[i-1], percentile[i]] for the interior cut points
        for i in range(1, len(percentile) - 1):
            if number <= percentile[i]:
                return str(i + 3)
        # everything above the second-to-last cut point shares the top label
        return str(len(percentile) + 2)

    def salary_categories(frame):
        """Return (lower-bound categories, upper-bound categories) for ``frame.salary_range``."""
        lower = (frame.salary_range
                 .apply(lambda text: salary_extract(text, 0))
                 .apply(lambda number: salary_category(number, first_percentile)))
        upper = (frame.salary_range
                 .apply(lambda text: salary_extract(text, 1))
                 .apply(lambda number: salary_category(number, second_percentile)))
        return lower, upper

    def one_hot_frame(encoder, frame):
        # Carry dat's index so the final column-wise concat lines rows up by
        # position even when dat does not have a default 0..n-1 index.
        return pd.DataFrame.sparse.from_spmatrix(encoder.transform(frame), index=dat.index)

    ### ONE HOT ENCODING (training)
    categorical_encoders = {
        col: OneHotEncoder(handle_unknown='ignore').fit(train_dat[[col]].fillna('NaN'))
        for col in categorical_cols
    }
    train_category_1, train_category_2 = salary_categories(train_dat)
    salary_1_onehot = OneHotEncoder(handle_unknown='ignore').fit(pd.DataFrame(train_category_1))
    salary_2_onehot = OneHotEncoder(handle_unknown='ignore').fit(pd.DataFrame(train_category_2))

    ### OTHER PARSING
    # 1 where the original value is missing, 0 otherwise
    nacols = dat[na_cols].isna().astype('int')

    numeric_cols = dat[binary_cols]

    # character length of each free-text column (0 when the text is missing)
    length_cols = pd.DataFrame(
        {str(column) + "_length": dat[column].apply(lambda x: 0 if pd.isna(x) else len(x))
         for column in text_cols},
        index=dat.index,
    )

    # salary column one hot
    category_1, category_2 = salary_categories(dat)
    salary_1_transform = one_hot_frame(salary_1_onehot, pd.DataFrame(category_1))
    salary_2_transform = one_hot_frame(salary_2_onehot, pd.DataFrame(category_2))

    # transform to one hot (same column order as categorical_cols)
    categorical_transformed = [
        one_hot_frame(categorical_encoders[col], dat[[col]].fillna('NaN'))
        for col in categorical_cols
    ]

    return(pd.concat([nacols, salary_1_transform, salary_2_transform, length_cols]
                     + categorical_transformed
                     + [numeric_cols], axis = 1))
    

In [6]:
# Load the training data and build the feature frame. The same frame is passed
# twice, so the encoders inside parsing() are fit on the rows being transformed.
dat_train = pd.read_csv("job_training_data.csv")
data = parsing(dat_train,dat_train)

In [7]:
# X is the parsed feature frame (same object as `data`, not a copy);
# y is the 'fraudulent' column of the raw training data.
X = data
y = dat_train['fraudulent']
X.head()

Unnamed: 0,title,location,department,salary_range,description,requirements,benefits,telecommuting,has_company_logo,has_questions,...,31,32,33,34,35,36,37,telecommuting.1,has_company_logo.1,has_questions.1
0,0,0,1,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
1,0,0,1,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,1
3,0,0,1,1,0,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0
4,0,0,1,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1


In [13]:
# Scratch check: print the training row positions KFold yields for each of 5 splits.
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X):
    print(train_index)

[1073 1074 1075 ... 5359 5360 5361]
[   0    1    2 ... 5359 5360 5361]
[   0    1    2 ... 5359 5360 5361]
[   0    1    2 ... 5359 5360 5361]
[   0    1    2 ... 4287 4288 4289]


In [9]:
# Scratch: log-spaced grid 0.5 * 10**e for exponents e from -3 in steps of 0.2
# (stop=0.2 is exclusive). Note that `a` is reassigned by later scratch cells.
a = .5 * 10**np.arange(start=-3, stop=0.2, step=.2)
a

array([5.00000000e-04, 7.92446596e-04, 1.25594322e-03, 1.99053585e-03,
       3.15478672e-03, 5.00000000e-03, 7.92446596e-03, 1.25594322e-02,
       1.99053585e-02, 3.15478672e-02, 5.00000000e-02, 7.92446596e-02,
       1.25594322e-01, 1.99053585e-01, 3.15478672e-01, 5.00000000e-01])

In [23]:
# Scratch: hand-written descending list, printed reversed (ascending).
# NOTE(review): the leading 5 looks like a typo for .5 -- the threshold list
# hard-coded in find_threshold ends in 0.5; confirm before reusing this list.
a = [5,.4,.3,.2,.15,.13,.11,.10,.09,.08,.07,.06,.05,.04,.03,.02,.01]
print(a[::-1])

[0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.13, 0.15, 0.2, 0.3, 0.4, 5]


In [None]:
 # NOTE(review): unexecuted draft (the cell is marked In [None]); it cannot run as a cell:
 # the indentation is inconsistent and `return` is used outside any function.
 # In the cells shown here, rf / X_test / y_test / threshold exist only as locals of find_threshold.
 # The draft walks the thresholds from the last entry backwards and stops at the
 # first one whose accuracy falls below the required level.
 # `accuracy` is used both as a callable and as a number in the comparison below.
 # The trailing comma on the return makes it a 1-tuple -- presumably unfinished.
 current_threshold = threshold[-1]
        i = len(threshold) - 1
        while True:
             # classify as fraudulent if prob of fraud > threshold
            pred = (rf.predict_proba(X_test)[:, 1] > threshold[i]).astype(int)
            confu = sklearn.metrics.confusion_matrix(y_test, pred)          
            
            if accuracy(confu) < accuracy:
                return current_threshold, 
            
            current_threshold = threshold[i]            
            i = i - 1

In [28]:
# Scratch: empty results table with rows labeled 1..10 and columns
# 'threshold' and 'recall' (every cell is NaN until assigned).
df = pd.DataFrame(index=np.arange(1,11), columns=['threshold','recall'])
df

Unnamed: 0,threshold,recall
1,,
2,,
3,,
4,,
5,,
6,,
7,,
8,,
9,,
10,,


In [29]:
# Scratch: confirm that list.append mutates the list in place (reuses the name `a`).
a = []
a.append(1)
a

[1]