In [53]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import itertools 
import os
import seaborn as sns
import operator
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import nltk.corpus # sample text for performing tokenization
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.preprocessing import OneHotEncoder
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/yunjaecho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
# Sean's parsing function

def parsing(dat,train_dat):
    
    ### SALARY PROCESSING 
    # see if character is in text
    def alpha_in_text(text):
        return(any(c.isalpha() for c in text))

    # see how many dashes are in text
    def number_of_dashes(text):
        return(sum([1 for i in text if '-' in i]))

    # extract smallest salary range value
    def salary_extract_first(text):

        if pd.isna(text) is True:
            return(-1)

        elif alpha_in_text(text) is True:
            return(-2)

        elif '-' in text:
            if number_of_dashes(text) == 1:
                if re.split('-',text)[0].isdigit() is True:
                    return(float(re.split('-',text)[0]))
                else:
                    return(-1)

            else:
                return(-1)
        else:
            return(-1)

    # largest salary range value
    def salary_extract_second(text):

        if pd.isna(text) is True:
            return(-1)

        elif alpha_in_text(text) is True:
            return(-2)

        elif '-' in text:
            if number_of_dashes(text) == 1:
                if re.split('-',text)[1].isdigit() is True:
                    return(float(re.split('-',text)[1]))
                else:
                    return(-1)

            else:
                return(-1)
        else:
            return(-1)

    # convert numeric salary to category
    def salary_category_first(number):
        percentile = [60.0, 14000.0, 20000.0, 30000.0, 35000.0, 44374.4, 55000.0, 70000.0, 90000.0]
        if number == -1:
            return(str(1))

        if number == -2:
            return(str(2))

        for i in range(len(percentile)):
            if i not in {0,8}:
                if (number > percentile[i-1]) & (number <= percentile[i]):
                    return(str(i+3))
                else:
                    continue

            if i == 0:
                if number < percentile[0]:
                    return(str(i+3))
            if i == 8:
                if number >= percentile[8]:
                    return(str(i+3))



    def salary_category_second(number):
        percentile = [120, 20000.0, 30000.0, 40000.0, 50000.0, 65000.0, 80000.0, 100000.0, 130000.0]
        if number == -1:
            return(str(1))

        if number == -2:
            return(str(2))

        for i in range(len(percentile)):
            if i not in {0,8}:
                if (number > percentile[i-1]) & (number <= percentile[i]):
                    return(str(i+3))
                else:
                    continue

            if i == 0:
                if number < percentile[0]:
                    return(str(i+3))
            if i == 8:
                if number >= percentile[8]:
                    return(str(i+3))
    
    
    ### ONE HOT ENCODING (training)
    employment_type_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['employment_type']].fillna('NaN'))
    required_experience_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['required_experience']].fillna('NaN'))
    required_education_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['required_education']].fillna('NaN'))
    industry_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['industry']].fillna('NaN'))
    function_onehot = OneHotEncoder(handle_unknown='ignore').fit(train_dat[['function.']].fillna('NaN'))
    category_1 = train_dat.salary_range.apply(salary_extract_first).apply(salary_category_first)
    category_2 = train_dat.salary_range.apply(salary_extract_second).apply(salary_category_second)
    salary_1_onehot = OneHotEncoder(handle_unknown='ignore').fit(pd.DataFrame(category_1))
    salary_2_onehot = OneHotEncoder(handle_unknown='ignore').fit(pd.DataFrame(category_2))
    
    ### OTHER PARSING
    nacols = dat.isna()[['title', 'location', 'department', 'salary_range','description', 'requirements', 'benefits',
                      'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
                      'required_experience', 'required_education', 'industry', 'function.']].astype('int')
    
    numeric_cols = dat[['telecommuting', 'has_company_logo', 'has_questions']]
    # func to count words in document
    document_word_count = lambda document: len(document.split(' '))
    
    # count words in column
    columns = ["company_profile","description","requirements","benefits"]
    df = copy.deepcopy(dat[columns])
    for column in columns:
            df[(str(column) + "_length")] = dat[column].apply(lambda x: len(x) if x == x else 0)
    
    
    # salary column one hot
    category_1 = dat.salary_range.apply(salary_extract_first).apply(salary_category_first)
    category_2 = dat.salary_range.apply(salary_extract_second).apply(salary_category_second)
    
    salary_1_transform = pd.DataFrame.sparse.from_spmatrix(salary_1_onehot.transform(pd.DataFrame(category_1)))
    salary_2_transform = pd.DataFrame.sparse.from_spmatrix(salary_2_onehot.transform(pd.DataFrame(category_2)))
    
    # transform to one hot
    employment_type_transformed =  pd.DataFrame.sparse.from_spmatrix(employment_type_onehot.transform(dat[['employment_type']].fillna('NaN')))
    required_experience_transformed =  pd.DataFrame.sparse.from_spmatrix(required_experience_onehot.transform(dat[['required_experience']].fillna('NaN')))
    required_education_transformed =  pd.DataFrame.sparse.from_spmatrix(required_education_onehot.transform(dat[['required_education']].fillna('NaN')))
    industry_transformed =  pd.DataFrame.sparse.from_spmatrix(industry_onehot.transform(dat[['industry']].fillna('NaN')))
    function_transformed =  pd.DataFrame.sparse.from_spmatrix(function_onehot.transform(dat[['function.']].fillna('NaN')))
    
    
    return(pd.concat([nacols,salary_1_transform, salary_2_transform,df.iloc[:,4:],
                      employment_type_transformed, required_experience_transformed, required_education_transformed, industry_transformed,function_transformed,numeric_cols],axis = 1))
    

In [28]:
dat_train = pd.read_csv("job_training_data.csv")
dat_train2 = pd.read_csv("practice_job_verification_data.csv")
X1 = parsing(dat_train,dat_train)
X2 = parsing(dat_train2,dat_train)
y1 = dat_train["fraudulent"]
y2 = dat_train2["fraudulent"]

NameError: name 'OneHotEncoder' is not defined

In [None]:
def length_counter(df,columns = ["company_profile","description","requirements","benefits"]):
    length = []
    for column in columns:
        df[(str(column) + "_length")] = df[column].apply(lambda x: len(x) if x == x else 0)
    return df

In [None]:
jobs = pd.read_csv("job_training_data.csv")
jobs = length_counter(jobs)
nf_jobs = jobs[jobs["fraudulent"] == 0]
f_jobs = jobs[jobs["fraudulent"] == 1]

# frequency selection

In [29]:
stop_words = [
    
    ".",
    'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 "shan't",
 'she',
 "she's",
 'should',
 "should've",
 'shouldn',
 "shouldn't",
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 've',
 'very',
 'was',
 'wasn',
 "wasn't",
 'we',
 'were',
 'weren',
 "weren't",
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'y',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves',"us"]


In [30]:
def strips(x):
    return (x.strip().strip("'").strip('"').strip(",").strip("(").strip(")").strip(".").strip(";").strip(":"))

def merge_and_tokenize(df,column = "company_profile",k = 0):
    merged_text = []
    text_dict = {}
    parsed_row = []
    for sentence in df[column]:
        if sentence == sentence:
            words = sentence.lower().split()
            words = [strips(w) for w in words if strips(w) not in stop_words]
            for word in words: # remove stop words
                if "url" in str(word):
                    words.remove(word)

            parsed_row.append(words)
            merged_text.extend(words)
        else:
            parsed_row.append(np.nan)
            
    fdist = FreqDist(merged_text)
    
    keys = list(fdist.keys())
    
    for key in keys:
        if fdist[key] <= k:
            del fdist[key]
    total_counts = sum(fdist.values())
    for key in fdist:
        fdist[key] = [fdist[key],fdist[key]/total_counts]
    return fdist,parsed_row

def fraud_frequency(fraud_dict,parsed_rows):
    freq_list = []
    
    for text in tqdm(parsed_rows):
        parsed_dict = {key:0 for key in fraud_dict.keys()}
        if text == text:
            for word in text:
                if word in parsed_dict:
                    parsed_dict[word] += 1/len(text)
        else:
            freq_list.append(parsed_dict)
            continue
        
        freq_list.append(parsed_dict)
        
    return pd.DataFrame(freq_list)

def fraud_freq_dictionary(fraud_dict,nfraud_dict, th_percentage = 0.2,tops = 5):
    differenced_farud = {}
    for key in fraud_dict.keys():
        if key in nfraud_dict.keys():
            if abs(fraud_dict[key][1] - nfraud_dict[key][1]) >= fraud_dict[key][1]*th_percentage:
                differenced_farud[key] = fraud_dict[key]
        if key not in nfraud_dict.keys():
            differenced_farud[key] = fraud_dict[key]
                   
    sorted_differenced_fraud = sorted(differenced_farud.items(), key=operator.itemgetter(1),reverse = True)
    #out = dict(itertools.islice(sorted_differenced_farud.items(), tops)) 
    out = dict(sorted_differenced_fraud[:tops])
    return out

In [None]:
x,y =merge_and_tokenize(jobs,k =5)

In [None]:
len(x)

In [31]:
def fraud_freq_generator(train,test,column,th_percentage,th_count,tops,train_parsing = True):
    if train_parsing:
        _,parsed_rows = merge_and_tokenize(train,column,th_count*5)
        fraud_dict,_ = merge_and_tokenize(train[train["fraudulent"] == 1],column ,th_count)
        nfraud_dict,_ = merge_and_tokenize(train[train["fraudulent"] == 0],column ,th_count*5)
        filtered_fraud_dict = fraud_freq_dictionary(fraud_dict,nfraud_dict,th_percentage,tops)
        fraud_freq = fraud_frequency(filtered_fraud_dict,parsed_rows)
    else:
        _,parsed_rows = merge_and_tokenize(test,column,th_count*5)
        fraud_dict,_ = merge_and_tokenize(train[train["fraudulent"] == 1],column ,th_count)
        nfraud_dict,_ = merge_and_tokenize(train[train["fraudulent"] == 0],column ,th_count*5)
        filtered_fraud_dict = fraud_freq_dictionary(fraud_dict,nfraud_dict,th_percentage,tops)
        fraud_freq = fraud_frequency(filtered_fraud_dict,parsed_rows)
    
    return fraud_freq

In [48]:
train_data = pd.read_csv("job_training_data.csv")
test_data = pd.read_csv("job_verification_data.csv")

In [33]:
train_data = length_counter(train_data)
test_data = length_counter(test_data)

In [34]:
train_len_df = train_data.loc[:,["company_profile_length", "description_length","requirements_length","benefits_length"]]
test_len_df = test_data.loc[:,["company_profile_length", "description_length","requirements_length","benefits_length"]]

In [49]:
company_freq = fraud_freq_generator(train_data,test_data,"company_profile",th_percentage = 0.30, th_count = 15, tops = 5)
description_freq = fraud_freq_generator(train_data,test_data,"description",th_percentage = 0.30, th_count = 15, tops = 10)
requirements_freq = fraud_freq_generator(train_data,test_data,"requirements",th_percentage = 0.30,th_count = 15, tops = 10)
benefits_freq = fraud_freq_generator(train_data,test_data,"benefits",th_percentage = 0.30, th_count = 15, tops = 10)

big_matrix = pd.concat([company_freq,description_freq,requirements_freq,benefits_freq,train_len_df],axis=1)
big_matrix["fraudulent"] = train_data["fraudulent"]

100%|███████████████████████████████████| 5362/5362 [00:00<00:00, 267777.85it/s]
100%|███████████████████████████████████| 5362/5362 [00:00<00:00, 144548.44it/s]
100%|███████████████████████████████████| 5362/5362 [00:00<00:00, 254883.02it/s]
100%|███████████████████████████████████| 5362/5362 [00:00<00:00, 401232.04it/s]


In [None]:
feature_matrix = big_matrix


X = feature_matrix.drop("fraudulent", axis = 1)
y = feature_matrix.fraudulent
y = y.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state =999999999)
rfm = RandomForestClassifier()
rfm = RandomForestClassifier(n_estimators = 200, max_features=None ,oob_score=True,n_jobs=6)
rfm.fit(X_train,y_train)
#y_pred_rfm=rfm.predict(X_test)
y_pred_rfm = (rfm.predict_proba(X_test)[:, 1] > 0.15).astype(int)
print(confusion_matrix(y_test,y_pred_rfm))
print(accuracy_score(y_test,y_pred_rfm))
print(recall_score(y_test,y_pred_rfm))

In [51]:
company_freq2 = fraud_freq_generator(train_data,test_data,"company_profile",th_percentage = 0.30, th_count = 15, tops = 5,train_parsing =False)
description_freq2 = fraud_freq_generator(train_data,test_data,"description",th_percentage = 0.30, th_count = 15, tops = 5,train_parsing =False)
requirements_freq2 = fraud_freq_generator(train_data,test_data,"requirements",th_percentage = 0.30,th_count = 15, tops = 5,train_parsing =False)
benefits_freq2 = fraud_freq_generator(train_data,test_data,"benefits",th_percentage = 0.30, th_count = 15, tops = 5,train_parsing = False)

big_matrix2 = pd.concat([company_freq2,description_freq2,requirements_freq2,benefits_freq2,test_len_df],axis=1)
big_matrix2["fraudulent"] = test_data["fraudulent"]

100%|███████████████████████████████████| 1000/1000 [00:00<00:00, 328732.97it/s]
100%|███████████████████████████████████| 1000/1000 [00:00<00:00, 183453.79it/s]
100%|███████████████████████████████████| 1000/1000 [00:00<00:00, 320616.42it/s]
100%|███████████████████████████████████| 1000/1000 [00:00<00:00, 599786.07it/s]


In [52]:
big_matrix2

Unnamed: 0,business,candidates,recruiting,bonus,services,work,-,&amp,company,equipment,...,benefits,paid,company.1,training,-.1,company_profile_length,description_length,requirements_length,benefits_length,fraudulent
0,0.000000,0.0,0.0,0.0,0.000000,0.018519,0.000000,0.000000,0.018519,0.000000,...,0.000000,0.0,0.000000,0.00000,0.0,829,630,70,17,0
1,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.04878,0.0,651,216,295,430,0
2,0.000000,0.0,0.0,0.0,0.000000,0.005000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.00000,0.0,0,2474,0,113,0
3,0.000000,0.0,0.0,0.0,0.000000,0.007018,0.003509,0.007018,0.000000,0.000000,...,0.000000,0.0,0.000000,0.00000,0.0,812,3178,0,0,0
4,0.041667,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.022727,0.0,0.022727,0.00000,0.0,282,845,443,506,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.031250,0.0,0.0,0.0,0.015625,0.000000,0.000000,0.008333,0.000000,0.000000,...,0.000000,0.0,0.000000,0.00000,0.0,684,1380,705,0,0
996,0.000000,0.0,0.0,0.0,0.009901,0.003205,0.003205,0.019231,0.009615,0.000000,...,0.000000,0.0,0.000000,0.00000,0.0,1096,3201,329,0,0
997,0.000000,0.0,0.0,0.0,0.000000,0.013158,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.00000,0.0,704,1657,610,627,0
998,0.031250,0.0,0.0,0.0,0.015625,0.000000,0.000000,0.000000,0.000000,0.007407,...,0.000000,0.0,0.000000,0.00000,0.0,684,1543,711,0,0


In [43]:
train_matrix = big_matrix
test_matrix = big_matrix2

X_train = train_matrix.drop("fraudulent", axis = 1)
X_train=(X_train-X_train.mean())/X_train.std()
y_train = train_matrix.fraudulent.astype(int)

X_test = test_matrix.drop("fraudulent", axis = 1)
X_test=(X_test-X_test.mean())/X_test.std()
y_test = test_matrix.fraudulent.astype(int)

rfm = RandomForestClassifier()
rfm = RandomForestClassifier(n_estimators = 200, max_features=None ,oob_score=True,n_jobs=6)
rfm.fit(X_train,y_train)
#y_pred_rfm=rfm.predict(X_test)
y_pred_rfm = (rfm.predict_proba(X_test)[:, 1] > 0.3).astype(int)
print(confusion_matrix(y_test,y_pred_rfm))
print(accuracy_score(y_test,y_pred_rfm))
print(recall_score(y_test,y_pred_rfm))

ValueError: X has 24 features, but DecisionTreeClassifier is expecting 39 features as input.

In [45]:
big_matrix

Unnamed: 0,business,candidates,recruiting,bonus,services,work,-,&amp,company,equipment,...,environment,competitive,time,working,opportunity,company_profile_length,description_length,requirements_length,benefits_length,fraudulent
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.012048,0.0,...,0.0,0.125,0.000000,0.000000,0.000000,2559,871,1383,100,0
1,0.0,0.000000,0.000000,0.000000,0.000000,0.028571,0.000000,0.0,0.000000,0.0,...,0.0,0.000,0.000000,0.000000,0.000000,829,391,88,16,0
2,0.0,0.000000,0.000000,0.000000,0.000000,0.031250,0.000000,0.0,0.000000,0.0,...,0.0,0.000,0.000000,0.011050,0.005525,1251,724,805,2047,0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.038462,0.000000,0.0,0.000000,0.0,...,0.0,0.000,0.000000,0.000000,0.000000,829,838,0,0,0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.000,0.000000,0.000000,0.000000,0,92,35,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5357,0.0,0.000000,0.000000,0.000000,0.000000,0.019231,0.192308,0.0,0.000000,0.0,...,0.0,0.000,0.000000,0.000000,0.000000,0,421,651,10,1
5358,0.0,0.000000,0.000000,0.000000,0.000000,0.014286,0.000000,0.0,0.014286,0.0,...,0.0,0.000,0.000000,0.000000,0.000000,0,874,0,0,1
5359,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.047619,0.0,0.000000,0.0,...,0.0,0.000,0.000000,0.000000,0.000000,0,336,35,0,1
5360,0.0,0.029126,0.029126,0.029126,0.009709,0.024691,0.000000,0.0,0.012346,0.0,...,0.0,0.000,0.016304,0.005435,0.005435,1099,930,418,2198,1
