In [1]:
import warnings
warnings.simplefilter('ignore')
import gc

import numpy as np
import pandas as pd
# Show every column/row when a frame is displayed.
# Use the fully-qualified option keys: the abbreviated 'max_columns' /
# 'max_rows' patterns also match the 'styler.render.*' options on newer pandas
# releases, which makes set_option raise "Pattern matched multiple keys".
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from tqdm import tqdm
tqdm.pandas()

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import auc, accuracy_score

import lightgbm as lgb

In [2]:
# Read the two raw splits from disk; paths are relative to the notebook's
# working directory.
train_path, test_path = 'raw_data/train.csv', 'raw_data/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
# Normalise the case of every string-valued column in both splits so that
# later encoding / vectorising treats "Engineer" and "engineer" alike.
text_cols = ['benefits', 'company_profile', 'department', 'description',
             'employment_type', 'function', 'industry', 'location', 'required_education',
             'required_experience', 'requirements', 'title']
for frame in (train, test):
    for col in text_cols:
        frame[col] = frame[col].str.lower()

In [4]:
def process(x):
    """Return the number of whitespace-separated tokens in the string ``x``.

    The literal string ``'nan'`` (what a missing value becomes after
    ``astype('str')``) is treated as empty and yields 0.
    """
    return 0 if x == 'nan' else len(x.split())


# Add a "<column>_wordsLen" token-count feature for each free-text column.
# Values are cast to str first, so missing entries arrive at `process` as 'nan'.
for col in ['benefits', 'title', 'company_profile', 'description', 'requirements']:
    words_len_col = f'{col}_wordsLen'
    train[words_len_col] = train[col].astype('str').apply(process)
    test[words_len_col] = test[col].astype('str').apply(process)

In [5]:
def process1(x):
    """Parse the lower bound of a ``'<start>-<end>'`` salary range string.

    Parameters
    ----------
    x : str
        Salary range text; the literal ``'nan'`` marks a missing value.

    Returns
    -------
    int
        -999 if ``x`` is ``'nan'``; the integer before the first ``'-'`` if
        that piece parses as an integer; -998 if it does not parse.
    """
    if x == 'nan':
        return -999
    try:
        return int(x.split('-')[0])
    except ValueError:
        # Only a non-numeric first piece is expected here; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return -998
    
def process2(x):
    """Parse the upper bound of a ``'<start>-<end>'`` salary range string.

    Parameters
    ----------
    x : str
        Salary range text; the literal ``'nan'`` marks a missing value.

    Returns
    -------
    int
        -999 if ``x`` is ``'nan'``; the integer between the first and second
        ``'-'`` if present and parseable; -998 if there is no ``'-'`` or the
        piece does not parse as an integer.
    """
    if x == 'nan':
        return -999
    try:
        return int(x.split('-')[1])
    except (ValueError, IndexError):
        # IndexError: no '-' separator; ValueError: non-numeric upper bound.
        # Other exceptions (e.g. KeyboardInterrupt) must propagate.
        return -998
    

# Replace the raw 'salary_range' text with two integer features (lower and
# upper bound) in both splits, then drop the original column.
for frame in (train, test):
    salary_text = frame['salary_range'].astype('str')
    frame['salary_range_start'] = salary_text.apply(process1)
    frame['salary_range_end'] = salary_text.apply(process2)
    del frame['salary_range']

In [6]:
# Stack both splits so each categorical column is encoded with a single
# mapping shared by train and test. The original row indices are kept
# (no ignore_index), so they can still be used after the split below.
df = pd.concat([train, test])
del train, test

categorical_cols = ['department', 'employment_type', 'function', 'industry',
                    'location', 'required_education', 'required_experience', 'title']
for cat_col in tqdm(categorical_cols):
    encoder = LabelEncoder()
    # Cast to str so missing values become the category 'nan'.
    df[cat_col] = encoder.fit_transform(df[cat_col].astype(str))

# Rows that carry a label are training rows; the rest are test rows.
has_label = df['fraudulent'].notnull()
train = df[has_label].copy()
test = df[~has_label].copy()

del df
gc.collect()

100%|██████████| 8/8 [00:00<00:00, 98.85it/s]


34

In [7]:
def get_tfidf(train, test, colname, max_features):
    """Append TF-IDF features of a text column to both frames.

    A ``TfidfVectorizer`` (unigrams + bigrams, English stop words removed) is
    fitted on ``train[colname]`` only and then applied to both frames. Missing
    values are replaced by the string ``'nan'`` before vectorising.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``colname``. Both are modified in place.
    colname : str
        Name of the text column to vectorise.
    max_features : int
        Upper bound on the number of TF-IDF terms kept.

    Returns
    -------
    (train, test)
        The same two frames, each with new columns
        ``f'{colname}_tfidf{i}'`` for every term actually kept.
    """
    # min_df=1 keeps every term, exactly like the previous integer 0, but is
    # also accepted by scikit-learn versions that validate integer min_df >= 1.
    vectorizer = TfidfVectorizer(min_df=1,
                                 ngram_range=(1, 2),
                                 stop_words='english',
                                 max_features=max_features)

    train_text = train[colname].fillna('nan').tolist()
    test_text = test[colname].fillna('nan').tolist()

    # Dense ndarrays (not np.matrix) so column slices are plain 1-D arrays.
    train_matrix = vectorizer.fit_transform(train_text).toarray()
    test_matrix = vectorizer.transform(test_text).toarray()

    # The vocabulary can be smaller than max_features, so derive the number of
    # feature columns from the fitted matrix instead of assuming max_features.
    for i in range(train_matrix.shape[1]):
        feature_col = f'{colname}_tfidf{i}'
        # Assign raw arrays (positional) rather than Series: index alignment
        # would silently produce NaN when a frame's index is not 0..n-1.
        train[feature_col] = train_matrix[:, i]
        test[feature_col] = test_matrix[:, i]

    return train, test


# Text column -> number of TF-IDF terms to keep for that column.
tfidf_plan = [
    ('benefits', 12),
    ('company_profile', 24),
    ('description', 48),
    ('requirements', 20),
]
for text_col, n_terms in tfidf_plan:
    train, test = get_tfidf(train, test, text_col, n_terms)

In [8]:
# The raw free-text columns have been turned into word-count and TF-IDF
# features above, so remove them from both splits.
to_drop = ['benefits', 'company_profile', 'description', 'requirements']

train, test = (frame.drop(columns=to_drop) for frame in (train, test))

In [9]:
# Expose each frame's row index as an explicit 'id' column.
for frame in (train, test):
    frame['id'] = frame.index

# Displayed as the cell result: (train shape, test shape).
(train.shape, test.shape)

((17680, 124), (200, 124))

In [10]:
# Preview the first rows of the engineered training frame.
train.head(n=5)

Unnamed: 0,benefits_wordsLen,company_profile_wordsLen,department,description_wordsLen,employment_type,fraudulent,function,has_company_logo,has_questions,industry,location,required_education,required_experience,requirements_wordsLen,salary_range_end,salary_range_start,telecommuting,title,title_wordsLen,benefits_tfidf0,benefits_tfidf1,benefits_tfidf2,benefits_tfidf3,benefits_tfidf4,benefits_tfidf5,benefits_tfidf6,benefits_tfidf7,benefits_tfidf8,benefits_tfidf9,benefits_tfidf10,benefits_tfidf11,company_profile_tfidf0,company_profile_tfidf1,company_profile_tfidf2,company_profile_tfidf3,company_profile_tfidf4,company_profile_tfidf5,company_profile_tfidf6,company_profile_tfidf7,company_profile_tfidf8,company_profile_tfidf9,company_profile_tfidf10,company_profile_tfidf11,company_profile_tfidf12,company_profile_tfidf13,company_profile_tfidf14,company_profile_tfidf15,company_profile_tfidf16,company_profile_tfidf17,company_profile_tfidf18,company_profile_tfidf19,company_profile_tfidf20,company_profile_tfidf21,company_profile_tfidf22,company_profile_tfidf23,description_tfidf0,description_tfidf1,description_tfidf2,description_tfidf3,description_tfidf4,description_tfidf5,description_tfidf6,description_tfidf7,description_tfidf8,description_tfidf9,description_tfidf10,description_tfidf11,description_tfidf12,description_tfidf13,description_tfidf14,description_tfidf15,description_tfidf16,description_tfidf17,description_tfidf18,description_tfidf19,description_tfidf20,description_tfidf21,description_tfidf22,description_tfidf23,description_tfidf24,description_tfidf25,description_tfidf26,description_tfidf27,description_tfidf28,description_tfidf29,description_tfidf30,description_tfidf31,description_tfidf32,description_tfidf33,description_tfidf34,description_tfidf35,description_tfidf36,description_tfidf37,description_tfidf38,description_tfidf39,description_tfidf40,description_tfidf41,description_tfidf42,description_tfidf43,description_tfidf44,description_tfidf45,description_tfidf46,descrip
tion_tfidf47,requirements_tfidf0,requirements_tfidf1,requirements_tfidf2,requirements_tfidf3,requirements_tfidf4,requirements_tfidf5,requirements_tfidf6,requirements_tfidf7,requirements_tfidf8,requirements_tfidf9,requirements_tfidf10,requirements_tfidf11,requirements_tfidf12,requirements_tfidf13,requirements_tfidf14,requirements_tfidf15,requirements_tfidf16,requirements_tfidf17,requirements_tfidf18,requirements_tfidf19,id
0,82,114,49,104,2,0.0,23,1,1,86,788,6,6,72,-999,-999,0,6520,2,0.0,0.467817,0.517759,0.486184,0.0,0.0,0.0,0.0,0.526022,0.0,0.0,0.0,0.0,0.0,0.201,0.0,0.395883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.359037,0.232873,0.200779,0.0,0.0,0.0,0.0,0.0,0.261991,0.29288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.245878,0.0,0.364591,0.0,0.260746,0.325666,0.0,0.303185,0.0,0.0,0.0,0.0,0.0,0.0,0.312115,0.0,0.0,0.0,0.0,0.326553,0.0,0.0,0.0,0.310509,0.0,0.0,0.292853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.744641,0.0,0.667466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,10,79,431,165,1,0.0,12,1,1,22,1403,6,5,111,-999,-999,0,3679,3,0.0,0.0,0.0,0.656718,0.754137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.52961,0.0,0.0,0.472876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.467282,0.52683,0.0,0.0,0.0,0.0,0.0,0.116965,0.0,0.0,0.0,0.0,0.0,0.0,0.133494,0.38771,0.0,0.442036,0.0,0.39673,0.0,0.0,0.0,0.0,0.0,0.0,0.138828,0.134289,0.099286,0.0,0.0,0.0,0.247637,0.0,0.0,0.0,0.0,0.0,0.0,0.126252,0.0,0.0,0.0,0.0,0.0,0.545214,0.0,0.0,0.156471,0.0,0.111512,0.0,0.082373,0.0,0.501621,0.0,0.166066,0.0,0.175545,0.228821,0.423701,0.208787,0.21719,0.0,0.0,0.180581,0.0,0.27879,0.193091,0.185068,0.300812,0.189869,0.190502,0.150466,1
2,0,0,808,201,1,0.0,23,0,0,86,1979,1,6,38,130000,0,0,9367,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139291,0.0,0.153318,0.0,0.0,0.0,0.0,0.0,0.461716,0.15921,0.131603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236476,0.0,0.0,0.0,0.294905,0.0,0.0,0.149121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162321,0.328658,0.281607,0.186338,0.341494,0.0,0.337796,0.196193,0.136055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.429303,0.44658,0.721489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.309384,2
3,110,182,808,243,1,0.0,25,1,0,22,1445,1,0,255,-999,-999,0,7342,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.394999,0.638212,0.0,0.0,0.0,0.0,0.0,0.0,0.240403,0.0,0.207688,0.0,0.0,0.0,0.376103,0.0,0.0,0.196463,0.0,0.0,0.0,0.394565,0.0,0.0,0.210487,0.088168,0.088475,0.0,0.152748,0.0,0.0,0.354022,0.0,0.07568,0.0,0.135847,0.086617,0.0,0.085192,0.083578,0.064117,0.07755,0.190148,0.0,0.135989,0.0,0.0,0.0,0.678358,0.184414,0.100236,0.0,0.0,0.0,0.08139,0.0,0.0,0.0,0.078808,0.0,0.0,0.0,0.377999,0.0,0.107156,0.0,0.0,0.0,0.056412,0.078241,0.29246,0.183591,0.145233,0.0,0.307044,0.0,0.185273,0.365188,0.189943,0.0,0.51077,0.0,0.0,0.365723,0.168867,0.0,0.263074,0.0,0.0,0.263178,3
4,42,0,1029,134,4,0.0,32,0,1,72,2413,8,4,26,-999,-999,0,8243,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.09606,0.0,0.0,0.0,0.20913,0.120088,0.0,0.0,0.125351,0.103615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.279276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222864,0.0,0.11483,0.835844,0.0,0.0,0.0,0.0,0.0,0.110859,0.220064,0.0,0.0,0.0,0.077234,0.0,0.405656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.263459,0.0,0.0,0.438102,0.571475,0.338182,0.0,0.0,0.364896,0.0,0.0,0.0,4


In [11]:
# --- 5-fold stratified cross-validation with LightGBM ------------------------
# Target column and model inputs: every column except the target and the id.
ycol = 'fraudulent'
feature_names = [c for c in train.columns if c not in (ycol, 'id')]

# NOTE(review): `subsample` (bagging fraction) is set but `subsample_freq` is
# not; per the LightGBM parameter docs bagging is only active when the
# frequency is non-zero — confirm whether row subsampling was intended.
model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=32,
                           max_depth=6,
                           learning_rate=0.1,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=10,
                           reg_lambda=12,
                           random_state=1983,
                           is_unbalance=True,
                           metric='auc')


oof = []  # per-fold out-of-fold predictions (id, target, pred)
# Explicit copy: `prediction` gets a new column below and must not be a
# slice of `test` (avoids chained assignment on a derived frame).
prediction = test[['id']].copy()
# Float accumulator: each fold adds its predicted labels divided by n_splits.
prediction['fraudulent'] = 0.0
df_importance_list = []  # per-fold feature importances

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1983)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    X_train = train.iloc[trn_idx][feature_names]
    Y_train = train.iloc[trn_idx][ycol]

    X_val = train.iloc[val_idx][feature_names]
    Y_val = train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # Early stopping monitors AUC on the evaluation sets; the same estimator
    # object is re-fitted for every fold.
    # NOTE(review): the `verbose` / `early_stopping_rounds` fit keywords belong
    # to older LightGBM releases; newer releases expect callbacks — confirm the
    # pinned LightGBM version before upgrading.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    # `predict` returns hard class labels (not probabilities) for the held-out fold.
    pred_val = lgb_model.predict(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = train.iloc[val_idx][['id', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Accumulate this fold's test labels; after all folds the column holds the
    # fraction of folds that predicted the positive class.
    pred_test = lgb_model.predict(
        test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction['fraudulent'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Release per-fold objects before the next iteration.
    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[228]	train's auc: 0.999957	valid's auc: 0.989111


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[210]	train's auc: 0.999951	valid's auc: 0.986598


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[220]	train's auc: 0.999971	valid's auc: 0.989948


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[205]	train's auc: 0.999956	valid's auc: 0.990655


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[221]	train's auc: 0.999976	valid's auc: 0.988312


In [12]:
# Average each feature's importance over the folds and list the most
# important features first.
df_importance = (
    pd.concat(df_importance_list)
    .groupby(['column'])['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

Unnamed: 0,column,importance
0,location,379.8
1,industry,332.6
2,description_wordsLen,300.0
3,title,267.0
4,company_profile_wordsLen,186.4
5,requirements_wordsLen,184.8
6,department,165.2
7,function,144.4
8,benefits_wordsLen,140.4
9,description_tfidf46,91.4


In [13]:
# Combine the out-of-fold predictions of all folds into one frame.
df_oof = pd.concat(oof)

# The metric computed here is accuracy on integer-cast labels, not AUC, so it
# is reported under its real name. `score` is kept as the variable name
# because later cells may use it.
score = accuracy_score(df_oof[ycol].astype('int'), df_oof['pred'].astype('int'))
print('accuracy:', score)

auc: 0.9766968325791855


In [14]:
# Distribution of the fold-averaged test predictions.
prediction['fraudulent'].value_counts()

0.0    112
1.0     72
0.2      6
0.8      5
0.4      4
0.6      1
Name: fraudulent, dtype: int64

In [15]:
# Build the submission: a row is flagged as fraudulent (1) only when its
# averaged prediction equals 1, otherwise 0.
sub = prediction.copy(deep=True)
# The average is a floating-point sum of per-fold fractions, so compare with a
# tolerance: an exact `== 1` can fail purely from rounding (e.g. ten additions
# of 0.1 do not give exactly 1.0).
sub['fraudulent'] = np.isclose(sub['fraudulent'], 1.0).astype('int64')

print(sub.fraudulent.value_counts())

# The file is named after the validation score; written without header or index.
sub.to_csv('submissions/{}.csv'.format(score), index=False, header=False, encoding='utf-8')

0    128
1     72
Name: fraudulent, dtype: int64
