# Fake Job Postings

In [61]:
# Libraries used
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer
# nltk.download('stopwords')
# nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler#, OneHotEncoder 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

In [32]:
# We'll now upload the preprocessed
df_pre = pd.read_csv('fake_job_postings.csv')

In [37]:
df_pre.head(2)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [36]:
df_pre.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [56]:
# We'll select the textual data to use in training
df = df_pre[['company_profile','description','requirements','benefits','fraudulent']]

# Data Cleaning 

In [57]:
a = df[~df.description.isna()]
b = df[~df.company_profile.isna()] 
c = df[~df.requirements.isna()]
d = df[~df.benefits.isna()]
e = df[~df.isna()]
print('Original:          ',df.shape, df[df.fraudulent == 1].shape, df[df.fraudulent == 0].shape)
print('description:       ',a.shape, a[a.fraudulent == 1].shape, a[a.fraudulent == 0].shape)
print('company_profile:   ',b.shape, b[b.fraudulent == 1].shape, b[b.fraudulent == 0].shape)
print('requirements:      ',c.shape, c[c.fraudulent == 1].shape, c[c.fraudulent == 0].shape)
print('benefits:          ',d.shape, d[d.fraudulent == 1].shape, d[d.fraudulent == 0].shape)
print('All:               ',e.shape, e[e.fraudulent == 1].shape, e[e.fraudulent == 0].shape)

Original:           (17880, 5) (866, 5) (17014, 5)
description:        (17879, 5) (865, 5) (17014, 5)
company_profile:    (14572, 5) (279, 5) (14293, 5)
requirements:       (15185, 5) (712, 5) (14473, 5)
benefits:           (10670, 5) (502, 5) (10168, 5)
All:                (17880, 5) (866, 5) (17014, 5)


We have to be carefull when selecting which columns to process and how to deal with Nans to avoid significant drop in minority data

In [58]:
# Lets fill the nan values with 'Missing'
df.fillna('Missing', inplace=True)

Now, lets drop all stop words and stem our words and get it tokenized

PorterStemmer does not often generate stems that are actual English words. It does not keep a lookup table for actual stems of the word but applies algorithmic rules to generate stems. It uses the rules to decide whether it is wise to strip a suffix.

The LancasterStemmer (Paice-Husk stemmer) is an iterative algorithm with rules saved externally. One table containing about 120 rules indexed by the last letter of a suffix. On each iteration, it tries to find an applicable rule by the last character of the word. Each rule specifies either a deletion or replacement of an ending. If there is no such rule, it terminates. It also terminates if a word starts with a vowel and there are only two letters left or if a word starts with a consonant and there are only three characters left. Otherwise, the rule is applied, and the process repeats.

In [4]:
df.head(3)

In [80]:
# We will first convert all texts to lowercase 
df.apply(lambda x: x.astype(str).str.lower())

Unnamed: 0,company_profile,description,requirements,benefits,fraudulent
0,"we're food52, and we've created a groundbreaki...","food52, a fast-growing, james beard award-winn...",experience with content management systems a m...,missing,0
1,"90 seconds, the worlds cloud video production ...",organised - focused - vibrant - awesome!do you...,what we expect from you:your key responsibilit...,what you will get from usthrough being part of...,0
2,valor services provides workforce solutions th...,"our client, located in houston, is actively se...",implement pre-commissioning and commissioning ...,missing,0
3,our passion for improving quality of life thro...,the company: esri – environmental systems rese...,"education: bachelor’s or master’s in gis, busi...",our culture is anything but corporate—we have ...,0
4,spotsource solutions llc is a global human cap...,job title: itemization review managerlocation:...,qualifications:rn license in the state of texa...,full benefits offered,0
...,...,...,...,...,...
17875,vend is looking for some awesome new talent to...,just in case this is the first time you’ve vis...,to ace this role you:will eat comprehensive st...,what can you expect from us?we have an open cu...,0
17876,weblinc is the e-commerce platform and service...,the payroll accountant will focus primarily on...,- b.a. or b.s. in accounting- desire to have f...,health &amp; wellnessmedical planprescription ...,0
17877,we provide full time permanent positions for m...,experienced project cost control staff enginee...,at least 12 years professional experience.abil...,missing,0
17878,missing,nemsia studios is looking for an experienced v...,1. must be fluent in the latest versions of co...,competitive salary (compensation will be based...,0


In [40]:
lancaster=LancasterStemmer()
stops = set(stopwords.words("english"))

def identify_tokens(row):
    review = row
    tokens = nltk.word_tokenize(review)
    token_words = [w for w in tokens if w.isalpha()]
    meaningful_words = [w for w in token_words if not w in stops]
    stemmed_list = [lancaster.stem(word) for word in meaningful_words]
    joined_words = (' '.join(stemmed_list))
    
    return joined_words

In [5]:
df['processed_company_profile'] = df['company_profile'].apply(identify_tokens)
df['processed_description'] = df['description'].apply(identify_tokens)
df['processed_requirements'] = df['requirements'].apply(identify_tokens)
df['processed_benefits'] = df['benefits'].apply(identify_tokens)

In [83]:
df.to_csv('Cleaned_data.csv')

In [41]:
df = pd.read_csv('Cleaned_data.csv', index_col=0)

In [42]:
df.columns

Index(['company_profile', 'description', 'requirements', 'benefits',
       'fraudulent', 'processed_company_profile', 'processed_description',
       'processed_requirements', 'processed_benefits'],
      dtype='object')

In [43]:
y = df['fraudulent']

In [12]:
# # TRY THIS AGAIN
# tfidf = TfidfVectorizer(decode_error='ignore')
# X = tfidf.fit_transform(df[['processed_company_profile',
#                             'processed_description',
#                             'processed_requirements',
#                             'processed_benefits']].values.reshape(1,-1).astype('str'))
# X.shape

In [44]:
tfidf = TfidfVectorizer(decode_error='ignore')
X1 = tfidf.fit_transform(df['processed_company_profile'].values.astype('str'))
X2 = tfidf.fit_transform(df['processed_description'].values.astype('str'))
X3 = tfidf.fit_transform(df['processed_requirements'].values.astype('str'))
X4 = tfidf.fit_transform(df['processed_benefits'].values.astype('str'))

# Model selection

## Create functions for our models

In [67]:
# Logistic Regression Model

def log_reg_model(X_train, X_test, y_train, y_test):
    model = LogisticRegression(max_iter=10000)
    
#     param_grid = {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
#                   'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#                   'C':range(1,1000)}
    param_grid = {'C':range(1,1000)}
    grid = GridSearchCV(model, param_grid, cv=10, scoring='recall')
    tqdm(grid.fit(X_train, y_train))
    
    print("Best params:       ", grid.best_params_)
    print("Best estimator:    ", grid.best_estimator_)
    print("Best score:        ", grid.best_score_)
    
    final_model = grid.best_estimator_
    y_pred = final_model.predict(X_test)
    
    print('Score:             ', final_model.score(X_train, y_train))
    print('roc_auc_score:     ', roc_auc_score(y_test, y_pred))
    print('precision_score:   ', precision_score(y_test, y_pred))
    print('accuracy_score:    ', accuracy_score(y_test, y_pred))
    print('recall_score:      ', recall_score(y_test, y_pred))
    print('f1_score:          ', f1_score(y_test, y_pred))
    
    #return 

In [68]:
def knn_model(X_train, X_test, y_train, y_test):
    model = KNeighborsClassifier()
    k_range = list(range(1, 101))
    weight_options = ['uniform', 'distance']
    
    param_grid = dict(n_neighbors=k_range, weights=weight_options)
    
    grid = GridSearchCV(model, param_grid, cv=10, scoring='recall')
    grid.fit(X_train, y_train)
    
    print("Best params: ", grid.best_params_)
    print("Best estimator: ", grid.best_estimator_)
    print("Best score: ", grid.best_score_)
    
    knn = grid.best_estimator_
    y_pred = knn.predict(X_test)
    
    print('Score             :', knn.score(X_train, y_train))
    print('roc_auc_score     :', roc_auc_score(y_test, y_pred))
    print('precision_score   :', precision_score(y_test, y_pred))
    print('accuracy_score    :', accuracy_score(y_test, y_pred))
    print('recall_score      :', recall_score(y_test, y_pred))
    print('f1_score          :', f1_score(y_test, y_pred))

In [69]:
def D_T_C_model(X_train, X_test, y_train, y_test):
    
    dtc = DecisionTreeClassifier().fit(X_train,y_train)
    y_pred = dtc.predict(X_test)
    
    print('Score             :', dtc.score(X_train, y_train))
    print('roc_auc_score     :', roc_auc_score(y_test, y_pred))
    print('precision_score   :', precision_score(y_test, y_pred))
    print('accuracy_score    :', accuracy_score(y_test, y_pred))
    print('recall_score      :', recall_score(y_test, y_pred))
    print('f1_score          :', f1_score(y_test, y_pred))
    

In [88]:
def R_F_C_model(X_train, X_test, y_train, y_test):
    
    rfc = RandomForestClassifier(n_estimators=100).fit(X_train,y_train)
    y_pred = rfc.predict(X_test)
    
    print('Score             :', rfc.score(X_train, y_train))
    print('roc_auc_score     :', roc_auc_score(y_test, y_pred))
    print('precision_score   :', precision_score(y_test, y_pred))
    print('accuracy_score    :', accuracy_score(y_test, y_pred))
    print('recall_score      :', recall_score(y_test, y_pred))
    print('f1_score          :', f1_score(y_test, y_pred))
    

## Trying company_profile (X1)

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2)

In [87]:
log_reg_model(X_train, X_test, y_train, y_test)

In [None]:
knn_model(X_train, X_test, y_train, y_test)

In [72]:
D_T_C_model(X_train, X_test, y_train, y_test)

Score             : 0.9668624161073825
roc_auc_score     : 0.6450617283950617
precision_score   : 1.0
accuracy_score    : 0.9678411633109619
recall_score      : 0.29012345679012347
f1_score          : 0.44976076555023925


In [92]:
R_F_C_model(X_train, X_test, y_train, y_test)

Score             : 0.9667925055928411
roc_auc_score     : 0.6512345679012346
precision_score   : 1.0
accuracy_score    : 0.968400447427293
recall_score      : 0.30246913580246915
f1_score          : 0.46445497630331756


## Trying description (X2)

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.2)

In [None]:
log_reg_model(X_train, X_test, y_train, y_test)

In [None]:
knn_model(X_train, X_test, y_train, y_test)

In [77]:
D_T_C_model(X_train, X_test, y_train, y_test)

Score             : 0.9999300894854586
roc_auc_score     : 0.7775912838633687
precision_score   : 0.723404255319149
accuracy_score    : 0.9672818791946308
recall_score      : 0.5666666666666667
f1_score          : 0.6355140186915887


In [None]:
R_F_C_model(X_train, X_test, y_train, y_test)

## Trying requirements (X3)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X3, y, test_size=0.2)

In [None]:
log_reg_model(X_train, X_test, y_train, y_test)

In [None]:
knn_model(X_train, X_test, y_train, y_test)

In [80]:
D_T_C_model(X_train, X_test, y_train, y_test)

Score             : 0.990981543624161
roc_auc_score     : 0.7356473879462385
precision_score   : 0.6829268292682927
accuracy_score    : 0.9639261744966443
recall_score      : 0.4827586206896552
f1_score          : 0.5656565656565657


In [81]:
R_F_C_model(X_train, X_test, y_train, y_test)

Score             : 0.990981543624161
roc_auc_score     : 0.7182438322285066
precision_score   : 0.987012987012987
accuracy_score    : 0.9723154362416108
recall_score      : 0.4367816091954023
f1_score          : 0.6055776892430279


## Trying benefits (X4)

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X4, y, test_size=0.2)

In [None]:
log_reg_model(X_train, X_test, y_train, y_test)

In [None]:
knn_model(X_train, X_test, y_train, y_test)

In [83]:
D_T_C_model(X_train, X_test, y_train, y_test)

Score             : 0.9778383668903803
roc_auc_score     : 0.7287684439973173
precision_score   : 0.7722772277227723
accuracy_score    : 0.968400447427293
recall_score      : 0.4642857142857143
f1_score          : 0.5799256505576209


In [84]:
R_F_C_model(X_train, X_test, y_train, y_test)

Score             : 0.9778383668903803
roc_auc_score     : 0.7309691482226693
precision_score   : 0.9069767441860465
accuracy_score    : 0.9725950782997763
recall_score      : 0.4642857142857143
f1_score          : 0.6141732283464567


In [86]:
# Add features together, get best compination
# Add features from df_pre
# Try different models