# Fake Job Postings

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Libraries used
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer
nltk.download('stopwords')
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, roc_auc_score, f1_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder#, OneHotEncoder 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import matplotlib.pyplot as plt

# Data Cleaning

In [None]:
# Load the raw job-postings dataset from a CSV file located next to the notebook.
df = pd.read_csv('fake_job_postings.csv')

In [None]:
# Preview the first two rows of the raw frame.
df.head(2)

In [None]:
# List the column names of the raw frame.
df.columns

In [None]:
# We have to be careful when choosing how to deal with NaNs, to avoid a
# significant drop in the minority (fraudulent) class.
# For each candidate column we keep only the rows where that column is present
# and print three shapes: all rows, fraudulent rows, non-fraudulent rows.
def _class_shapes(frame):
    """Return the shapes of `frame`, of its fraudulent rows and of its other rows."""
    return (frame.shape,
            frame[frame.fraudulent == 1].shape,
            frame[frame.fraudulent == 0].shape)


nan_report_columns = {
    'Description': 'description',
    'Company Profile': 'company_profile',
    'Requirements': 'requirements',
    'Benefits': 'benefits',
    'Salary Range': 'salary_range',
    'Telecommuting': 'telecommuting',
    'Has Company Logo': 'has_company_logo',
    'Has Questions': 'has_questions',
    'Employment Type': 'employment_type',
    'Required Education': 'required_education',
    'Required Experience': 'required_experience',
}

print('Original:'.ljust(20), *_class_shapes(df))
for report_label, column_name in nan_report_columns.items():
    rows_with_value = df[df[column_name].notna()]
    print(f'{report_label}:'.ljust(20), *_class_shapes(rows_with_value))

# `df[~df.isna()]` only masks individual cells and keeps every row, so it would
# report the original shape again. `dropna()` keeps only the rows that have a
# value in every column, which is what the "All" line is meant to show.
print('All:'.ljust(20), *_class_shapes(df.dropna()))

In [None]:
# Replace every missing value in the frame with the placeholder string 'Missing'.
df = df.fillna('Missing')

In [None]:
# Combine all free-text columns into a single 'texts' column to ease the NLP
# process; the parts are joined with '.. ' in a fixed order.
df['texts'] = df['company_profile'].str.cat(
    [df['description'], df['requirements'], df['benefits']],
    sep='.. ',
)

In [None]:
# Drop the columns that are not used from here on: the four text columns that
# were merged into 'texts', plus the remaining listed descriptive/id columns.
df = df.drop(
    columns=[
        'company_profile', 'description', 'requirements', 'benefits',
        'location', 'title', 'industry', 'department', 'function', 'job_id',
    ]
)
df.head()

In [None]:
# Display the salary_range column as it currently stands.
df['salary_range']

In [None]:
# Turn the salary range text ("low-high") into the absolute width of the range.
# Entries that cannot be parsed that way are assigned 0.
def _salary_spread(value):
    """Return |low - high| for a 'low-high' string, or 0 when it cannot be parsed.

    Only the first two '-'-separated parts are considered. The expected parse
    failures are caught explicitly instead of using a bare `except`:
    AttributeError (value is not a string), IndexError (no '-' separator) and
    ValueError (a part is not an integer).
    """
    try:
        bounds = value.split('-')
        return abs(int(bounds[0]) - int(bounds[1]))
    except (AttributeError, IndexError, ValueError):
        return 0


df['salary_range'] = df['salary_range'].map(_salary_spread)
df['salary_range']

In [None]:
# Print the distinct values of each remaining feature column so that any
# leftover 'Missing' placeholder becomes visible.
for column_name in ('telecommuting',
                    'has_company_logo',
                    'has_questions',
                    'employment_type',
                    'required_experience',
                    'required_education'):
    print(df[column_name].unique())

In [None]:
# Map the 'Missing' placeholder in employment_type to the existing 'Other' type.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `df.employment_type` mutates an intermediate object, which raises a warning
# in recent pandas and leaves `df` unchanged under Copy-on-Write.
df['employment_type'] = df['employment_type'].replace('Missing', 'Other')

In [None]:
# Map the 'Missing' placeholder in required_experience to 'Not Applicable'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `df.required_experience` mutates an intermediate object, which raises a
# warning in recent pandas and leaves `df` unchanged under Copy-on-Write.
df['required_experience'] = df['required_experience'].replace('Missing', 'Not Applicable')

In [None]:
# Consolidate the required_education categories:
#   * both 'Vocational - ...' variants become 'Vocational';
#   * the 'Missing' placeholder becomes 'Unspecified';
#   * 'Some High School Coursework' is folded into 'High School or equivalent'.
# A single mapping is applied and assigned back to the column; the chained
# `df.required_education.replace(..., inplace=True)` form raises a warning in
# recent pandas and leaves `df` unchanged under Copy-on-Write.
education_mapping = {
    'Vocational - Degree': 'Vocational',
    'Vocational - HS Diploma': 'Vocational',
    'Missing': 'Unspecified',
    'Some High School Coursework': 'High School or equivalent',
}
df['required_education'] = df['required_education'].replace(education_mapping)

In [None]:
# Encode the three categorical columns as numeric codes with OrdinalEncoder so
# that they can be fed to the models.
categorical_columns = ['employment_type', 'required_experience', 'required_education']
encoder = OrdinalEncoder()
df[categorical_columns] = encoder.fit_transform(df[categorical_columns])

## Dealing with Textual data

In [None]:
# Preview the first two rows after cleaning and encoding.
df.head(2)

Now, let's drop all stop words, stem the remaining words, and tokenize the text. There are two main stemmers we could use for this; both are described below. This notebook uses the LancasterStemmer.

PorterStemmer does not often generate stems that are actual English words. It does not keep a lookup table for actual stems of the word but applies algorithmic rules to generate stems. It uses the rules to decide whether it is wise to strip a suffix.

The LancasterStemmer (Paice-Husk stemmer) is an iterative algorithm with rules saved externally. It uses one table containing about 120 rules, indexed by the last letter of a suffix. On each iteration, it tries to find an applicable rule by the last character of the word. Each rule specifies either a deletion or replacement of an ending. If there is no such rule, it terminates. It also terminates if a word starts with a vowel and there are only two letters left or if a word starts with a consonant and there are only three characters left. Otherwise, the rule is applied, and the process repeats.

In [None]:
# Convert the combined text to lowercase before tokenizing, so that the stop-word
# filter (a case-sensitive membership test) also catches capitalised words.
# The result has to be assigned back; calling `df.apply(...)` without storing the
# result leaves `df` untouched. Only 'texts' is lowercased so the numeric feature
# columns keep their dtypes.
df['texts'] = df['texts'].str.lower()
df['texts'].head()

In [None]:
# Shared helpers for the text clean-up: one stemmer instance and the English
# stop-word list as a set for fast membership tests.
lancaster = LancasterStemmer()
stops = set(stopwords.words("english"))


def identify_tokens(row):
    """Tokenize a text, drop non-alphabetic tokens and stop words, and stem the rest.

    Parameters
    ----------
    row : str
        The text to clean.

    Returns
    -------
    str
        The Lancaster stems of the kept tokens, joined by single spaces.
    """
    stems = (
        lancaster.stem(token)
        for token in nltk.word_tokenize(row)
        # Keep purely alphabetic tokens that are not stop words.
        if token.isalpha() and token not in stops
    )
    return ' '.join(stems)

In [None]:
# Run the tokenize / filter / stem clean-up on every combined text.
df['texts'] = df['texts'].map(identify_tokens)

In [None]:
# Persist the cleaned frame (index included) so the steps above need not be re-run.
df.to_csv('Cleaned_data.csv') # cloud database

In [None]:
# Reload the cleaned data; the first CSV column is used as the index.
# NOTE(review): texts that became empty strings are presumably read back as NaN — confirm.
df = pd.read_csv('Cleaned_data.csv', index_col=0)

In [None]:
# List the columns of the reloaded, cleaned frame.
df.columns

In [None]:
# Reduce the class imbalance by undersampling: keep every fraudulent posting and
# add a random subset of the non-fraudulent ones.
N_GENUINE_SAMPLES = 5000   # how many non-fraudulent postings to keep
SAMPLE_SEED = 42           # fixed seed so the subset is reproducible

fraud_rows = df[df.fraudulent == 1]
genuine_rows = df[df.fraudulent == 0]

# `sample` draws without replacement, so no posting appears twice (duplicates
# could otherwise end up in both the training and the test split), and it
# selects rows directly instead of passing index labels to `iloc`.
genuine_subset = genuine_rows.sample(
    n=min(N_GENUINE_SAMPLES, len(genuine_rows)),
    random_state=SAMPLE_SEED,
)

X_ = pd.concat([fraud_rows, genuine_subset])
X_.head()

In [None]:
# Move the target column out of the sampled frame: `pop` returns it as `y` and
# removes it from `X_` in one step.
y = X_.pop('fraudulent')

In [None]:
# Vectorize the cleaned texts with TF-IDF.
# The vectorizer is fitted on the sampled frame `X_` (not on the full `df`) so
# that the resulting matrix has one row per sampled posting, in the same order
# as `X_` and `y`.
tfidf = TfidfVectorizer(decode_error='ignore')
# Cast to str so that non-string entries (e.g. NaN) are vectorized as text
# instead of raising.
X = tfidf.fit_transform(X_['texts'].values.astype('str'))

In [None]:
# Convert the TF-IDF result to a dense array and wrap it in a DataFrame
# (default integer row and column labels).
X = pd.DataFrame(data=X.toarray())

In [None]:
# Append the non-text features and the target to the TF-IDF frame.
meta_columns = ['salary_range', 'telecommuting', 'has_company_logo',
                'has_questions', 'employment_type', 'required_experience',
                'required_education']

# Rows are matched by position, so both frames must describe the same postings
# in the same order.
if len(X) != len(X_):
    raise ValueError(
        f'TF-IDF frame has {len(X)} rows but the sampled frame has {len(X_)}; '
        'they must be built from the same rows.'
    )

# `X` carries a fresh 0..n-1 index while `X_` keeps its original row labels, so
# a plain column assignment would align on labels and mix up / blank out rows.
# Re-label the metadata with X's index to attach it positionally instead.
side_features = X_[meta_columns].set_axis(X.index, axis=0)
# 'fraudulent' was already removed from `X_`; the target lives in `y`.
target = y.set_axis(X.index).rename('fraudulent')

X = pd.concat([X, side_features, target], axis=1)

In [None]:
# Show the first row of the assembled feature frame.
X.head(1)

In [None]:
# Persist the assembled feature frame (index included) to disk.
X.to_csv('X.csv')

In [None]:
# Reload the feature frame, using the first CSV column as the index.
# Note that this rebinds `df`: from here on it holds the model-ready features.
df = pd.read_csv('X.csv', index_col=0)

In [None]:
# Preview the first rows of the reloaded feature frame.
df.head()

In [None]:
# Print a summary (index, columns, dtypes, memory usage) of the feature frame.
df.info()

# Model selection

## Create functions for our models

In [None]:
# Logistic Regression Model

def log_reg_model(X_train, X_test, y_train, y_test, param_grid=None, cv=10):
    """Tune a logistic regression with a recall-scored grid search and print metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test binary labels.
    param_grid : dict, optional
        Grid passed to GridSearchCV. Defaults to ``{'C': range(1, 1000)}``;
        pass a smaller grid (or other hyper-parameters such as ``penalty`` /
        ``solver``) to shorten or widen the search.
    cv : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    None
        Search results and evaluation metrics are printed.
    """
    if param_grid is None:
        param_grid = {'C': range(1, 1000)}

    model = LogisticRegression(max_iter=10000)
    grid = GridSearchCV(model, param_grid, cv=cv, scoring='recall')
    grid.fit(X_train, y_train)

    print("Best params:       ", grid.best_params_)
    print("Best estimator:    ", grid.best_estimator_)
    print("Best score:        ", grid.best_score_)

    final_model = grid.best_estimator_
    y_pred = final_model.predict(X_test)
    # ROC AUC is a ranking metric: it needs scores (here the predicted
    # probability of the positive class), not thresholded 0/1 predictions.
    y_score = final_model.predict_proba(X_test)[:, 1]

    # 'Score' is the accuracy on the training data; the rest use the test data.
    print('Score:             ', final_model.score(X_train, y_train))
    print('roc_auc_score:     ', roc_auc_score(y_test, y_score))
    print('precision_score:   ', precision_score(y_test, y_pred))
    print('accuracy_score:    ', accuracy_score(y_test, y_pred))
    print('recall_score:      ', recall_score(y_test, y_pred))
    print('f1_score:          ', f1_score(y_test, y_pred))

In [None]:
def knn_model(X_train, X_test, y_train, y_test, param_grid=None, cv=10):
    """Tune a k-nearest-neighbours classifier with a recall-scored grid search and print metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test binary labels.
    param_grid : dict, optional
        Grid passed to GridSearchCV. Defaults to ``n_neighbors`` in 1..100
        combined with ``weights`` in ('uniform', 'distance').
    cv : int, optional
        Number of cross-validation folds (default 10).

    Returns
    -------
    None
        Search results and evaluation metrics are printed.
    """
    if param_grid is None:
        param_grid = dict(n_neighbors=list(range(1, 101)),
                          weights=['uniform', 'distance'])

    grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cv, scoring='recall')
    grid.fit(X_train, y_train)

    print("Best params: ", grid.best_params_)
    print("Best estimator: ", grid.best_estimator_)
    print("Best score: ", grid.best_score_)

    knn = grid.best_estimator_
    y_pred = knn.predict(X_test)
    # ROC AUC is a ranking metric: it needs scores (here the predicted
    # probability of the positive class), not thresholded 0/1 predictions.
    y_score = knn.predict_proba(X_test)[:, 1]

    # 'Score' is the accuracy on the training data; the rest use the test data.
    print('Score             :', knn.score(X_train, y_train))
    print('roc_auc_score     :', roc_auc_score(y_test, y_score))
    print('precision_score   :', precision_score(y_test, y_pred))
    print('accuracy_score    :', accuracy_score(y_test, y_pred))
    print('recall_score      :', recall_score(y_test, y_pred))
    print('f1_score          :', f1_score(y_test, y_pred))

In [None]:
def D_T_C_model(X_train, X_test, y_train, y_test, random_state=None):
    """Fit a decision tree with default hyper-parameters and print metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test binary labels.
    random_state : int or None, optional
        Seed forwarded to DecisionTreeClassifier; pass an int for
        reproducible results (default None, i.e. unseeded).

    Returns
    -------
    None
        Evaluation metrics are printed.
    """
    dtc = DecisionTreeClassifier(random_state=random_state).fit(X_train, y_train)
    y_pred = dtc.predict(X_test)
    # ROC AUC is a ranking metric: it needs scores (here the predicted
    # probability of the positive class), not thresholded 0/1 predictions.
    y_score = dtc.predict_proba(X_test)[:, 1]

    # 'Score' is the accuracy on the training data; the rest use the test data.
    print('Score             :', dtc.score(X_train, y_train))
    print('roc_auc_score     :', roc_auc_score(y_test, y_score))
    print('precision_score   :', precision_score(y_test, y_pred))
    print('accuracy_score    :', accuracy_score(y_test, y_pred))
    print('recall_score      :', recall_score(y_test, y_pred))
    print('f1_score          :', f1_score(y_test, y_pred))
    

In [None]:
def R_F_C_model(X_train, X_test, y_train, y_test, random_state=None):
    """Fit a 100-tree random forest and print metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test binary labels.
    random_state : int or None, optional
        Seed forwarded to RandomForestClassifier; pass an int for
        reproducible results (default None, i.e. unseeded).

    Returns
    -------
    None
        Evaluation metrics are printed.
    """
    rfc = RandomForestClassifier(n_estimators=100, random_state=random_state)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    # ROC AUC is a ranking metric: it needs scores (here the predicted
    # probability of the positive class), not thresholded 0/1 predictions.
    y_score = rfc.predict_proba(X_test)[:, 1]

    # 'Score' is the accuracy on the training data; the rest use the test data.
    print('Score             :', rfc.score(X_train, y_train))
    print('roc_auc_score     :', roc_auc_score(y_test, y_score))
    print('precision_score   :', precision_score(y_test, y_pred))
    print('accuracy_score    :', accuracy_score(y_test, y_pred))
    print('recall_score      :', recall_score(y_test, y_pred))
    print('f1_score          :', f1_score(y_test, y_pred))
    

## Fitting Experiments

In [None]:
# Separate features from the target. `columns=` is required here: a bare
# `df.drop('fraudulent')` looks for a *row* labelled 'fraudulent' and raises.
X = df.drop(columns='fraudulent')
y = df['fraudulent']

# Hold out 20% for testing. Stratifying keeps the fraud ratio equal in both
# splits, and the fixed seed makes the split (and the reported metrics) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Grid-search and evaluate the logistic regression on the split above (prints its metrics).
log_reg_model(X_train, X_test, y_train, y_test)

In [None]:
# Grid-search and evaluate the k-nearest-neighbours classifier on the same split (prints its metrics).
knn_model(X_train, X_test, y_train, y_test)

In [None]:
# Fit and evaluate the decision tree on the same split (prints its metrics).
D_T_C_model(X_train, X_test, y_train, y_test)

In [None]:
# Fit and evaluate the random forest on the same split (prints its metrics).
R_F_C_model(X_train, X_test, y_train, y_test)