In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# Running this cell (by clicking run or pressing Shift+Enter) walks the input directory and keeps the last file found in `path`

import os
# Walk the Kaggle input tree; `path` is rebound for every file visited, so it
# ends up pointing at the last file found (and stays undefined if there is none).
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        path = os.path.join(folder, file_name)
        

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the CSV that `path` points to and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head()

In [None]:
import matplotlib.pyplot as plt

# Encode "is a company profile present?" as 0/1. The column is overwritten with
# its own flag, so only convert while it still holds the raw values: running the
# conversion on the already-encoded column would turn every row into 1.
if not pd.api.types.is_integer_dtype(df['company_profile']):
    df['company_profile'] = df['company_profile'].notnull().astype('int')

# `salary_range` is dropped by a later cell; on a re-run keep the flag that was
# already derived instead of failing on the missing column.
if 'salary_range' in df.columns:
    df['has_salary_range'] = df['salary_range'].notnull().astype('int')

# Independent copies per class, so later edits to `df` do not leak into them.
f_df = df[df['fraudulent'] == 1].copy()  # fraudulent postings
r_df = df[df['fraudulent'] == 0].copy()  # real postings

Feature Extraction

In [None]:
import seaborn as sns
def feature_extraction(column, normalize):
    """Compare how often a flag column is set in real vs. fraudulent postings.

    Reads the module-level ``r_df`` and ``f_df`` frames, prints both percentages
    and draws them as a two-bar chart titled with the column name.

    Parameters
    ----------
    column : str
        Column present in both frames. The percentage is ``sum / len * 100``,
        so the column is expected to hold 0/1 values.
    normalize : bool
        When truthy, fix the y-axis ticks to 0..100 in steps of 10 so charts of
        different features share one scale.
    """
    real = round(((r_df[column].sum() / len(r_df)) * 100), 2)
    fake = round(((f_df[column].sum() / len(f_df)) * 100), 2)
    print(f'Feature : {column}  real jobs percentage: {real}  fake jobs percentage: {fake}')

    shares = pd.DataFrame({"real_jobs": real, "fake_jobs": fake}, index=[0])

    # A dedicated figure per call keeps several calls in one cell from drawing
    # over each other on the implicit current axes.
    _, ax = plt.subplots()
    sns.barplot(data=shares, ax=ax)
    # The former `label=column` never showed up (no legend was drawn); put the
    # feature name in the title so the chart can be read on its own.
    ax.set_title(column)
    ax.set_ylabel("% of postings")

    if normalize:
        ax.set_yticks(np.arange(0, 110, 10))
    
# Share of real vs. fake postings with a company logo, on the fixed 0-100 scale.
feature_extraction(column="has_company_logo", normalize=True)

In [None]:
# Share of real vs. fake postings flagged as telecommuting (auto-scaled axis).
feature_extraction(column="telecommuting", normalize=False)

In [None]:
# Share of real vs. fake postings with `has_questions` set (auto-scaled axis).
feature_extraction(column="has_questions", normalize=False)

In [None]:
# Share of real vs. fake postings that have a company profile, on the 0-100 scale.
feature_extraction(column="company_profile", normalize=True)

In [None]:
# Share of real vs. fake postings that state a salary range, on the 0-100 scale.
feature_extraction(column="has_salary_range", normalize=True)

In [None]:
# Columns considered irrelevant for the analysis.
irrelevant_columns = [
    'job_id',
    'location',
    'title',
    'department',
    'salary_range',
]

In [None]:
# r_df = r_df.fillna('NO VALUE')
# f_df = f_df.fillna('NO VALUE')

In [None]:
def feature_table(column_name):
    """Print, per category of ``column_name``, its share among real and fake postings.

    Reads the module-level ``r_df`` (real) and ``f_df`` (fraudulent) frames.
    Shares are percentages of *all* rows of the respective frame, so rows with a
    missing value still count in the denominator. The printed table is sorted
    by the absolute difference between the two shares, largest first.

    Parameters
    ----------
    column_name : str
        Column present in both frames whose distinct values are compared.
    """
    # Work on the value_counts() Series directly. Its name depends on the pandas
    # version (the column name before 2.0, "count" from 2.0 on), so looking the
    # counts up by `column_name` in a wrapped DataFrame raises KeyError on 2.x.
    real_share = (r_df[column_name].value_counts() / len(r_df) * 100).rename('%_real_jobs')
    fake_share = (f_df[column_name].value_counts() / len(f_df) * 100).rename('%_fake_jobs')

    # Outer alignment keeps categories that occur in only one of the two groups;
    # an absent category has a 0% share, which keeps `diff` defined for every row.
    job_func_df = pd.concat([real_share, fake_share], axis=1).fillna(0)
    job_func_df['diff'] = (job_func_df['%_real_jobs'] - job_func_df['%_fake_jobs']).abs()
    print(job_func_df.sort_values(by='diff', ascending=False))
    
    
    

In [None]:
# Real vs. fake share of each value of `function`.
feature_table(column_name='function')

In [None]:
# Real vs. fake share of each value of `industry`.
feature_table(column_name='industry')

In [None]:
# Real vs. fake share of each value of `required_experience`.
feature_table(column_name='required_experience')

In [None]:
# Real vs. fake share of each value of `required_education`.
feature_table(column_name='required_education')

In [None]:
# Real vs. fake share of each value of `title`.
feature_table(column_name='title')

In [None]:
# Real vs. fake share of each value of `employment_type`.
feature_table(column_name='employment_type')

In [None]:
# Remove the columns judged irrelevant for modelling.
irrelevant_columns = ['job_id', 'location', 'title', 'department', 'salary_range']
# Rebind instead of mutating in place and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=irrelevant_columns, errors='ignore')

In [None]:
# Encode education levels as integers: 0 stands for "not specified" (missing),
# every other level gets the next free code in order of first appearance.
# NOTE(review): why the codes start at 5 is not evident from this notebook.
education = df['required_education'].fillna(0)

# Named `education_codes` rather than `dict`, which shadowed the builtin for
# every later cell.
education_codes = {0: 0}
next_code = 5
for level in education.unique():
    if level in education_codes:
        # Keep the reserved 0 -> 0 entry; the previous loop overwrote it, so
        # "missing" received an arbitrary code instead of 0.
        continue
    education_codes[level] = next_code
    next_code += 1

df['required_education'] = education.map(education_codes)



In [None]:
# Columns kept for modelling: the flag features, the encoded education level
# and the `fraudulent` target.
relevant_columns = [
    'company_profile',
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'has_salary_range',
    'fraudulent',
    'required_education',
]
binary_df = df.loc[:, relevant_columns]
binary_df.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
import warnings

warnings.filterwarnings("ignore", category = FutureWarning) 



# Target and inputs.
labels = binary_df['fraudulent']
features = binary_df.drop(columns='fraudulent')

# 60 / 20 / 20 split: hold out 40% first, then halve that hold-out into the
# test and validation sets (both steps seeded with 42).
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.4, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=42
)

In [None]:
# Baseline: a default random forest scored with 5-fold cross-validation on the
# training split.
rf = RandomForestClassifier()
scores = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=5)
scores

In [None]:
 def print_results(results):
     """Print the best parameters and the CV score of every searched candidate.

     Parameters
     ----------
     results : object
         Fitted search object exposing ``best_params_`` and ``cv_results_``;
         the latter must provide the keys 'mean_test_score', 'std_test_score'
         and 'params'.

     Output is one line for the best parameters, then one line per candidate
     with its mean score and a spread of two standard deviations, both rounded
     to three decimals.
     """
     print(f'BEST PARAMS: {results.best_params_}')

     cv_results = results.cv_results_
     candidates = zip(
         cv_results['mean_test_score'],
         cv_results['std_test_score'],
         cv_results['params'],
     )
     for mean, std, params in candidates:
         # Was `{std + 2, 3}`, which formatted the tuple (std + 2, 3) instead
         # of rounding twice the standard deviation.
         print(f'{round(mean, 3)} (+/-{round(std * 2, 3)}) for {params}')

In [None]:
# Exhaustive search over forest size and tree depth, each combination scored
# with 5-fold cross-validation on the training split.
parameters = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 10, 20, None],
}
rf = RandomForestClassifier()

cv = GridSearchCV(estimator=rf, param_grid=parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

In [None]:
# Three candidate forests, fitted on the training split in this order.
rf1 = RandomForestClassifier(n_estimators=50, max_depth=2)
rf2 = RandomForestClassifier(n_estimators=100, max_depth=20)
rf3 = RandomForestClassifier(n_estimators=103, max_depth=10)

candidates = (rf1, rf2, rf3)
for candidate in candidates:
    candidate.fit(X_train, y_train)

# Score every candidate on the validation split.
for candidate in candidates:
    y_pred = candidate.predict(X_val)
    accuracy, precision, recall = (
        round(metric(y_val, y_pred), 3)
        for metric in (accuracy_score, precision_score, recall_score)
    )
    print(f'MAX DEPTH: {candidate.max_depth}, # of EST: {candidate.n_estimators}, A: {accuracy}, P: {precision}, R:{recall}')
    

In [None]:
# Score `rf3` on the test split with the same three metrics.
y_pred = rf3.predict(X_test)
accuracy, precision, recall = (
    round(metric(y_test, y_pred), 3)
    for metric in (accuracy_score, precision_score, recall_score)
)

print(f'MAX DEPTH: {rf3.max_depth}, # of EST: {rf3.n_estimators}, A: {accuracy}, P: {precision}, R:{recall}')


In [None]:
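# Disabled experiment: RandomizedSearchCV tuning of the random forest, kept inside a string literal so it does not run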
'''
from sklearn.model_selection import RandomizedSearchCV
MOD = RandomForestClassifier() 
#Implemente RandomSearchCV
m_params = { 
            "RF": {
                    "n_estimators" : np.linspace(2, 500, 500, dtype = "int"),  
                    "max_depth": [5, 20, 30, None], 
                    "min_samples_split": np.linspace(2, 50, 50, dtype = "int"),  
                    "max_features": ["sqrt", "log2",10, 20, None],
                    "oob_score": [True],
                    "bootstrap": [True]
                    },
            }
scoreFunction = {"recall": "recall", "precision": "precision"}
random_search = RandomizedSearchCV(MOD,
                                   param_distributions = m_params['RF'], 
                                   n_iter = 20,
                                   scoring = scoreFunction,               
                                   refit = "recall",
                                   return_train_score = True,
                                   random_state = 42,
                                   cv = 5)
                            #       verbose = 1 + int(log)) 

#trains and optimizes the model
random_search.fit(X_train, y_train)

#recover the best model
MOD = random_search.best_estimator_
'''