# BT4012 Without Text Analysis

### Setting up the environment

In [6]:
# libraries importing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [7]:
# setting up
# import packages here
# from google.colab import drive
# drive.mount('/content/drive')
# url = '/content/drive/MyDrive/fake_job_postings.csv'

# import from github repo
url = 'https://raw.githubusercontent.com/LordZhiHao/BT4012_Fraud_Analytics_Project/main/fake_job_postings.csv'

# read data
data = pd.read_csv(url)
df = data.copy()
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


### Handling Null Values

EDA plz do refer other scripts

Will dive straight into handling the null values and outliers

In [8]:
# keep track of what columns to keep and drop
cols_to_keep = []
cols_to_drop = []

In [9]:
# check for NA values
df.isna().any()

job_id                 False
title                  False
location                True
department              True
salary_range            True
company_profile         True
description             True
requirements            True
benefits                True
telecommuting          False
has_company_logo       False
has_questions          False
employment_type         True
required_experience     True
required_education      True
industry                True
function                True
fraudulent             False
dtype: bool

In [10]:
# check for num of na values
df.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [11]:
# handle location and description first since easier

# handle location - split into nation and city and fillna with unknown
df['location'] = df['location'].fillna('NA, Unknown')
df['country'] = df['location'].apply(lambda x : x.strip()[:2])
df['city'] = df['location'].apply(lambda x : x.split(',')[-1])

# a lot of sparse values noted for countries, may lead to unexpected results - to handle - keep countries with counts >10 only and put unknown for the rest
ls_of_countries = [country if df[df['country']==country]['country'].count() >= 10 else 'NA' for country in df['country'].unique()]
df['country'] = df['country'].apply(lambda x : x if x in ls_of_countries else 'NA')

ls_of_cities = [city if df[df['city']==city]['city'].count() >= 10 else 'Unknown' for city in df['city'].unique()]
df['city'] = df['city'].apply(lambda x : x if x in ls_of_cities else 'Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('country')
cols_to_keep.append('city')
cols_to_drop.append('location')

In [12]:
# handle description - convert to binary - with or without
df['has_description'] = df['description'].apply(lambda x: 0 if pd.isna(x) else 1)

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_description')
cols_to_drop.append('description')

In [13]:
# handle department - q a lot of departments - keep only those with high count and take everything else as unknown
ls_of_dept = [dept if df[df['department']==dept]['department'].count() >= 10 else 'Unknown' for dept in df['department'].unique()]
df['has_department'] = df['department'].apply(lambda x : x if x in ls_of_dept else 'Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_department')
cols_to_drop.append('department')

In [14]:
# handle salary_range - q a lot of ranges - keep as binary - has_salary or not
df['has_salary'] = df['salary_range'].apply(lambda x : 0 if pd.isna(x) else 1)

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_salary')
cols_to_drop.append('salary_range')

In [15]:
cols_to_drop # handled these colummns

['location', 'description', 'department', 'salary_range']

In [16]:
# handle company_profile, requirements and benefits next - for simplicity - keep as binary - has or not
df['has_company_profile'] = df['company_profile'].apply(lambda x : 0 if pd.isna(x) else 1)
df['has_requirements'] = df['requirements'].apply(lambda x : 0 if pd.isna(x) else 1)
df['has_benefits'] = df['benefits'].apply(lambda x : 0 if pd.isna(x) else 1)

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('has_company_profile')
cols_to_keep.append('has_requirements')
cols_to_keep.append('has_benefits')
cols_to_drop.append('company_profile')
cols_to_drop.append('requirements')
cols_to_drop.append('benefits')

In [17]:
# handle employment_type - keep na values as unknown
df['employment_type'] = df['employment_type'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('employment_type')

In [18]:
df['required_experience'].unique()

array(['Internship', 'Not Applicable', nan, 'Mid-Senior level',
       'Associate', 'Entry level', 'Executive', 'Director'], dtype=object)

In [19]:
# handle required_experience - keep na values as unknown
df['required_experience'] = df['required_experience'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('required_experience')

In [20]:
df['required_education'].unique()

array([nan, "Bachelor's Degree", "Master's Degree",
       'High School or equivalent', 'Unspecified',
       'Some College Coursework Completed', 'Vocational', 'Certification',
       'Associate Degree', 'Professional', 'Doctorate',
       'Some High School Coursework', 'Vocational - Degree',
       'Vocational - HS Diploma'], dtype=object)

In [21]:
# handle required_education - keep na values as unknown
df['required_education'] = df['required_education'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('required_education')

In [22]:
cols_to_keep

['country',
 'city',
 'has_description',
 'has_department',
 'has_salary',
 'has_company_profile',
 'has_requirements',
 'has_benefits',
 'employment_type',
 'required_experience',
 'required_education']

In [23]:
# handle industry - keep the industries - fillna with unknown
df['industry'] = df['industry'].fillna('Unknown')

# a lot of sparse values noted for industries, may lead to unexpected results - to handle - keep countries with counts >10 only and put unknown for the rest
ls_of_industries = [industry if df[df['industry']==industry]['industry'].count() >= 10 else 'NA' for industry in df['industry'].unique()]
df['industry'] = df['industry'].apply(lambda x : x if x in ls_of_industries else 'Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('industry')

In [24]:
# handle function - keep the functions - fillna with unknown
df['function'] = df['function'].fillna('Unknown')

# keep track in cols_to_keep and cols_to_drop
cols_to_keep.append('function')

In [25]:
cols_to_keep

['country',
 'city',
 'has_description',
 'has_department',
 'has_salary',
 'has_company_profile',
 'has_requirements',
 'has_benefits',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function']

In [26]:
cols_to_drop

['location',
 'description',
 'department',
 'salary_range',
 'company_profile',
 'requirements',
 'benefits']

## Next we look into the non null columns to extract what we wanna keep

In [27]:
# drop the cols inside cols_to_drop as it is not useful anymore
df = df.drop(cols_to_drop, axis=1)

In [28]:
cols_to_drop = []

In [29]:
df.isna().sum()

job_id                 0
title                  0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
city                   0
has_description        0
has_department         0
has_salary             0
has_company_profile    0
has_requirements       0
has_benefits           0
dtype: int64

In [30]:
# handle job_id - it is unique for all - so not much value - remove the col
df = df.drop('job_id', axis=1)

In [31]:
# handle title - a bit too many distinct roles - may lead to overfitting issues - would remove the cols
df = df.drop('title', axis=1)

In [32]:
# handle telecommuting, has_company_logo, has_questions - all are binary - keep as features
# keep track in cols_to_keep
cols_to_keep.append('telecommuting')
cols_to_keep.append('has_company_logo')
cols_to_keep.append('has_questions')

And with that all the columns are processed accordingly

In [33]:
cols_to_keep

['country',
 'city',
 'has_description',
 'has_department',
 'has_salary',
 'has_company_profile',
 'has_requirements',
 'has_benefits',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function',
 'telecommuting',
 'has_company_logo',
 'has_questions']

## Text Processing - including the text information into the model

In [37]:
# import relevant packages
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...


In [38]:
# Defining a Function to clean up the text information
def text_preprocess(ds: pd.Series) -> pd.Series:
    for m in range(len(ds)):

        main_words = re.sub('[^a-zA-Z]', ' ', ds[m])                                      # Retain only alphabets
        main_words = (main_words.lower()).split()
        main_words = [w for w in main_words if not w in set(stopwords.words('english'))]  # Remove stopwords

        lem = WordNetLemmatizer()
        main_words = [lem.lemmatize(w) for w in main_words if len(w) > 1]                 # Group different forms of the same word

        main_words = ' '.join(main_words)
        ds[m] = main_words

    return ds

In [39]:
# extract text features only - title, company_profile, description, requirements, benefits
df['full_text'] = data['title'] + data['company_profile'] + data['description'] + data['requirements'] + data['benefits']
df['full_text'] = df['full_text'].fillna(' ')

# preprocess the text feature
df['full_text'] = text_preprocess(df['full_text'])
text_feature = df['full_text']

# Building a TF IDF matrix out of the text information
td = TfidfVectorizer(max_features = 1000)
text_matrix = td.fit_transform(text_feature).toarray()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds[m] = main_words
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds[m] = main_words
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds[m] = main_words
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ds[m] = main_words
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the do

In [40]:
df['full_text']

0                                                         
1        customer service cloud video production second...
2                                                         
3        account executive washington dcour passion imp...
4        bill review managerspotsource solution llc glo...
                               ...                        
17875    account director distribution vend looking awe...
17876    payroll accountantweblinc commerce platform se...
17877                                                     
17878                                                     
17879                                                     
Name: full_text, Length: 17880, dtype: object

## Train-test split and preprocessing before inputting into model

In [41]:
# seperate out the correctly formatted cols and those which still needs processing through ohe
binary_cols = ['has_description', 'has_salary', 'has_company_profile', 'has_requirements', 'has_benefits', 'telecommuting', 'has_company_logo', 'has_questions']
ohe_needed_cols = []

for cols in cols_to_keep:
  if cols not in binary_cols:
    ohe_needed_cols.append(cols)

In [42]:
# select the fraudulent column as target, rest as features
features = df.drop('fraudulent', axis=1)
target_var = df['fraudulent']

In [43]:
# drop full_text
features = features.drop('full_text', axis=1)

In [44]:
# do ohe for ohe_needed_cols
features_encoded = pd.get_dummies(features, columns=ohe_needed_cols)

In [45]:
features_encoded # check the colummns if its in correct format

Unnamed: 0,telecommuting,has_company_logo,has_questions,has_description,has_salary,has_company_profile,has_requirements,has_benefits,country_AE,country_AT,...,function_Purchasing,function_Quality Assurance,function_Research,function_Sales,function_Science,function_Strategy/Planning,function_Supply Chain,function_Training,function_Unknown,function_Writing/Editing
0,0,1,0,1,0,1,1,0,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,1,0,1,0,1,1,1,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,1,0,1,0,1,1,0,False,False,...,False,False,False,False,False,False,False,False,True,False
3,0,1,0,1,0,1,1,1,False,False,...,False,False,False,True,False,False,False,False,False,False
4,0,1,1,1,0,1,1,1,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,0,1,1,1,0,1,1,1,False,False,...,False,False,False,True,False,False,False,False,False,False
17876,0,1,1,1,0,1,1,1,False,False,...,False,False,False,False,False,False,False,False,False,False
17877,0,0,0,1,0,1,1,0,False,False,...,False,False,False,False,False,False,False,False,True,False
17878,0,0,1,1,0,0,1,1,False,False,...,False,False,False,False,False,False,False,False,False,False


In [46]:
# # train test split
# from sklearn.model_selection import train_test_split

# xtrain, xtest, ytrain, ytest = train_test_split(features_encoded, target_var, random_state=0, test_size=0.2)

In [47]:
# train test split v2 - with text features included
from sklearn.model_selection import train_test_split

# concat text features and ohe encoded features
text_df = pd.DataFrame(text_matrix)
features_concated = pd.concat([features_encoded, text_df], axis=1)

# train test split
xtrain, xtest, ytrain, ytest = train_test_split(features_concated, target_var, random_state=0, test_size=0.2)

## Model training - Random Forest, XGBoost, Logistic Regression, Support vector machines

In [48]:
# model packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
from xgboost import XGBClassifier

In [49]:
# model training - Random Forest
# train model
rfc = RandomForestClassifier(random_state=0)
xtrain.columns = xtrain.columns.astype(str) # keep column name
rfc.fit(xtrain, ytrain)

# predictions
xtest.columns = xtest.columns.astype(str) # keep column name
rfc_ypred = rfc.predict(xtest)

# Evaluate the Random Forest model
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(ytest, rfc_ypred))
print("\nClassification Report:\n", classification_report(ytest, rfc_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, rfc_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, rfc_ypred)
precision = precision_score(ytest, rfc_ypred)
recall = recall_score(ytest, rfc_ypred)
f1 = f1_score(ytest, rfc_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Random Forest Classifier:
Accuracy: 0.9823825503355704

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3423
           1       0.94      0.63      0.75       153

    accuracy                           0.98      3576
   macro avg       0.96      0.81      0.87      3576
weighted avg       0.98      0.98      0.98      3576

Confusion Matrix:
[[3417    6]
 [  57   96]]
Accuracy: 0.9823825503355704
Precision: 0.9411764705882353
Recall: 0.6274509803921569
F1 Score: 0.7529411764705882


In [50]:
# model training - logistic regression
# train model
lr = LogisticRegression(random_state=0)
xtrain.columns = xtrain.columns.astype(str) # keep column name
lr.fit(xtrain, ytrain)

# predictions
xtest.columns = xtest.columns.astype(str) # keep column name
lr_ypred = lr.predict(xtest)

# Evaluate the logistic regression model
print("Logistic Regression:")
print("Accuracy:", accuracy_score(ytest, lr_ypred))
print("\nClassification Report:\n", classification_report(ytest, lr_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, lr_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, lr_ypred)
precision = precision_score(ytest, lr_ypred)
recall = recall_score(ytest, lr_ypred)
f1 = f1_score(ytest, lr_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression:
Accuracy: 0.9714765100671141

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3423
           1       0.80      0.44      0.57       153

    accuracy                           0.97      3576
   macro avg       0.89      0.72      0.78      3576
weighted avg       0.97      0.97      0.97      3576

Confusion Matrix:
[[3406   17]
 [  85   68]]
Accuracy: 0.9714765100671141
Precision: 0.8
Recall: 0.4444444444444444
F1 Score: 0.5714285714285714


In [51]:
# model training - xgboost
# Create an instance of XGBClassifier
xgb_classifier = XGBClassifier(random_state=0)
xtrain.columns = xtrain.columns.astype(str) # keep column name
xgb_classifier.fit(xtrain, ytrain)

# Make predictions on the test set
xtest.columns = xtest.columns.astype(str) # keep column name
xgb_ypred = xgb_classifier.predict(xtest)

# Evaluate the logistic regression model
print("XGBoost:")
print("Accuracy:", accuracy_score(ytest, xgb_ypred))
print("\nClassification Report:\n", classification_report(ytest, xgb_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, xgb_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, xgb_ypred)
precision = precision_score(ytest, xgb_ypred)
recall = recall_score(ytest, xgb_ypred)
f1 = f1_score(ytest, xgb_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

XGBoost:
Accuracy: 0.9812639821029083

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3423
           1       0.96      0.59      0.73       153

    accuracy                           0.98      3576
   macro avg       0.97      0.79      0.86      3576
weighted avg       0.98      0.98      0.98      3576

Confusion Matrix:
[[3419    4]
 [  63   90]]
Accuracy: 0.9812639821029083
Precision: 0.9574468085106383
Recall: 0.5882352941176471
F1 Score: 0.7287449392712552


In [None]:
# perform model training and model comparisons
list_of_models = [LogisticRegression() , KNeighborsClassifier() , 
                  DecisionTreeClassifier() ,ExtraTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier(), XGBClassifier()]
classification_report = pd.DataFrame(columns=['Accuracy','F1_score','Precision','Recall'])

for model in list_of_models :
    model = model.fit(xtrain_resampled, ytrain_resampled)
    for i in range(2) :
        if i == 0 :
            to_pred = xtrain_resampled
            pred = ytrain_resampled
            title = 'Train'
            
        else :
            to_pred = xtest
            pred = ytest
            title = 'Test'
        y_pred = model.predict(to_pred)
        acc = round(accuracy_score(pred , y_pred)*100)
        f1 = round(f1_score(pred , y_pred)*100)
        prec = round(precision_score(pred , y_pred)*100)
        recall = round(recall_score(pred , y_pred)*100)
        d = pd.DataFrame(data=np.array([acc,f1,prec,recall]).reshape(1,4) 
                     , columns=['Accuracy' , 'F1_score' , 'Precision' , 'Recall'])  
        classification_report = pd.concat([classification_report , d])
        classification_report.rename( index= { 0 :'{} _ {} Details'.format(model , title) } ,inplace=True )
pd.options.display.max_rows = 15
classification_report

## Use SMOTE for oversampling

In [52]:
target_var.value_counts()

fraudulent
0    17014
1      866
Name: count, dtype: int64

In [53]:
# looking at the target distribution, lets resample to make it around 60:40 ratio
majority_class_samples = 13591
minority_class_samples = (majority_class_samples / 0.6) * 0.4
resampling_strategy = {0: majority_class_samples, 1: minority_class_samples}

In [None]:
# !pip install imblearn # install package if the package is not available

In [56]:
from imblearn.over_sampling import SMOTENC # smotenc is used as it can handle categorical variable
from imblearn.over_sampling import SMOTE

# oversampling
smote_nc = SMOTENC(categorical_features=[x for x in range(544)], random_state=0) # [x for x in range(544)]
xtrain_resampled, ytrain_resampled = smote_nc.fit_resample(xtrain, ytrain)

# smote = SMOTE(random_state=0)
# xtrain_resampled, ytrain_resampled = smote.fit_resample(xtrain, ytrain)

In [57]:
# model training - Random Forest
# train model
rfc = RandomForestClassifier(random_state=0)
rfc.fit(xtrain_resampled, ytrain_resampled)

# predictions
rfc_ypred = rfc.predict(xtest)

# Evaluate the Random Forest model
print("Random Forest:")
print("Accuracy:", accuracy_score(ytest, rfc_ypred))
print("\nClassification Report:\n", classification_report(ytest, rfc_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, rfc_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, rfc_ypred)
precision = precision_score(ytest, rfc_ypred)
recall = recall_score(ytest, rfc_ypred)
f1 = f1_score(ytest, rfc_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Random Forest:
Accuracy: 0.9812639821029083

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3423
           1       0.81      0.73      0.77       153

    accuracy                           0.98      3576
   macro avg       0.90      0.86      0.88      3576
weighted avg       0.98      0.98      0.98      3576

Confusion Matrix:
[[3397   26]
 [  41  112]]
Accuracy: 0.9812639821029083
Precision: 0.8115942028985508
Recall: 0.7320261437908496
F1 Score: 0.7697594501718212


In [58]:
# model training - Random Forest
# train model
lr = LogisticRegression(random_state=0)
lr.fit(xtrain_resampled, ytrain_resampled)

# predictions
lr_ypred = lr.predict(xtest)

# Evaluate the Random Forest model
print("Logistic Regression:")
print("Accuracy:", accuracy_score(ytest, lr_ypred))
print("\nClassification Report:\n", classification_report(ytest, lr_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, lr_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, lr_ypred)
precision = precision_score(ytest, lr_ypred)
recall = recall_score(ytest, lr_ypred)
f1 = f1_score(ytest, lr_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression:
Accuracy: 0.9460290827740492

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97      3423
           1       0.43      0.78      0.55       153

    accuracy                           0.95      3576
   macro avg       0.71      0.87      0.76      3576
weighted avg       0.97      0.95      0.95      3576

Confusion Matrix:
[[3263  160]
 [  33  120]]
Accuracy: 0.9460290827740492
Precision: 0.42857142857142855
Recall: 0.7843137254901961
F1 Score: 0.5542725173210161


In [59]:
# model training - xgboost
# Create an instance of XGBClassifier
xgb_classifier = XGBClassifier(random_state=0)
xtrain.columns = xtrain.columns.astype(str) # keep column name
xgb_classifier.fit(xtrain_resampled, ytrain_resampled)

# Make predictions on the test set
xtest.columns = xtest.columns.astype(str) # keep column name
xgb_ypred = xgb_classifier.predict(xtest)

# Evaluate the xgboost model
print("XGBoost:")
print("Accuracy:", accuracy_score(ytest, xgb_ypred))
print("\nClassification Report:\n", classification_report(ytest, xgb_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, xgb_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, xgb_ypred)
precision = precision_score(ytest, xgb_ypred)
recall = recall_score(ytest, xgb_ypred)
f1 = f1_score(ytest, xgb_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

XGBoost:
Accuracy: 0.9714765100671141

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99      3423
           1       0.65      0.74      0.69       153

    accuracy                           0.97      3576
   macro avg       0.82      0.86      0.84      3576
weighted avg       0.97      0.97      0.97      3576

Confusion Matrix:
[[3361   62]
 [  40  113]]
Accuracy: 0.9714765100671141
Precision: 0.6457142857142857
Recall: 0.738562091503268
F1 Score: 0.6890243902439025


In [None]:
# perform model training and model comparisons
list_of_models = [LogisticRegression() , KNeighborsClassifier() , 
                  DecisionTreeClassifier() ,ExtraTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier(), XGBClassifier()]
classification_report = pd.DataFrame(columns=['Accuracy','F1_score','Precision','Recall'])

for model in list_of_models :
    model = model.fit(xtrain_resampled, ytrain_resampled)
    for i in range(2) :
        if i == 0 :
            to_pred = xtrain_resampled
            pred = ytrain_resampled
            title = 'Train'
            
        else :
            to_pred = xtest
            pred = ytest
            title = 'Test'
        y_pred = model.predict(to_pred)
        acc = round(accuracy_score(pred , y_pred)*100)
        f1 = round(f1_score(pred , y_pred)*100)
        prec = round(precision_score(pred , y_pred)*100)
        recall = round(recall_score(pred , y_pred)*100)
        d = pd.DataFrame(data=np.array([acc,f1,prec,recall]).reshape(1,4) 
                     , columns=['Accuracy' , 'F1_score' , 'Precision' , 'Recall'])  
        classification_report = pd.concat([classification_report , d])
        classification_report.rename( index= { 0 :'{} _ {} Details'.format(model , title) } ,inplace=True )
pd.options.display.max_rows = 15
classification_report

## Search for best params using gridsearch

In [None]:
# for random forest - on the original data
rfc_grid = RandomForestClassifier()

# Define the grid of parameters to search through
param_grid = {
    'n_estimators': [100, 300, 500],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]     # Minimum number of samples required at each leaf node
}

# Create GridSearchCV instance
grid_search = GridSearchCV(estimator=rfc_grid, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform the grid search on your data
grid_search.fit(xtrain_resampled, ytrain_resampled)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

In [None]:
# Extracting the best model from the grid search
best_rf_model = grid_search.best_estimator_

# predictions
rfc_grid_ypred = best_rf_model.predict(xtest)

# Evaluate the Random Forest model
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(ytest, rfc_grid_ypred))
print("\nClassification Report:\n", classification_report(ytest, rfc_grid_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, rfc_grid_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, rfc_grid_ypred)
precision = precision_score(ytest, rfc_grid_ypred)
recall = recall_score(ytest, rfc_grid_ypred)
f1 = f1_score(ytest, rfc_grid_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Create the Logistic Regression classifier
lr_grid = LogisticRegression()

# Define the grid of parameters to search through
param_grid = {
    'C': [0.001, 0.1, 1, 100],  # Regularization parameter
    'penalty': ['l1', 'l2'],  # Penalty (L1 or L2 regularization)
    'solver': ['liblinear', 'saga', 'newton-cholesky']  # Algorithm to use in the optimization problem
}

# Create GridSearchCV instance
grid_search = GridSearchCV(estimator=lr_grid, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform the grid search on your training data
grid_search.fit(xtrain, ytrain)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

In [None]:
# Use the best model to predict on the test set
best_logreg_model = grid_search.best_estimator_

# predictions
lr_grid_ypred = best_logreg_model.predict(xtest)

# Evaluate the Logistic Regression model
print("Logistic Regression:")
print("Accuracy:", accuracy_score(ytest, lr_grid_ypred))
print("\nClassification Report:\n", classification_report(ytest, lr_grid_ypred))

# Assuming y_true and y_pred are your true labels and predicted labels
conf_matrix = confusion_matrix(ytest, lr_grid_ypred)

# Calculate evaluation metrics
accuracy = accuracy_score(ytest, lr_grid_ypred)
precision = precision_score(ytest, lr_grid_ypred)
recall = recall_score(ytest, lr_grid_ypred)
f1 = f1_score(ytest, lr_grid_ypred)

# Display the confusion matrix and metrics
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)