In [49]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score

In [27]:
df = pd.read_csv('./data/fake_job_postings.csv')

In [28]:
# Drop columns that are not useful

df = df.drop(['job_id', 'department', 'salary_range'], axis=1)

In [29]:
# Standarization categorical nan values use mode

for column in df.select_dtypes(include=['object']).columns:
    df[column].fillna(df[column].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)


In [30]:
# Encoding categorical columns

categorical_columns = ['title', 'location', 'company_profile', 'description', 'requirements', 'benefits', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']

X = df.drop('fraudulent', axis=1)
y = df['fraudulent']

encoder = OneHotEncoder()

X_encoded = encoder.fit_transform(X)

In [31]:
# Splitting data training and testing

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [42]:
modelRFC = RandomForestClassifier(n_estimators=100, random_state=42)

modelRFC.fit(X_train, y_train)

y_predRFC = modelRFC.predict(X_test)

In [50]:
accuracyRFC = accuracy_score(y_test, y_predRFC)
f1_scoreRFC = f1_score(y_test, y_predRFC)

## Gradient Boosting

In [43]:
modelGBC = GradientBoostingClassifier(n_estimators=100, random_state=42)

modelGBC.fit(X_train, y_train)

y_predGBC = modelGBC.predict(X_test)

In [51]:
accuracyGBC = accuracy_score(y_test, y_predGBC)
f1_scoreGBC = f1_score(y_test, y_predGBC)

## Stochastic Gradient Descent

In [44]:
modelSGD = SGDClassifier(random_state=42)

modelSGD.fit(X_train, y_train)

y_predSGD = modelSGD.predict(X_test)

In [52]:
accuracySGD = accuracy_score(y_test, y_predSGD)
f1_scoreSGD = f1_score(y_test, y_predSGD)

## Perbandingan ketiga algoritma

In [53]:
print("Akurasi RFC: ", accuracyRFC)
print("Akurasi GBC: ", accuracyGBC)
print("Akurasi SGD: ", accuracySGD)

print("F1 Score RFC: ", f1_scoreRFC)
print("F1 Score GBC: ", f1_scoreGBC)
print("F1 Score SGD: ", f1_scoreSGD)

Akurasi RFC:  0.9787472035794184
Akurasi GBC:  0.979586129753915
Akurasi SGD:  0.9798657718120806
F1 Score RFC:  0.7361111111111112
F1 Score GBC:  0.7491408934707904
F1 Score SGD:  0.7583892617449665
