In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Step 1: Load the dataset
file_path = 'fake_job_postings.csv'
df = pd.read_csv(file_path)

# Step 2: Preprocess the data
# Assuming 'fraudulent' is the target variable
X = df.drop(columns=['fraudulent'])  # Features
y = df['fraudulent']  # Target variable

# Encode categorical variables (if any)
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    X[column] = label_encoders[column].fit_transform(X[column])

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Apply SMOTE to handle class imbalance on the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 5: Initialize and train the Random Forest Classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train_resampled, y_train_resampled)

# Step 6: Evaluate the model
y_pred = rfc.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3395
           1       0.89      0.86      0.87       181

    accuracy                           0.99      3576
   macro avg       0.94      0.93      0.93      3576
weighted avg       0.99      0.99      0.99      3576

