**Import Library**

In [2]:
import numpy as np 
import pandas as pd
from pymongo import MongoClient
import warnings
warnings.filterwarnings('ignore')

In [3]:
# ---Memuat DataFrame dari CSV---
df = pd.read_csv(r"C:\Users\Pongo\Project\balanced_data.csv")
df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,27171,Analytics,region_22,Bachelor's,m,other,3,28,3,5,0,0,85,1
1,72847,Technology,region_2,Master's & above,f,other,2,45,3,7,0,0,81,0
2,48108,Procurement,region_2,Bachelor's,f,sourcing,1,58,4,25,1,0,67,1
3,57102,Technology,region_13,Bachelor's,f,other,1,22,3,3,1,0,83,1
4,170,Sales & Marketing,region_31,Bachelor's,m,other,1,29,3,6,1,0,94,1


**Analysis Data**

In [4]:
# Shape
print("Jumlah baris dan kolom:", df.shape)

# Tipe data per kolom
print("\nTipe data:")
print(df.dtypes)

# Statistik deskriptif
print("\nStatistik deskriptif:")
print(df.describe(include='all'))

#Cek duplikasi
print("\nJumlah duplikat:", df.duplicated().sum())

Jumlah baris dan kolom: (8464, 14)

Tipe data:
employee_id              int64
department              object
region                  object
education               object
gender                  object
recruitment_channel     object
no_of_trainings          int64
age                      int64
previous_year_rating     int64
length_of_service        int64
KPIs_met >80%            int64
awards_won?              int64
avg_training_score       int64
is_promoted              int64
dtype: object

Statistik deskriptif:
         employee_id         department    region   education gender  \
count    8464.000000               8464      8464        8464   8464   
unique           NaN                  9        34           3      2   
top              NaN  Sales & Marketing  region_2  Bachelor's      m   
freq             NaN               2323      1826        5647   5833   
mean    39396.520912                NaN       NaN         NaN    NaN   
std     22628.826286                NaN       NaN 

In [5]:
len(df)

8464

In [7]:

print("\nMissing values:")
print(df.isnull().sum())


Missing values:
employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64


In [8]:
df = df.dropna()
len(df)

8464

In [9]:
# ---Mengecek Missing Values---
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64


In [10]:
# ---Mengubah Tipe Data---
df['previous_year_rating'] = df['previous_year_rating'].astype(np.int64)

In [11]:
# ---Menghapus Duplikasi---
df.drop_duplicates(inplace=True)
print("\nJumlah duplikat:", df.duplicated().sum())


Jumlah duplikat: 0


In [12]:
len(df)

8464

In [13]:
df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,27171,Analytics,region_22,Bachelor's,m,other,3,28,3,5,0,0,85,1
1,72847,Technology,region_2,Master's & above,f,other,2,45,3,7,0,0,81,0
2,48108,Procurement,region_2,Bachelor's,f,sourcing,1,58,4,25,1,0,67,1
3,57102,Technology,region_13,Bachelor's,f,other,1,22,3,3,1,0,83,1
4,170,Sales & Marketing,region_31,Bachelor's,m,other,1,29,3,6,1,0,94,1


In [14]:
# ---Menghapus Kolom yang Tidak Diperlukan---
df.drop(["employee_id", "region"], axis=1, inplace=True)

In [15]:
# ---Mengubah String menjadi Kategori---
df.department = df.department.map({"Sales & Marketing":0,
                 "Operations":1,
                 "Technology":2,
                 "Analytics":3,
                 "R&D":4,
                 "Procurement":5,
                 "Finance":6,
                 "HR":7,
                 "Legal":8})
df.education = df.education.map({"Master's & above":0,
                                "Bachelor's":1,
                                "Below Secondary":2})
df.gender = df.gender.map({"f":0,
                          "m":1})
df.recruitment_channel = df.recruitment_channel.map({"sourcing":0,
                                                    "other":1,
                                                    "referred":2})
df.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,3,1,1,1,3,28,3,5,0,0,85,1
1,2,0,0,1,2,45,3,7,0,0,81,0
2,5,1,0,0,1,58,4,25,1,0,67,1
3,2,1,0,1,1,22,3,3,1,0,83,1
4,0,1,1,1,1,29,3,6,1,0,94,1


In [16]:
# ---Menyeimbangkan Data---
promoted_df = df[df['is_promoted'] == 1]
not_promoted_df = df[df['is_promoted'] == 0]

# Undersampling data tidak dipromosi sebanyak jumlah data dipromosi
not_promoted_sampled = not_promoted_df.sample(n=len(promoted_df), random_state=42)

# Gabungkan jadi balanced dataset
balanced_df = pd.concat([promoted_df, not_promoted_sampled])

In [17]:
# ---Menyimpan Data ke MongoDB---
client = MongoClient("mongodb://localhost:27017/")
db = client["staff_db"]
collection = db["cleaned_data"]

data_dict = balanced_df.to_dict("records")
collection.insert_many(data_dict)

InsertManyResult([ObjectId('68116d835da60983685a6fa4'), ObjectId('68116d835da60983685a6fa5'), ObjectId('68116d835da60983685a6fa6'), ObjectId('68116d835da60983685a6fa7'), ObjectId('68116d835da60983685a6fa8'), ObjectId('68116d835da60983685a6fa9'), ObjectId('68116d835da60983685a6faa'), ObjectId('68116d835da60983685a6fab'), ObjectId('68116d835da60983685a6fac'), ObjectId('68116d835da60983685a6fad'), ObjectId('68116d835da60983685a6fae'), ObjectId('68116d835da60983685a6faf'), ObjectId('68116d835da60983685a6fb0'), ObjectId('68116d835da60983685a6fb1'), ObjectId('68116d835da60983685a6fb2'), ObjectId('68116d835da60983685a6fb3'), ObjectId('68116d835da60983685a6fb4'), ObjectId('68116d835da60983685a6fb5'), ObjectId('68116d835da60983685a6fb6'), ObjectId('68116d835da60983685a6fb7'), ObjectId('68116d835da60983685a6fb8'), ObjectId('68116d835da60983685a6fb9'), ObjectId('68116d835da60983685a6fba'), ObjectId('68116d835da60983685a6fbb'), ObjectId('68116d835da60983685a6fbc'), ObjectId('68116d835da60983685a6f