In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
import matplotlib.pyplot as plt

: 

In [None]:
# Load the cleaned job postings into the frame every later cell works from.
df = pd.read_csv('jobstreet_jobs_cleaned_with_category.csv')

# A bare `df.describe()` in the middle of a cell is evaluated and thrown away
# (only a cell's last expression is echoed), so emit the summary explicitly.
print(df.describe())
df.info()

# One histogram per raw salary bound; both figures share the same layout.
for salary_column in ('Gaji Min', 'Gaji Max'):
    plt.figure(figsize=(10, 6))
    df[salary_column].hist(bins=50)
    plt.title(f'{salary_column} Distribution')
    plt.xlabel(salary_column)
    plt.ylabel('Count')

In [None]:
# Parse both raw salary bounds as numbers; errors='coerce' turns anything
# unparseable into NaN instead of raising.
df['Gaji_Min'] = pd.to_numeric(df['Gaji Min'], errors='coerce')
df['Gaji_Max'] = pd.to_numeric(df['Gaji Max'], errors='coerce')

# Regression target: row-wise mean of the two numeric bounds.
df['Gaji_Rata2'] = df[['Gaji_Min', 'Gaji_Max']].mean(axis='columns')

# Keep the model inputs, the target, and the Title/Link columns, then discard
# every row for which no target could be computed.
selected_columns = ['Kategori Lowongan', 'Umur', 'Pendidikan', 'Title', 'Link', 'Gaji_Rata2']
data = df[selected_columns].dropna(subset=['Gaji_Rata2'])

# Missing 'Umur' values are replaced by the column median.
imputer_umur = SimpleImputer(strategy='median')
umur_filled = imputer_umur.fit_transform(data[['Umur']])
data['Umur'] = umur_filled.ravel()

# Missing 'Pendidikan' values are replaced by the most frequent value.
imputer_pend = SimpleImputer(strategy='most_frequent')
pendidikan_filled = imputer_pend.fit_transform(data[['Pendidikan']])
data['Pendidikan'] = pendidikan_filled.ravel()

# Map the two categorical columns to integer codes; the fitted encoders are
# kept as module-level names so later cells can reuse them.
le_kategori = LabelEncoder()
kategori_codes = le_kategori.fit_transform(data['Kategori Lowongan'])
data['Kategori_Encoded'] = kategori_codes

le_pendidikan = LabelEncoder()
pendidikan_codes = le_pendidikan.fit_transform(data['Pendidikan'])
data['Pendidikan_Encoded'] = pendidikan_codes

# Feature matrix and target vector used for training.
feature_columns = ['Kategori_Encoded', 'Umur', 'Pendidikan_Encoded']
X = data[feature_columns]
y = data['Gaji_Rata2']

In [None]:
# Split features and target into train/test parts; test_size=0.2 reserves a
# fifth of the rows for evaluation and the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# All imports this cell needs are grouped first so a missing dependency fails
# before any model has been trained.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb


def _fit_and_score(model):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        model: An unfitted estimator exposing ``fit`` and ``predict``.

    Returns:
        tuple: ``(predictions, r2, rmse)`` where ``predictions`` is the output
        of ``model.predict(X_test)``, ``r2`` is ``r2_score`` and ``rmse`` is
        the square root of ``mean_squared_error``, both against ``y_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    return predictions, r2, rmse


# Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf_pred, rf_r2, rf_rmse = _fit_and_score(rf)

# Linear Regression
lr = LinearRegression()
lr_pred, lr_r2, lr_rmse = _fit_and_score(lr)

# XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_pred, xgb_r2, xgb_rmse = _fit_and_score(xgb_model)

# Show the results
print("Random Forest:    R2 = {:.2f}, RMSE = {:.2f}".format(rf_r2, rf_rmse))
print("Linear Regression: R2 = {:.2f}, RMSE = {:.2f}".format(lr_r2, lr_rmse))
print("XGBoost:          R2 = {:.2f}, RMSE = {:.2f}".format(xgb_r2, xgb_rmse))

# Fitted estimator -> test-set R2. The estimators themselves are the keys so
# that max() hands back the model object directly.
# NOTE(review): the winner is picked on the same test split whose scores are
# printed above, so the reported score of the chosen model is optimistic.
model_scores = {
    rf: rf_r2,
    lr: lr_r2,
    xgb_model: xgb_r2,
}

best_model = max(model_scores, key=model_scores.get)

print(f"Model terbaik: {type(best_model).__name__}")

# Persist the highest-scoring fitted model.
joblib.dump(best_model, 'gaji_prediksi_model.joblib')



In [None]:
# Persist the fitted preprocessing objects next to the model.

# Label encoders for the two categorical columns.
joblib.dump(value=le_kategori, filename='label_encoder_kategori.joblib')
joblib.dump(value=le_pendidikan, filename='label_encoder_pendidikan.joblib')

# Imputers for the 'Umur' and 'Pendidikan' columns.
joblib.dump(value=imputer_umur, filename='imputer_umur.joblib')
joblib.dump(value=imputer_pend, filename='imputer_pendidikan.joblib')

In [None]:
# Write the prepared frame to CSV without the index column
# (presumably read by a Streamlit app, judging by the file name).
data.to_csv(
    path_or_buf='job_data_for_streamlit.csv',
    index=False,
)