In [None]:
# salary_prediction_model.ipynb
# Install required libraries
!pip install pandas numpy scikit-learn catboost streamlit --quiet

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from catboost import CatBoostRegressor
import joblib

# Settings a reader is most likely to tune, gathered in one place.
DATA_PATH = 'Salary_Data.csv'
MODEL_PATH = 'salary_model.pkl'
MIN_SALARY = 10000  # rows with Salary <= this value are dropped as outliers
TEST_SIZE = 0.2
RANDOM_STATE = 42
# Substrings that mark a job title as a management role. The Streamlit app
# recomputes the same flag at prediction time, so keep both lists in sync.
MANAGEMENT_KEYWORDS = ['manager', 'director', 'vp', 'head', 'chief']
CAT_FEATURES = ['Gender', 'Education Level', 'Job Title']
FEATURE_COLUMNS = ['Age', 'Gender', 'Education Level', 'Job Title',
                   'Years of Experience', 'Seniority', 'Management']

# Load and clean data: drop rows with any missing value, then drop outliers.
# .copy() makes the filtered frame independent before new columns are added.
df = pd.read_csv(DATA_PATH)
df = df.dropna()
df = df[df['Salary'] > MIN_SALARY].copy()

# Feature engineering
df['Years of Experience'] = df['Years of Experience'].astype(float)
# Lower-case the titles once and use vectorized substring tests instead of a
# Python-level lambda per row.
title_lower = df['Job Title'].str.lower()
df['Seniority'] = title_lower.str.contains('senior', regex=False).astype(int)
# The keywords are plain words (no regex metacharacters), so joining them with
# '|' yields a pattern that matches when any keyword occurs in the title.
df['Management'] = title_lower.str.contains('|'.join(MANAGEMENT_KEYWORDS)).astype(int)

# Preprocessing: feature matrix and target
X = df[FEATURE_COLUMNS]
y = df['Salary']

# Train-test split (fixed random_state so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Initialize and train model; cat_features names the columns CatBoost should
# treat as categorical.
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.05,
    depth=8,
    cat_features=CAT_FEATURES,
    verbose=0
)
model.fit(X_train, y_train)

# Evaluate model on the held-out split
preds = model.predict(X_test)
r2 = r2_score(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.0f}")

# Save model (the app cell loads this file with joblib.load)
joblib.dump(model, MODEL_PATH)
print("Model saved successfully!")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hR² Score: 0.9857
RMSE: 6239
Model saved successfully!


In [None]:
# app.py
import streamlit as st
import pandas as pd
import joblib

# Load model
@st.cache_resource
def load_model(path='salary_model.pkl'):
    """Load the trained salary model from ``path`` and return it.

    Decorated with ``st.cache_resource`` so the file is deserialized once and
    the same object is reused, instead of being reloaded on every script run.

    NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    only point this at a model file you produced yourself.
    """
    return joblib.load(path)


model = load_model()

# Title
st.title("💰 Employee Salary Predictor")
st.subheader("Predict salaries based on employee characteristics")

# Input form
with st.form("salary_form"):
    age = st.number_input("Age", min_value=18, max_value=70, value=30)
    gender = st.selectbox("Gender", ["Male", "Female"])
    education = st.selectbox("Education Level", ["Bachelor's", "Master's", "PhD"])
    job_title = st.text_input("Job Title", "Software Engineer")
    experience = st.slider("Years of Experience", 0.0, 30.0, 5.0)

    submitted = st.form_submit_button("Predict Salary")

    if submitted:
        # Trim stray whitespace so an accidental leading/trailing space does
        # not turn a known title into a different category value.
        job_title = job_title.strip()

        if not job_title:
            # An empty title carries no information for the model; ask for one
            # rather than returning a prediction for a blank category.
            st.error("Please enter a job title.")
        else:
            # Feature engineering: same keyword rules as the training cell.
            title_lower = job_title.lower()
            seniority = 1 if 'senior' in title_lower else 0
            management = 1 if any(
                word in title_lower
                for word in ['manager', 'director', 'vp', 'head', 'chief']
            ) else 0

            # Single-row frame with the same columns, in the same order, as
            # the training feature matrix.
            input_data = pd.DataFrame({
                'Age': [age],
                'Gender': [gender],
                'Education Level': [education],
                'Job Title': [job_title],
                'Years of Experience': [experience],
                'Seniority': [seniority],
                'Management': [management]
            })

            prediction = model.predict(input_data)[0]
            st.success(f"Predicted Salary: ${prediction:,.2f}")

# Instructions
st.markdown("""
### Instructions
1. Fill in all employee details
2. Click 'Predict Salary'
3. Job titles should be specific (e.g. 'Senior Software Engineer')
""")

2025-07-18 09:10:26.291 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-07-18 09:10:26.316 Session state does not function when running a script without `streamlit run`


DeltaGenerator(_form_data=FormData(form_id='salary_form'))

In [None]:
# Sanity check: score one hand-built employee record directly with the model,
# without going through the Streamlit form.
sample_employee = {
    'Age': 24,
    'Gender': 'Male',
    'Education Level': "Bachelor's",
    'Job Title': 'Software Engineer',
    'Years of Experience': 3.0,
    # Both flags are 0: the title contains neither 'senior' nor any of the
    # management keywords used during feature engineering.
    'Seniority': 0,
    'Management': 0,
}

# One record -> one-row DataFrame with the columns in the order listed above.
input_test = pd.DataFrame([sample_employee])

sample_prediction = model.predict(input_test)
print(sample_prediction)


[121752.46836079]
