In [6]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

# Register a dark Plotly template styled after the slide deck and make it the
# default for every figure created afterwards.
pio.templates["custom_dark"] = go.layout.Template(
    layout=go.Layout(
        paper_bgcolor="#16181d",  # deep black/blue canvas around the plot
        plot_bgcolor="rgba(10,18,30,0.95)",
        font={"family": "Montserrat, Arial, sans-serif", "color": "#F8F8FF"},
        title={"font": {"size": 24, "color": "#F8F8FF"}},
        # Both axes use the same muted grid, zero-line and axis-line colours.
        xaxis={"gridcolor": "#22232a", "zerolinecolor": "#444", "linecolor": "#444"},
        yaxis={"gridcolor": "#22232a", "zerolinecolor": "#444", "linecolor": "#444"},
        margin={"l": 60, "r": 30, "t": 60, "b": 60},
    )
)
pio.templates.default = "custom_dark"

# Load and clean data
# Written as one method chain so each step returns a fresh frame: no column is
# ever assigned into a filtered slice (the pattern behind pandas'
# SettingWithCopyWarning), and re-running the cell always starts from the CSV.
survey_columns = ['Country', 'YearsCodePro', 'ConvertedCompYearly']
df = (
    pd.read_csv('../data/survey_results_public.csv')[survey_columns]
    # Rows missing any of the three fields are discarded.
    .dropna(subset=survey_columns)
    # Keep only strictly positive compensation values.
    .loc[lambda frame: frame['ConvertedCompYearly'] > 0]
    .assign(
        # Map the two textual buckets to numbers (0 and 51), then coerce
        # anything still non-numeric to NaN so the final dropna removes it.
        YearsCodePro=lambda frame: pd.to_numeric(
            frame['YearsCodePro'].replace(
                {'Less than 1 year': 0, 'More than 50 years': 51}
            ),
            errors='coerce',
        )
    )
    .dropna(subset=['YearsCodePro'])
)

# Bar chart of the ten countries with the highest median yearly compensation.
median_by_country = df.groupby('Country')['ConvertedCompYearly'].median()
country_median = median_by_country.sort_values(ascending=False).head(10)

fig1 = go.Figure(
    data=go.Bar(
        x=country_median.index,
        y=country_median.values,
        marker={
            # Each bar is shaded by its own value on the Viridis scale.
            "color": country_median.values,
            "colorscale": 'Viridis',
            "line": {"color": "#2EC7FF", "width": 2},
        },
    )
)
fig1.update_layout(
    title="Median Salary by Country (Top 10)",
    xaxis_title="Country",
    yaxis_title="Median Salary (USD)",
    template="custom_dark",
)
fig1.show()

# Line chart of median yearly compensation for each distinct value of
# professional coding experience.
exp_median = df.groupby('YearsCodePro')['ConvertedCompYearly'].median()

fig2 = go.Figure(
    data=go.Scatter(
        x=exp_median.index,
        y=exp_median.values,
        mode='lines+markers',
        line={"color": "#2EC7FF", "width": 3},
        # Light markers with an outline in the line colour.
        marker={
            "size": 7,
            "color": "#F8F8FF",
            "line": {"width": 2, "color": "#2EC7FF"},
        },
    )
)
fig2.update_layout(
    title="Median Salary vs. Years of Professional Coding Experience",
    xaxis_title="Years of Experience",
    yaxis_title="Median Salary (USD)",
    template="custom_dark",
)
fig2.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
import os

# 1. Load Data
# The target is the yearly compensation column; every other column in the CSV
# is passed to the models as a feature.
df = pd.read_csv("../processed/cleaned_data.csv")
y = df["ConvertedCompYearly"]
X = df.drop(columns=["ConvertedCompYearly"])

# 2. Split Data
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 3. Define Pipelines
def _scaled_pipeline(step_name, estimator):
    """Return a Pipeline that standard-scales the features, then fits `estimator`."""
    return Pipeline([
        ('scaler', StandardScaler()),
        (step_name, estimator),
    ])


# One pipeline per candidate regressor; all three share the same scaling step.
pipeline_rf = _scaled_pipeline(
    'rf', RandomForestRegressor(n_estimators=100, random_state=42)
)

pipeline_lr = _scaled_pipeline('lr', LinearRegression())

pipeline_xgb = _scaled_pipeline(
    'xgb', XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
)

# 4. Train Models
# Fit every pipeline on the training split, in the same order as before
# (random forest, linear regression, XGBoost).
for model_pipeline in (pipeline_rf, pipeline_lr, pipeline_xgb):
    model_pipeline.fit(X_train, y_train)

# 5. Evaluate Models
def eval_model(pipe, X_test, y_test):
    """Score a fitted pipeline on held-out data.

    Args:
        pipe: Fitted estimator/pipeline exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True target values matching ``X_test``.

    Returns:
        A ``(mae, r2)`` tuple: mean absolute error and R² of the predictions.
    """
    predictions = pipe.predict(X_test)
    return (
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# Score the three fitted pipelines on the test split in one pass; the unpacking
# order matches the order of the pipelines in the tuple.
(mae_rf, r2_rf), (mae_lr, r2_lr), (mae_xgb, r2_xgb) = (
    eval_model(fitted, X_test, y_test)
    for fitted in (pipeline_rf, pipeline_lr, pipeline_xgb)
)

# Print one summary line per model, each metric rounded to two decimals.
print("\nRandom Forest: MAE = {:.2f}, R2 = {:.2f}".format(mae_rf, r2_rf))
print("Linear Regression: MAE = {:.2f}, R2 = {:.2f}".format(mae_lr, r2_lr))
print("XGBoost: MAE = {:.2f}, R2 = {:.2f}\n".format(mae_xgb, r2_xgb))

# 6. Save Pipelines
# Create the directory the artifacts are written to. It must be the same
# "../models" location used by the dump calls below; creating "models" in the
# current working directory would leave ../models missing and the dumps failing.
os.makedirs("../models", exist_ok=True)
joblib.dump(pipeline_rf, "../models/rf_pipeline.pkl")
joblib.dump(pipeline_lr, "../models/lr_pipeline.pkl")
joblib.dump(pipeline_xgb, "../models/xgb_pipeline.pkl")

# 7. Save Feature Columns (for inference in app)
# Persist the training column names, in order, next to the pipelines.
features = list(X.columns)
joblib.dump(features, "../models/features.pkl")

print("All models and feature list saved in models/ directory.")

# 8. (Optional) Save evaluation results
# One row per model with its test-set MAE and R2 score.
results = pd.DataFrame({
    'Model': ['Random Forest', 'Linear Regression', 'XGBoost'],
    'MAE': [mae_rf, mae_lr, mae_xgb],
    'R2 Score': [r2_rf, r2_lr, r2_xgb]
})
# Write next to the input data in ../processed (the directory read in step 1).
# The path needs the "/" after "..", otherwise it points at a non-existent
# "..processed" folder and to_csv raises.
results.to_csv('../processed/model_results.csv', index=False)


Random Forest: MAE = 28925.90, R2 = 0.52
Linear Regression: MAE = 29684.40, R2 = 0.51
XGBoost: MAE = 28265.38, R2 = 0.54

All models and feature list saved in models/ directory.
