In [102]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

# --- Load -------------------------------------------------------------------
# Read the survey extract; ResponseId becomes the row index.
dtf = pd.read_csv("./survey_results_minimal.csv", index_col='ResponseId', encoding='utf-8')

# Work on a copy so the raw frame stays untouched if this cell is re-run.
filtered_dtf = dtf.copy()

# --- Numeric coercion ---------------------------------------------------------
# Turn the experience columns into numbers; anything non-numeric becomes NaN.
# Plain column assignment replaces the column, so it takes the numeric dtype.
# (`.loc[:, col] = ...` writes into the existing column in place on pandas >= 2.0
# and would leave an object-dtype column behind.)
for years_col in ('YearsCodePro', 'YearsCode'):
    filtered_dtf[years_col] = pd.to_numeric(filtered_dtf[years_col], errors='coerce')

# --- Label-encode categoricals ------------------------------------------------
categorical_columns = ['MainBranch','Age', 'RemoteWork', 'EdLevel', 'DevType', 'OrgSize', 'Country', 'Industry','ICorPM']
categorical_dtf = filtered_dtf.copy()

# One fitted encoder per column, kept so integer codes can be mapped back to labels later.
label_encoder_dict = {}

for col in categorical_columns:
    label_encoder = LabelEncoder()  # fresh encoder per column
    # astype(str) turns missing values into the literal string 'nan', which gets its own code.
    categorical_dtf[col] = label_encoder.fit_transform(categorical_dtf[col].astype(str))
    label_encoder_dict[col] = label_encoder

# --- Remove high-compensation outliers ----------------------------------------
# Upper fence = Q3 + x * IQR, with x as the IQR multiplier.
x = 1.5
Q1 = categorical_dtf['ConvertedCompYearly'].quantile(0.25)
Q3 = categorical_dtf['ConvertedCompYearly'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + x * IQR

# Only the upper fence is applied. Rows whose compensation is NaN fail the
# comparison and are therefore dropped here as well.
categorical_dtf = categorical_dtf[categorical_dtf['ConvertedCompYearly'] <= upper_bound]

# --- Finalise -----------------------------------------------------------------
# Everything must be numeric from here on; remaining missing feature values
# (including the coerced experience columns) are replaced by 0.
categorical_dtf = categorical_dtf.astype(float).fillna(0)
categorical_dtf.to_csv('categorical_dtf.csv', index=True, header=True)

# Features (X) and target (y): every column except yearly compensation is used
# to predict yearly compensation.
# NOTE(review): the "_top" suffix is kept for compatibility with later cells;
# no country filter is applied in this cell.
X_top = categorical_dtf.drop(columns=['ConvertedCompYearly'])  # Features
y_top = categorical_dtf['ConvertedCompYearly']  # Target

# Hold out 20% of the rows for the final evaluation; the search below only
# ever sees the training split.
X_train_top, X_test_top, y_train_top, y_test_top = train_test_split(X_top, y_top, test_size=0.2, random_state=42)

# Hyper-parameter distributions sampled by the randomized search
param_dist = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': np.linspace(0.01, 0.3, 10),  # Spread learning rates
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': np.linspace(0.7, 1.0, 4),        # More granular subsample rates
    'colsample_bytree': np.linspace(0.7, 1.0, 4),
}

# Base estimator whose hyper-parameters are tuned
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Randomized search configuration
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=50,                # Test 50 random combinations
    scoring='r2',             # Optimize for R-squared
    cv=3,                     # 3-fold cross-validation
    verbose=1,
    n_jobs=-1,                # Use all cores
    random_state=42           # Ensure reproducibility
)

# Run the search on the training split
random_search.fit(X_train_top, y_train_top)

# Best parameters and cross-validated performance
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best R-squared Score from CV: {best_score:.4f}")

# With the default refit=True, the search has already retrained the best
# configuration on the whole training split, so best_estimator_ is ready to
# use; fitting it a second time on the same data would only repeat that work.
best_model = random_search.best_estimator_

# Evaluate on the held-out test split
y_pred_best = best_model.predict(X_test_top)

mse_best = mean_squared_error(y_test_top, y_pred_best)
mae_best = mean_absolute_error(y_test_top, y_pred_best)
r2_best = r2_score(y_test_top, y_pred_best)

print(f"Mean Squared Error (MSE) with Best Model: {mse_best:.4f}")
print(f"Mean Absolute Error (MAE) with Best Model: {mae_best:.4f}")
print(f"R-squared (R2) with Best Model: {r2_best:.4f}")

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'subsample': 0.9, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.10666666666666666, 'colsample_bytree': 0.7999999999999999}
Best R-squared Score from CV: 0.6494
Mean Squared Error (MSE) with Best Model: 868293910.0715
Mean Absolute Error (MAE) with Best Model: 21278.6007
R-squared (R2) with Best Model: 0.6580


In [2]:
def recommend_skills_individual_with_best_value(user_profile, model, X_columns, X_data, label_encoder_dict, top_n=5):
    """
    Rank single-feature changes by how much they raise the predicted salary.

    For every feature, each distinct value observed in ``X_data`` is substituted
    into the individual's profile (all other features unchanged) and the model is
    asked for a new prediction. The value giving the largest increase over the
    current prediction becomes that feature's recommendation. Features for which
    no value increases the prediction are omitted.

    Parameters
    ----------
    user_profile : array-like (pandas Series, numpy array, list)
        The individual's feature values, in the same order as ``X_columns``.
        It is not modified.
    model : object with ``predict(2-D array) -> 1-D array``
        The trained model (e.g. an XGBRegressor).
    X_columns : iterable of str
        Feature names, in model input order.
    X_data : mapping / DataFrame indexable by feature name
        Data used to enumerate the candidate values of each feature.
    label_encoder_dict : dict
        Feature name -> fitted encoder exposing ``inverse_transform``; used to
        turn a recommended integer code back into its original label.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    (current_salary, recommendations)
        ``current_salary`` is the model's prediction for the unmodified profile;
        ``recommendations`` is a list of at most ``top_n`` message strings,
        sorted by estimated salary increase, largest first.
    """
    # Private float copy: works for Series / list / ndarray inputs alike and
    # guarantees the caller's object is never touched.
    profile = np.array(user_profile, dtype=float).ravel()

    # Prediction for the profile as it is today
    current_salary = model.predict(profile.reshape(1, -1))[0]

    ranked = []  # (salary_increase, message) pairs

    for i, feature in enumerate(X_columns):
        # Every distinct value this feature takes in the reference data
        candidate_values = np.unique(X_data[feature])
        if candidate_values.size == 0:
            continue

        # One row per candidate value, identical to the profile except in
        # column i; a single batched predict call scores them all.
        candidates = np.tile(profile, (candidate_values.size, 1))
        candidates[:, i] = candidate_values
        increases = np.asarray(model.predict(candidates)) - current_salary

        # argmax returns the first maximum, i.e. the same value a sequential
        # "strictly greater than the best so far" scan would settle on.
        best_idx = int(np.argmax(increases))
        best_salary_increase = increases[best_idx]

        # Nothing beats the current value: there is no change to recommend.
        if not best_salary_increase > 0:
            continue

        best_value = candidate_values[best_idx]

        # Decode label-encoded features back to their original label
        if feature in label_encoder_dict:
            encoder = label_encoder_dict[feature]
            best_value_str = encoder.inverse_transform([int(best_value)])[0]
        else:
            best_value_str = best_value

        recommendation = f"Change your {feature} to {best_value_str} to increase your salary by approximately ${best_salary_increase:.2f}."
        ranked.append((best_salary_increase, recommendation))

    # Largest estimated increase first (stable for ties)
    ranked.sort(key=lambda item: item[0], reverse=True)

    top_recommendations = [message for _, message in ranked[:top_n]]
    return current_salary, top_recommendations


In [103]:
# Feature vector of the first respondent in X_top. X_top is the full feature
# matrix, not the held-out test split.
user_profile = X_top.iloc[0].values

# top_n=1000 is far above the number of features in the profile, so the list of
# recommendations is effectively not truncated.
current_salary, skills_recommendations = recommend_skills_individual_with_best_value(
    user_profile=user_profile,
    model=best_model,
    X_columns=X_top.columns,
    X_data=X_top,
    label_encoder_dict=label_encoder_dict,
    top_n=1000,
)
print(f"user_profile: {user_profile}")


user_profile: [  0.   0.   0.   5.   3.   1.   5.   3. 112.   1.   3.  13.  10.   1.
   0.   1.   1.   1.   1.   1.   1.   1.   1.   1.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   1.   1.   1.   1.   0.   0.   0.   0.]


In [104]:
import joblib

# Persist the tuned model and the per-column label encoders side by side, so
# the encoders fitted in this notebook can be reloaded together with the model.
# NOTE(review): joblib artifacts are pickle-based — only load these files from
# trusted sources.
joblib.dump(best_model, 'model.pkl')
# Last expression of the cell: its return value (the list of written file
# names) is echoed as the cell output.
joblib.dump(label_encoder_dict, 'label_encoders.pkl')


['label_encoders.pkl']