In [72]:
import pickle
import pandas as pd

In [73]:
# File name of the pickled model bundle that is passed to load_model below.
MODEL_FILE = "salary_model.pkl"

def load_model(path: str = None):
    """Load the pickled model bundle stored at `path`.

    Args:
        path: Path to the pickle file. When omitted (``None``), the
            module-level ``MODEL_FILE`` is used instead of passing ``None``
            to ``open`` (which fails with an unhelpful ``TypeError``).

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the content is not a valid pickle (pickle
            may also raise other exceptions for corrupt or incompatible data).
    """
    if path is None:
        path = MODEL_FILE

    # SECURITY: unpickling can execute arbitrary code embedded in the file,
    # so only load bundles that come from a trusted source.
    with open(path, "rb") as f:
        return pickle.load(f)

# Load the bundle once; later cells read the 'model' and 'dict_vectorizer' entries from it.
model_package = load_model(MODEL_FILE)


In [75]:
# Unpack the two objects the prediction cells need from the loaded bundle.
model = model_package['model']
dv = model_package['dict_vectorizer']
# Disabled: not used by the cells below, which derive the columns from `dv` instead.
# NOTE(review): the key is spelled 'numberical_features' — confirm the spelling
# against the pickle's actual keys before re-enabling.
# numerical_features = model_package['numberical_features']

In [76]:
# Raw rows to score; column names and text values are normalized in the next cell.
df = pd.read_csv('salary_data.csv')

In [None]:
# Create X dataset for prediction
import numpy as np  # kept in this cell so the cell still runs on its own


def _normalize_frame(frame):
    """Return a copy of `frame` with lower-case, underscore-separated text.

    Column names and the values of object-dtype columns are lower-cased and
    have spaces replaced by underscores; all other columns are left as-is.
    """
    normalized = frame.copy()
    normalized.columns = normalized.columns.str.lower().str.replace(' ', '_')
    for col in normalized.select_dtypes(include=['object']).columns:
        normalized[col] = normalized[col].str.lower().str.replace(' ', '_')
    return normalized


def _source_columns(feature_names):
    """Return the sorted, de-duplicated column names behind encoded features.

    The text before the first '=' of each encoded feature name is taken as
    the source column (names without '=' are used unchanged).
    """
    return sorted({name.split('=')[0] for name in feature_names})


# Normalize dataframe to match training preprocessing
df_norm = _normalize_frame(df)

# Extract required features from DictVectorizer
dv_feature_names = dv.get_feature_names_out()
all_training_cols = _source_columns(dv_feature_names)

# Fail fast with a readable message: selecting the columns below would raise
# a KeyError anyway, so a printed warning alone is not enough.
missing_cols = [c for c in all_training_cols if c not in df_norm.columns]
if missing_cols:
    raise KeyError(f"Input data is missing columns used in training: {missing_cols}")

# Select columns used in training
df_features = df_norm[all_training_cols].copy()

# Handle missing values: text columns get a 'missing' category, others get 0
for col in df_features.columns:
    if df_features[col].isnull().any():
        fill_value = 'missing' if df_features[col].dtype == 'object' else 0
        df_features[col] = df_features[col].fillna(fill_value)

# Convert to dictionary format for DictVectorizer
feature_dicts = df_features.to_dict('records')

# Transform using DictVectorizer
X = dv.transform(feature_dicts)

# Verify feature count matches model; predicting with the wrong width cannot
# succeed, so stop here instead of printing and carrying on.
expected_total = model.n_features_in_
if X.shape[1] != expected_total:
    raise ValueError(f"Feature mismatch! Expected: {expected_total}, Got: {X.shape[1]}")

# For downstream single-row predictions (set before predicting so it exists
# even if the prediction step below fails)
Y = df.get('avg_salary')

# Make predictions. Only the predict call is guarded, so a failure while saving
# or displaying results is not misreported as a prediction failure; the error
# is re-raised so later cells never run against stale or missing predictions.
try:
    predictions = model.predict(X)
except Exception as e:
    print(f"✗ Prediction failed: {str(e)}")
    raise

print("Predictions successful!")
print(f"  Min: ${predictions.min():.2f}k")
print(f"  Max: ${predictions.max():.2f}k")
print(f"  Mean: ${predictions.mean():.2f}k")
print(f"  Median: ${np.median(predictions):.2f}k")

# Add predictions to original dataframe
df['predicted_salary'] = predictions

# Save results
output_file = 'salary_predictions.csv'
df.to_csv(output_file, index=False)
print(f"\n✓ Results saved to: {output_file}")

# Show sample comparison if actual salary available
if 'avg_salary' in df.columns:
    print("\nSample predictions (first 5 rows):")
    comparison = pd.DataFrame({
        'Actual': df['avg_salary'].head(),
        'Predicted': predictions[:5],
        'Error': df['avg_salary'].head() - predictions[:5]
    })
    print(comparison.to_string(index=False))

PREDICTION PIPELINE - Using DictVectorizer for ALL features

1. Normalizing dataframe (lowercase, underscores)...
   ✓ Normalized 33 columns

2. DictVectorizer info:
   Total encoded features: 114
   Numerical features: ['python_yn', 'spark', 'aws', 'num_comp', 'desc_len', 'employer_provided', 'excel', 'hourly']

3. Required columns from training (count=15):
   ['aws', 'desc_len', 'employer_provided', 'excel', 'hourly', 'job_simp', 'job_state', 'num_comp', 'python_yn', 'revenue', 'sector', 'seniority', 'size', 'spark', 'type_of_ownership']

   ✓ All required columns present in dataframe

4. Handling missing values...

5. Converting to dictionary format...
   ✓ Created 742 dictionaries
   Sample dict keys: ['aws', 'desc_len', 'employer_provided', 'excel', 'hourly', 'job_simp', 'job_state', 'num_comp', 'python_yn', 'revenue', 'sector', 'seniority', 'size', 'spark', 'type_of_ownership']

6. Applying DictVectorizer transformation...
   ✓ Transformed shape: (742, 114)
   Total features: 114

In [None]:
# Single sample prediction test
sample_index = 192

# Exclusive upper bounds (absolute error in percent) for each quality label,
# checked in order; anything at or above the last bound is "POOR".
QUALITY_BANDS = [(10, "EXCELLENT"), (20, "GOOD"), (30, "ACCEPTABLE")]

# An out-of-range index would otherwise produce an empty slice and a confusing
# error from the model, so reject it up front.
n_rows = X.shape[0]
if not 0 <= sample_index < n_rows:
    raise IndexError(f"sample_index={sample_index} is outside the {n_rows} available rows")

X_single = X[sample_index:sample_index+1]
y_actual = Y.iloc[sample_index] if Y is not None else None
# A missing (NaN) target cannot be compared against; treat it as "no actual"
# rather than reporting a nan% error labelled POOR.
if y_actual is not None and pd.isna(y_actual):
    y_actual = None

y_prediction = model.predict(X_single)[0]

print(f"Sample #{sample_index}:")
print(f"  Predicted Salary: ${y_prediction:.2f}k")

if y_actual is None:
    print("  (No actual salary available for comparison)")
else:
    error_dollars = y_actual - y_prediction
    print(f"  Actual:    ${y_actual:.2f}k")

    if y_actual == 0:
        # A relative error is undefined for a zero actual value.
        print(f"  Error:     ${error_dollars:.2f}k")
        quality = "N/A (actual salary is zero)"
    else:
        error_percent = (error_dollars / y_actual) * 100
        print(f"  Error:     ${error_dollars:.2f}k ({error_percent:+.2f}%)")
        quality = next(
            (label for bound, label in QUALITY_BANDS if abs(error_percent) < bound),
            "POOR",
        )

    print(f"  Quality:   {quality}")

Testing single sample prediction for row #192...

✓ Prediction Made:
  Predicted Salary: $131.78k

✓ Comparison:
  Actual:    $134.50k
  Predicted: $131.78k
  Error:     $2.72k (+2.03%)
  Quality:   ✓ EXCELLENT (<10% error)


In [103]:
# One hand-written record in the same dict-per-row shape that is fed to the
# DictVectorizer; text values use the lower-case/underscore form.
customer_data = {
    # numeric features
    'python_yn': 1,
    'spark': 0,
    'aws': 1,
    'num_comp': 3,
    'desc_len': 3747,
    'employer_provided': 0,
    'excel': 1,
    'hourly': 0,
    # categorical features
    'seniority': 'jr',
    'job_state': 'tx',
    'type_of_ownership': 'company_-_public',
    'sector': 'real_estate',
    'job_simp': 'data_scientist',
    'revenue': '$1_to_$2_billion_(usd)',
    'size': '201_to_500_employees',
}

In [104]:
dv, model

(DictVectorizer(dtype=<class 'int'>, sparse=False),
 RandomForestRegressor(max_depth=15, n_estimators=200, n_jobs=-1,
                       random_state=42))

In [105]:
# transform takes a sequence of records, so the single dict is wrapped in a list.
customer_x = dv.transform([customer_data])

In [106]:
# NOTE: this rebinds `predictions`, replacing the batch predictions array that
# the earlier pipeline cell stored under the same name.
predictions = model.predict(customer_x)
predictions

array([112.82678698])