# Machine Learning Analysis - Position Salaries Dataset

This notebook contains a comprehensive machine learning analysis for predicting salaries from position levels.

## Objectives
1. Prepare data for machine learning
2. Implement multiple regression models:
   - Linear Regression
   - Polynomial Regression (various degrees)
   - Random Forest Regression
   - Support Vector Regression (SVR)
3. Evaluate and compare models
4. Select the best model for salary prediction
5. Make predictions for new positions


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle
import warnings
from pathlib import Path

# Plot appearance: seaborn grid theme plus a wide default canvas.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 8)})
# NOTE(review): this silences every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Locate and read the dataset. The project root is taken to be three directories
# above the current working directory, so it depends on where the kernel was started.
project_root = Path().resolve().parent.parent.parent
data_path = project_root.joinpath("data", "raw", "Position_Salaries.csv")
df = pd.read_csv(data_path)

print("Data loaded successfully!")
print("Shape:", df.shape)
df.head()


## 1. Data Preparation


In [None]:
# Prepare features and target.
# X is kept 2-D (one column, 'Level') because scikit-learn estimators expect a
# feature matrix; y is the 1-D 'Salary' vector.
X = df[['Level']].values
y = df['Salary'].values

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# The dataset is small, so the models in the following cells are fit on all rows (X, y).
# The hold-out split below is produced for reference only; none of the modelling
# cells that follow read X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

# For visualization, build a finer grid of levels as an (n, 1) column.
# - X.min() / X.max() return scalars. The builtin min(X) / max(X) on a 2-D array
#   return 1-element arrays, which np.arange only accepts through an
#   array-to-scalar conversion that newer NumPy releases deprecate.
# - np.arange excludes `stop`, so half a step is added to keep the highest
#   observed level on the grid and let fitted curves reach the last data point.
GRID_STEP = 0.1
X_grid = np.arange(X.min(), X.max() + GRID_STEP / 2, GRID_STEP).reshape(-1, 1)


## 2. Linear Regression


In [None]:
# Linear Regression: fit a straight line Salary ~ Level on all rows.
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# Predictions on the fitted rows (for metrics) and on the dense grid (for the curve)
y_pred_lin = lin_reg.predict(X)
y_pred_lin_grid = lin_reg.predict(X_grid)

# Metrics — computed on the same rows the model was fit on, i.e. in-sample scores
mse_lin = mean_squared_error(y, y_pred_lin)
rmse_lin = np.sqrt(mse_lin)
mae_lin = mean_absolute_error(y, y_pred_lin)
r2_lin = r2_score(y, y_pred_lin)

print("=" * 60)
print("LINEAR REGRESSION RESULTS")
print("=" * 60)
print(f"Mean Squared Error: {mse_lin:,.2f}")
print(f"Root Mean Squared Error: {rmse_lin:,.2f}")
print(f"Mean Absolute Error: {mae_lin:,.2f}")
print(f"R² Score: {r2_lin:.4f}")
print(f"Coefficients: {lin_reg.coef_[0]:.2f}")
print(f"Intercept: ${lin_reg.intercept_:,.2f}")

# Visualization
plt.figure(figsize=(12, 6))
plt.scatter(X, y, color='red', s=100, alpha=0.7, label='Actual Data')
plt.plot(X_grid, y_pred_lin_grid, color='blue', linewidth=2, label='Linear Regression')
plt.title('Linear Regression - Salary Prediction', fontsize=14, fontweight='bold')
plt.xlabel('Position Level', fontsize=12)
plt.ylabel('Salary', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
figure_path = project_root / 'results' / 'figures' / 'linear_regression.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


## 3. Polynomial Regression


In [None]:
# Polynomial Regression - Degree 2
# Expand Level into polynomial terms up to degree 2, then fit an ordinary
# linear regression on the expanded features.
poly_reg_2 = PolynomialFeatures(degree=2)
X_poly_2 = poly_reg_2.fit_transform(X)
lin_reg_poly_2 = LinearRegression()
lin_reg_poly_2.fit(X_poly_2, y)

# Predictions: the grid must go through the same fitted feature expansion
y_pred_poly_2 = lin_reg_poly_2.predict(X_poly_2)
X_grid_poly_2 = poly_reg_2.transform(X_grid)
y_pred_poly_2_grid = lin_reg_poly_2.predict(X_grid_poly_2)

# Metrics — computed on the same rows the model was fit on, i.e. in-sample scores
mse_poly_2 = mean_squared_error(y, y_pred_poly_2)
rmse_poly_2 = np.sqrt(mse_poly_2)
mae_poly_2 = mean_absolute_error(y, y_pred_poly_2)
r2_poly_2 = r2_score(y, y_pred_poly_2)

print("=" * 60)
print("POLYNOMIAL REGRESSION (Degree 2) RESULTS")
print("=" * 60)
print(f"Mean Squared Error: {mse_poly_2:,.2f}")
print(f"Root Mean Squared Error: {rmse_poly_2:,.2f}")
print(f"Mean Absolute Error: {mae_poly_2:,.2f}")
print(f"R² Score: {r2_poly_2:.4f}")

# Visualization
plt.figure(figsize=(12, 6))
plt.scatter(X, y, color='red', s=100, alpha=0.7, label='Actual Data')
plt.plot(X_grid, y_pred_poly_2_grid, color='blue', linewidth=2, label='Polynomial Regression (Degree 2)')
plt.title('Polynomial Regression (Degree 2) - Salary Prediction', fontsize=14, fontweight='bold')
plt.xlabel('Position Level', fontsize=12)
plt.ylabel('Salary', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
figure_path = project_root / 'results' / 'figures' / 'polynomial_regression_deg2.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Polynomial Regression - Degree 3
# Expand Level into polynomial terms up to degree 3, then fit an ordinary
# linear regression on the expanded features.
poly_reg_3 = PolynomialFeatures(degree=3)
X_poly_3 = poly_reg_3.fit_transform(X)
lin_reg_poly_3 = LinearRegression()
lin_reg_poly_3.fit(X_poly_3, y)

# Predictions: the grid must go through the same fitted feature expansion
y_pred_poly_3 = lin_reg_poly_3.predict(X_poly_3)
X_grid_poly_3 = poly_reg_3.transform(X_grid)
y_pred_poly_3_grid = lin_reg_poly_3.predict(X_grid_poly_3)

# Metrics — computed on the same rows the model was fit on, i.e. in-sample scores
mse_poly_3 = mean_squared_error(y, y_pred_poly_3)
rmse_poly_3 = np.sqrt(mse_poly_3)
mae_poly_3 = mean_absolute_error(y, y_pred_poly_3)
r2_poly_3 = r2_score(y, y_pred_poly_3)

print("=" * 60)
print("POLYNOMIAL REGRESSION (Degree 3) RESULTS")
print("=" * 60)
print(f"Mean Squared Error: {mse_poly_3:,.2f}")
print(f"Root Mean Squared Error: {rmse_poly_3:,.2f}")
print(f"Mean Absolute Error: {mae_poly_3:,.2f}")
print(f"R² Score: {r2_poly_3:.4f}")

# Visualization
plt.figure(figsize=(12, 6))
plt.scatter(X, y, color='red', s=100, alpha=0.7, label='Actual Data')
plt.plot(X_grid, y_pred_poly_3_grid, color='blue', linewidth=2, label='Polynomial Regression (Degree 3)')
plt.title('Polynomial Regression (Degree 3) - Salary Prediction', fontsize=14, fontweight='bold')
plt.xlabel('Position Level', fontsize=12)
plt.ylabel('Salary', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
figure_path = project_root / 'results' / 'figures' / 'polynomial_regression_deg3.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Polynomial Regression - Degree 4 (higher degree for a closer fit to the fitted rows)
# Expand Level into polynomial terms up to degree 4, then fit an ordinary
# linear regression on the expanded features.
poly_reg_4 = PolynomialFeatures(degree=4)
X_poly_4 = poly_reg_4.fit_transform(X)
lin_reg_poly_4 = LinearRegression()
lin_reg_poly_4.fit(X_poly_4, y)

# Predictions: the grid must go through the same fitted feature expansion
y_pred_poly_4 = lin_reg_poly_4.predict(X_poly_4)
X_grid_poly_4 = poly_reg_4.transform(X_grid)
y_pred_poly_4_grid = lin_reg_poly_4.predict(X_grid_poly_4)

# Metrics — computed on the same rows the model was fit on, i.e. in-sample scores
mse_poly_4 = mean_squared_error(y, y_pred_poly_4)
rmse_poly_4 = np.sqrt(mse_poly_4)
mae_poly_4 = mean_absolute_error(y, y_pred_poly_4)
r2_poly_4 = r2_score(y, y_pred_poly_4)

print("=" * 60)
print("POLYNOMIAL REGRESSION (Degree 4) RESULTS")
print("=" * 60)
print(f"Mean Squared Error: {mse_poly_4:,.2f}")
print(f"Root Mean Squared Error: {rmse_poly_4:,.2f}")
print(f"Mean Absolute Error: {mae_poly_4:,.2f}")
print(f"R² Score: {r2_poly_4:.4f}")

# Visualization
plt.figure(figsize=(12, 6))
plt.scatter(X, y, color='red', s=100, alpha=0.7, label='Actual Data')
plt.plot(X_grid, y_pred_poly_4_grid, color='blue', linewidth=2, label='Polynomial Regression (Degree 4)')
plt.title('Polynomial Regression (Degree 4) - Salary Prediction', fontsize=14, fontweight='bold')
plt.xlabel('Position Level', fontsize=12)
plt.ylabel('Salary', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
figure_path = project_root / 'results' / 'figures' / 'polynomial_regression_deg4.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


## 4. Random Forest Regression


In [None]:
# Random Forest Regression: 100 trees, fixed seed so the fit is repeatable.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X, y)

# Predictions on the fitted rows (for metrics) and on the dense grid (for the curve)
y_pred_rf = rf_reg.predict(X)
y_pred_rf_grid = rf_reg.predict(X_grid)

# Metrics — computed on the same rows the model was fit on, i.e. in-sample scores
mse_rf = mean_squared_error(y, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y, y_pred_rf)
r2_rf = r2_score(y, y_pred_rf)

print("=" * 60)
print("RANDOM FOREST REGRESSION RESULTS")
print("=" * 60)
print(f"Mean Squared Error: {mse_rf:,.2f}")
print(f"Root Mean Squared Error: {rmse_rf:,.2f}")
print(f"Mean Absolute Error: {mae_rf:,.2f}")
print(f"R² Score: {r2_rf:.4f}")

# Visualization
plt.figure(figsize=(12, 6))
plt.scatter(X, y, color='red', s=100, alpha=0.7, label='Actual Data')
plt.plot(X_grid, y_pred_rf_grid, color='green', linewidth=2, label='Random Forest Regression')
plt.title('Random Forest Regression - Salary Prediction', fontsize=14, fontweight='bold')
plt.xlabel('Position Level', fontsize=12)
plt.ylabel('Salary', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
figure_path = project_root / 'results' / 'figures' / 'random_forest_regression.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


## 5. Support Vector Regression (SVR)


In [None]:
# Support Vector Regression (SVR)
# SVR requires feature scaling, so both the feature and the target are
# standardized with their own scalers before fitting.
sc_X = StandardScaler()
sc_y = StandardScaler()
X_scaled = sc_X.fit_transform(X)
# StandardScaler expects 2-D input: reshape y to a column, then flatten back to 1-D
y_scaled = sc_y.fit_transform(y.reshape(-1, 1)).ravel()

# Fit SVR with an RBF kernel on the standardized data
svr_reg = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
svr_reg.fit(X_scaled, y_scaled)

# Predictions come out in standardized units; inverse_transform maps them back
# to the original salary scale.
y_pred_svr_scaled = svr_reg.predict(X_scaled)
y_pred_svr = sc_y.inverse_transform(y_pred_svr_scaled.reshape(-1, 1)).ravel()

# The grid is scaled with the already-fitted feature scaler (transform, not fit_transform)
X_grid_scaled = sc_X.transform(X_grid)
y_pred_svr_grid_scaled = svr_reg.predict(X_grid_scaled)
y_pred_svr_grid = sc_y.inverse_transform(y_pred_svr_grid_scaled.reshape(-1, 1)).ravel()

# Metrics — in original salary units, computed on the same rows the model was fit on
mse_svr = mean_squared_error(y, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)
mae_svr = mean_absolute_error(y, y_pred_svr)
r2_svr = r2_score(y, y_pred_svr)

print("=" * 60)
print("SUPPORT VECTOR REGRESSION (SVR) RESULTS")
print("=" * 60)
print(f"Mean Squared Error: {mse_svr:,.2f}")
print(f"Root Mean Squared Error: {rmse_svr:,.2f}")
print(f"Mean Absolute Error: {mae_svr:,.2f}")
print(f"R² Score: {r2_svr:.4f}")

# Visualization
plt.figure(figsize=(12, 6))
plt.scatter(X, y, color='red', s=100, alpha=0.7, label='Actual Data')
plt.plot(X_grid, y_pred_svr_grid, color='purple', linewidth=2, label='Support Vector Regression')
plt.title('Support Vector Regression (SVR) - Salary Prediction', fontsize=14, fontweight='bold')
plt.xlabel('Position Level', fontsize=12)
plt.ylabel('Salary', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
figure_path = project_root / 'results' / 'figures' / 'svr_regression.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


## 6. Model Comparison


In [None]:
# Compare all models side by side.
# Every metric here was computed on the rows the models were fit on, so the
# ranking reflects in-sample fit rather than performance on unseen data.
models_comparison = pd.DataFrame({
    'Model': ['Linear Regression', 'Polynomial (deg=2)', 'Polynomial (deg=3)',
              'Polynomial (deg=4)', 'Random Forest', 'SVR'],
    'MSE': [mse_lin, mse_poly_2, mse_poly_3, mse_poly_4, mse_rf, mse_svr],
    'RMSE': [rmse_lin, rmse_poly_2, rmse_poly_3, rmse_poly_4, rmse_rf, rmse_svr],
    'MAE': [mae_lin, mae_poly_2, mae_poly_3, mae_poly_4, mae_rf, mae_svr],
    'R² Score': [r2_lin, r2_poly_2, r2_poly_3, r2_poly_4, r2_rf, r2_svr]
})

print("=" * 80)
print("MODEL COMPARISON")
print("=" * 80)
print(models_comparison.to_string(index=False))

# Find best model: the row with the highest R² score
best_model_idx = models_comparison['R² Score'].idxmax()
best_model = models_comparison.loc[best_model_idx, 'Model']
print(f"\nBest Model (based on R² Score): {best_model}")
print(f"R² Score: {models_comparison.loc[best_model_idx, 'R² Score']:.4f}")

# Visualization - Model Comparison
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# One horizontal bar chart per metric; the three panels differ only in the
# column plotted and the bar color.
bar_panels = [
    (axes[0, 0], 'R² Score', 'steelblue'),
    (axes[0, 1], 'RMSE', 'coral'),
    (axes[1, 0], 'MAE', 'lightgreen'),
]
for panel_ax, metric, bar_color in bar_panels:
    panel_ax.barh(models_comparison['Model'], models_comparison[metric], color=bar_color, alpha=0.7)
    panel_ax.set_title(f'{metric} Comparison', fontsize=12, fontweight='bold')
    panel_ax.set_xlabel(metric, fontsize=10)
    panel_ax.grid(True, alpha=0.3, axis='x')

# All models visualization: overlay every fitted curve on the actual data so the
# panel matches its "All Models" title (the degree-2, degree-3 and SVR curves
# were previously left out).
overlay_ax = axes[1, 1]
overlay_ax.scatter(X, y, color='red', s=100, alpha=0.7, label='Actual Data', zorder=5)
overlay_ax.plot(X_grid, y_pred_lin_grid, color='blue', linewidth=1, label='Linear', alpha=0.5)
overlay_ax.plot(X_grid, y_pred_poly_2_grid, color='teal', linewidth=1, label='Poly (deg=2)', alpha=0.5)
overlay_ax.plot(X_grid, y_pred_poly_3_grid, color='brown', linewidth=1, label='Poly (deg=3)', alpha=0.5)
overlay_ax.plot(X_grid, y_pred_poly_4_grid, color='green', linewidth=2, label='Poly (deg=4)', alpha=0.8)
overlay_ax.plot(X_grid, y_pred_rf_grid, color='orange', linewidth=2, label='Random Forest', alpha=0.8)
overlay_ax.plot(X_grid, y_pred_svr_grid, color='purple', linewidth=2, label='SVR', alpha=0.8)
overlay_ax.set_title('All Models Comparison', fontsize=12, fontweight='bold')
overlay_ax.set_xlabel('Position Level', fontsize=10)
overlay_ax.set_ylabel('Salary', fontsize=10)
overlay_ax.legend()
overlay_ax.grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
figure_path = project_root / 'results' / 'figures' / 'model_comparison.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


## 7. Predictions for New Positions


In [None]:
# Predict salary for new position levels (one level per row, as the models expect 2-D input)
# NOTE(review): any level outside the range of the fitted data is an extrapolation,
# where a high-degree polynomial can behave erratically — confirm 11.0 is intended.
new_levels = np.array([[6.5], [7.5], [8.5], [9.5], [11.0]])

# Using the degree-4 polynomial model. The choice is hard-coded here; it is not
# read from `best_model` computed in the comparison cell.
predictions = lin_reg_poly_4.predict(poly_reg_4.transform(new_levels))

# Create predictions DataFrame
predictions_df = pd.DataFrame({
    'Level': new_levels.ravel(),
    'Predicted_Salary': predictions
})

print("=" * 60)
print("PREDICTIONS FOR NEW POSITION LEVELS")
print("=" * 60)
print(predictions_df.to_string(index=False))

# Visualization: fitted curve, the original data, and the new predictions as stars
plt.figure(figsize=(12, 6))
plt.scatter(X, y, color='red', s=100, alpha=0.7, label='Training Data', zorder=5)
plt.plot(X_grid, y_pred_poly_4_grid, color='blue', linewidth=2, label='Polynomial Regression (deg=4)')
plt.scatter(new_levels, predictions, color='green', s=150, marker='*',
           edgecolors='black', linewidth=2, label='Predictions', zorder=6)
plt.title('Salary Predictions for New Position Levels', fontsize=14, fontweight='bold')
plt.xlabel('Position Level', fontsize=12)
plt.ylabel('Salary', fontsize=12)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
# savefig does not create missing directories, so make sure the target folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
figure_path = project_root / 'results' / 'figures' / 'predictions.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Persist the degree-4 polynomial model, its feature transformer and its scores.
model_path = project_root.joinpath('results', 'models', 'best_model_poly4.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

# Bundle everything needed to reuse the model: the regressor, the fitted
# PolynomialFeatures transformer, and the scores reported for it.
model_bundle = {
    'model': lin_reg_poly_4,
    'poly_features': poly_reg_4,
    'r2_score': r2_poly_4,
    'rmse': rmse_poly_4,
}
with model_path.open('wb') as f:
    pickle.dump(model_bundle, f)

print(f"Best model saved to: {model_path}")
print("\nModel Performance:")
performance_lines = (
    ("R² Score", f"{r2_poly_4:.4f}"),
    ("RMSE", f"${rmse_poly_4:,.2f}"),
    ("MAE", f"${mae_poly_4:,.2f}"),
)
for metric_label, metric_text in performance_lines:
    print(f"  {metric_label}: {metric_text}")

# Write the new-level predictions next to the other results
predictions_path = project_root.joinpath('results', 'predictions.csv')
predictions_df.to_csv(predictions_path, index=False)
print(f"\nPredictions saved to: {predictions_path}")
