In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in os.walk order.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Glassdoor jobs CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv("/kaggle/input/jobs-dataset-from-glassdoor/eda_data.csv")
# Preview the first 20 rows (last expression, so it is rendered as the cell output)
df.head(20)

In [None]:
# Print a per-column overview: dtype and non-null count for every column
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); numeric columns only by default
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Partition the column names of df by dtype:
#   categorical_cols -> object (text) columns
#   numerical_cols   -> int64 / float64 columns
categorical_cols = list(df.select_dtypes(include='object').columns)
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)

# Show both lists, categorical first
for column_group in (categorical_cols, numerical_cols):
    print(column_group)

In [None]:
# Separate the regression target (avg_salary) from the feature columns.
# NOTE(review): every other column of df stays in X. If df also carries columns
# derived from the salary (e.g. a min/max salary or a salary-range text column),
# they would leak the target into the features -- confirm against df.columns.
y = df['avg_salary']
X = df.drop(columns=['avg_salary'])

In [None]:
# Same dtype-based partition as before, but restricted to the feature matrix X
# (i.e. without the target column).
categorical_cols_x = list(X.select_dtypes(include='object').columns)
numerical_cols_x = list(X.select_dtypes(include=['int64', 'float64']).columns)

In [None]:
#encoding & standardization 
from sklearn.preprocessing import StandardScaler

# One-hot encode the categorical (object) feature columns.
# The resulting frame keeps X's row index.
X_encoded = pd.get_dummies(X[categorical_cols_x])

# Standardize the numeric feature columns (zero mean, unit variance).
# NOTE(review): the scaler is fit on every row, before the train/test split that
# happens later in the notebook, so statistics of the eventual test rows are
# baked into the scaling.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X[numerical_cols_x]),  # read from X, the feature matrix, not df
    columns=numerical_cols_x,
    index=X.index,  # fit_transform returns a bare array; restore X's row labels
)

# Combine encoded and scaled frames side by side. concat(axis=1) aligns rows on
# the index, so both frames must share X's index; otherwise rows would be
# mismatched and padded with NaN whenever df does not have a default RangeIndex.
X_processed = pd.concat([X_encoded, X_scaled], axis=1)

In [None]:
# (rows, columns) of the combined one-hot + scaled feature matrix
X_processed.shape

In [None]:
from sklearn.decomposition import PCA

# Project the processed features with PCA, keeping however many components are
# needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_pca = pd.DataFrame(pca.fit_transform(X_processed))

# Report how many components were kept and how much variance they cover
variance_ratios = pca.explained_variance_ratio_
print(f"Number of components selected: {pca.n_components_}")
print(f"Total variance explained: {sum(variance_ratios):.2f}")

# Cumulative explained variance as a function of the number of components
component_counts = range(1, len(variance_ratios) + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, np.cumsum(variance_ratios))
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance Ratio')
ax.set_title('Explained Variance vs Number of Components')
ax.grid(True)
plt.show()

In [None]:
#train test split
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): the scaler and PCA above were fit on all rows before this split,
# so the test rows have already influenced the features.
X_train,X_test,y_train,y_test = train_test_split(X_pca,y,test_size=0.3,random_state=42)

In [None]:
#implement models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet


In [None]:
# Baseline estimators with fixed, hand-picked hyperparameters.
# Every estimator that accepts a seed is pinned to random_state=42.

# Linear family
lr = LinearRegression()
ridge = Ridge(alpha=1.0, random_state=42)
lasso = Lasso(alpha=1.0, random_state=42)
elastic = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)

# Single tree and nearest neighbours
dt = DecisionTreeRegressor(max_depth=10, min_samples_split=5, random_state=42)
knn = KNeighborsRegressor(n_neighbors=5, weights='uniform')

# Ensembles
rf = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=5, random_state=42)
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
ada = AdaBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Display name -> baseline estimator. The insertion order here is the order in
# which the models are trained, printed, and drawn on the bar chart.
models = {
    'Linear Regression': lr,
    'Lasso Regression': lasso,
    'Ridge Regression': ridge,
    'ElasticNet': elastic,
    'Decision Tree': dt,
    'Random Forest': rf,
    'XGBoost': xgb,
    'KNN': knn,
    'AdaBoost': ada,
}

# Display name -> RMSE on the held-out test split
model_performances = {}

for name, model in models.items():
    # Train on the training split, then predict the held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE = square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    model_performances[name] = rmse
    print(f"{name} RMSE: {rmse:.2f}")

# Bar chart comparing the test RMSE of every baseline model
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(model_performances.keys()), list(model_performances.values()))
ax.set_title('Model Performances (RMSE)')
ax.set_ylabel('RMSE')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np
from sklearn.linear_model import Lasso

# Define RMSE scorer.
# RMSE is an error (lower is better), so the scorer must be built with
# greater_is_better=False: make_scorer then negates the value, and GridSearchCV
# (which always maximizes the score) ends up minimizing RMSE. Without that flag
# the search would select the hyperparameters with the HIGHEST error.
def _rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


rmse_scorer = make_scorer(_rmse, greater_is_better=False)

# Parameter grids for each model
param_grid_lr = {
    'fit_intercept': [True],
    'positive': [False]
}

param_grid_dt = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_rf = {
    'n_estimators': [100, 200],  # Reduced from [100, 200, 300]
    'max_depth': [5, 10],        # Reduced from [5, 10, 15]
    'min_samples_split': [5, 10]  # Reduced options
}

param_grid_xgb = {
    'n_estimators': [100],       # Fixed value instead of [100, 200]
    'max_depth': [3, 5],         # Reduced from [3, 5, 7]
    'learning_rate': [0.1, 0.3]  # Removed smallest learning rate
}

param_grid_ada = {
    'n_estimators': [50, 100],   # Removed 200
    'learning_rate': [0.1, 0.3], # Removed smallest learning rate
    'loss': ['linear', 'square'] # Removed exponential
}

param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # Manhattan or Euclidean distance
}

param_grid_ridge = {
    'alpha': [0.1, 1.0, 10.0],
    'fit_intercept': [True],
    'positive': [False]
}

param_grid_lasso = {
    'alpha': [0.1, 1.0, 10.0],
    'fit_intercept': [True],
    'positive': [False]
}

param_grid_elastic = {
    'alpha': [0.1, 1.0, 10.0],
    'l1_ratio': [0.2, 0.5, 0.8],
    'fit_intercept': [True],
    'positive': [False]
}

# Display name -> (estimator, parameter grid). Insertion order is the order in
# which the searches run and appear in the final bar chart.
search_specs = {
    'Linear Regression': (lr, param_grid_lr),
    'Ridge Regression': (ridge, param_grid_ridge),
    'Lasso Regression': (lasso, param_grid_lasso),
    'ElasticNet': (elastic, param_grid_elastic),
    'Decision Tree': (dt, param_grid_dt),
    'Random Forest': (rf, param_grid_rf),
    'XGBoost': (xgb, param_grid_xgb),
    'KNN': (knn, param_grid_knn),
    'AdaBoost': (ada, param_grid_ada),
}

# Create GridSearchCV objects: 5-fold CV, all cores, scored by (negated) RMSE
grid_searches = {
    name: GridSearchCV(estimator, grid, scoring=rmse_scorer, cv=5, n_jobs=-1)
    for name, (estimator, grid) in search_specs.items()
}

# Fit all models and store results
best_params = {}
best_scores = {}

for name, grid_search in grid_searches.items():
    print(f"\nTraining {name}...")
    try:
        grid_search.fit(X_train, y_train)
    except Exception as e:
        # Best-effort: report the failure and keep tuning the remaining models.
        # A failed model has no entry in best_params / best_scores.
        print(f"Error with {name}: {str(e)}")
        continue
    # best_score_ is the negated RMSE (see scorer above); flip the sign back so
    # the stored and printed value is a plain, positive RMSE.
    best_rmse = -grid_search.best_score_
    best_params[name] = grid_search.best_params_
    best_scores[name] = best_rmse
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best RMSE: {best_rmse:.2f}")

# Plot results
plt.figure(figsize=(10,6))
plt.bar(best_scores.keys(), best_scores.values())
plt.title('Model Performances After Hyperparameter Tuning (RMSE)')
plt.xticks(rotation=45)
plt.ylabel('RMSE')
plt.tight_layout()
plt.show()

In [None]:
# Take the Ridge grid search and the estimator it refit with its selected
# hyperparameters, then predict the held-out test rows.
ridge_search = grid_searches['Ridge Regression']
best_ridge = ridge_search.best_estimator_
y_pred_ridge = best_ridge.predict(X_test)

# Scatter of actual vs predicted salary; the dashed red diagonal marks where a
# perfect prediction would fall.
salary_bounds = [y_test.min(), y_test.max()]
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_ridge, alpha=0.5)
ax.plot(salary_bounds, salary_bounds, 'r--', lw=2)
ax.set_xlabel('Actual Salary')
ax.set_ylabel('Predicted Salary')
ax.set_title('Ridge Regression: Actual vs Predicted Salary')
fig.tight_layout()
plt.show()

# Report the selected hyperparameters and the resulting test-set RMSE
print("Best Parameters:", ridge_search.best_params_)
print("RMSE on test set:", np.sqrt(mean_squared_error(y_test, y_pred_ridge)))

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# End-to-end pipeline: scaling + one-hot encoding -> PCA -> Ridge, so raw feature
# rows can be passed straight to predict().

# First identify feature columns excluding target
feature_cols = [col for col in df.columns if col != 'avg_salary']

# Identify categorical and numerical columns
categorical_cols = df[feature_cols].select_dtypes(include='object').columns.to_list()
numerical_cols = df[feature_cols].select_dtypes(include=['int64','float64']).columns.to_list()

# Create preprocessing steps with identified columns.
# handle_unknown='ignore' lets the fitted pipeline score rows containing a
# category that was not present during fit (encoded as all zeros) instead of
# raising, which the encoder's default ('error') would do.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ])

# Create the complete pipeline.
# NOTE(review): alpha comes from the earlier grid search, which was tuned on
# features built with pd.get_dummies (no dropped level) rather than this encoder.
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('ridge', Ridge(alpha=best_params['Ridge Regression']['alpha'],
                   random_state=42))
])

# Fit pipeline on all rows and predict the first row as a usage example.
# The example row is part of the training data, so this demonstrates the API,
# not out-of-sample accuracy.
ridge_pipeline.fit(X, y)
example_input = X.iloc[0:1].copy()
prediction = ridge_pipeline.predict(example_input)

# Print column information and results
print("Features used in pipeline:")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print("\nPrediction Results:")
print("-" * 50)
print(f"Predicted Salary: ${prediction[0]:,.2f}k")
print(f"Actual Salary: ${y.iloc[0]:,.2f}k")