<a href="https://colab.research.google.com/github/IsaiahHanna/Career-Paths/blob/main/ModelSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Set Up Path and Directory

In [None]:
# Fetch the project repository and make it the working directory for the cells below.
# NOTE(review): the recorded output ends in /content/Career-Paths/Career-Paths, so this
# cell was evidently run from inside an earlier clone; run it once from a fresh runtime
# to avoid nesting the checkout.
!git clone https://github.com/IsaiahHanna/Career-Paths.git
%cd Career-Paths

Cloning into 'Career-Paths'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 48 (delta 17), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (48/48), 13.98 MiB | 10.74 MiB/s, done.
Resolving deltas: 100% (17/17), done.
/content/Career-Paths/Career-Paths


# Model Selection

#### Import Modules

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor,HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.preprocessing import MultiLabelBinarizer,PolynomialFeatures
from sklearn.pipeline import Pipeline



#### Import Dataset

In [None]:
# Read the dataset from the current working directory into the working frame `df`.
df = pd.read_csv(filepath_or_buffer="CareerPaths.csv")

In [None]:
# Text columns that are replaced by one-hot indicator columns.
stringCols = [
  'CREDENTIAL',
  'GROUPBY',
  'INSTITUTION',
  'INSTITUTION_NAME',
  'PROGRAM_AREA_NAME',
  'PROGRAM_NAME',
]

# Swap each text column for its indicator columns, prefixed with the source column name.
for name in stringCols:
  indicators = pd.get_dummies(df[name], prefix=name)
  df = df.drop(columns=[name]).join(indicators)

# Cast every boolean column to 0/1 integers.
boolCols = [name for name in df.columns.tolist() if df[name].dtype == 'bool']
for name in boolCols:
  df[name] = df[name].astype(int)

##### Split into training and testing sets

In [None]:
# Target is SALARY_WAGE; the features are every remaining column except PROGRAM_AREA.
y = df['SALARY_WAGE']
x = df.drop(columns=["SALARY_WAGE", 'PROGRAM_AREA'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Second version of the data with every row that contains a missing value removed,
# split the same way (20% test, seed 42) into the *NoNa variables.
noNa = df.dropna(axis=0, how='any')

# Note: x and y are rebound to the NaN-free data here; xTrain/xTest/yTrain/yTest
# created earlier keep referring to the split of the full frame.
y = noNa['SALARY_WAGE']
x = noNa.drop(columns=["SALARY_WAGE", 'PROGRAM_AREA'])

xTrainNoNa, xTestNoNa, yTrainNoNa, yTestNoNa = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### Compare candidate models to find the best performer

#### Lasso Regression

In [None]:
# Fit a Lasso model (alpha=0.1) on the NaN-free training split, then predict
# the NaN-free test split. `fit` returns the estimator itself, so it can be chained.
lasso = Lasso(alpha=0.1).fit(xTrainNoNa, yTrainNoNa)
yPred = lasso.predict(xTestNoNa)

In [None]:
# Score the Lasso predictions against the NaN-free held-out targets.
mse, mae, r2 = (
    metric(yTestNoNa, yPred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

Mean Squared Error: 21.681659621824704
Mean Absolute Error: 3.275886268923326
R-squared: 0.5518969373633098


#### Ridge Regression

In [None]:
# Ridge model (alpha=0.1) trained on the NaN-free training split; predictions are
# made for the NaN-free test split.
ridge = Ridge(alpha=0.1)
yPred = ridge.fit(xTrainNoNa, yTrainNoNa).predict(xTestNoNa)

In [None]:
# Score the Ridge predictions against the NaN-free held-out targets.
r2 = r2_score(yTestNoNa, yPred)
mae = mean_absolute_error(yTestNoNa, yPred)
mse = mean_squared_error(yTestNoNa, yPred)

print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

Mean Squared Error: 18.67938651304906
Mean Absolute Error: 3.0094392180735574
R-squared: 0.6139460516091556


#### Decision Tree

In [None]:
# Single decision tree (seeded for reproducibility) trained on the full-frame split.
regressor = DecisionTreeRegressor(random_state=4).fit(xTrain, yTrain)
yPred = regressor.predict(xTest)

In [None]:
# Score the decision-tree predictions against the full-frame held-out targets.
mse, mae, r2 = (
    metric(yTest, yPred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

Mean Squared Error: 48.98635320663116
Mean Absolute Error: 4.22561384736243
R-squared: 0.2911158545838586


#### Random Forest

In [None]:
# Random forest trained on the full-frame split, with out-of-bag scoring enabled.
# With only 10 trees some training rows are never left out of a bootstrap sample, so
# the OOB estimate is unreliable (the recorded run emitted a warning, most likely
# sklearn's "Some inputs do not have OOB scores"). 100 trees gives every row OOB
# predictions in practice and makes oob_score_ meaningful.
regressor = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=4)
regressor.fit(xTrain, yTrain)
yPred = regressor.predict(xTest)

  warn(


In [None]:
# Gather the forest's out-of-bag score together with the held-out test metrics.
oob_score = regressor.oob_score_
mse, mae, r2 = (
    metric(yTest, yPred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    f"Out-of-Bag Score: {oob_score}",
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)


Out-of-Bag Score: 0.41519586454757296
Mean Squared Error: 25.781607750377916
Mean Absolute Error: 3.296822125334026
R-squared: 0.6269129710372721


#### HistGradientBoostingRegressor

In [None]:
# Histogram-based gradient boosting (seeded) trained on the full-frame split.
hist = HistGradientBoostingRegressor(random_state=4).fit(xTrain, yTrain)
yPred = hist.predict(xTest)

In [None]:
# Score the gradient-boosting predictions against the full-frame held-out targets.
r2 = r2_score(yTest, yPred)
mae = mean_absolute_error(yTest, yPred)
mse = mean_squared_error(yTest, yPred)

print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

Mean Squared Error: 24.675458878726484
Mean Absolute Error: 3.1410955723336675
R-squared: 0.6429201107048461


#### Fine Tuning the Ridge Regression Model


In [None]:
# Two-step pipeline: recursive feature elimination down to 10 features (ranked by a
# Ridge model), followed by a Ridge regression on the surviving features.
ridge = Ridge()
rfe = RFE(ridge, n_features_to_select=10)

pipeline = Pipeline(steps=[('rfe', rfe), ('ridge', ridge)])


In [None]:
# Candidate regularisation strengths for the pipeline's final 'ridge' step.
parameters = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Exhaustive search over the candidates using 5-fold cross-validation.
grid = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5)

In [None]:
# Run the grid search on the NaN-free training split.
grid.fit(X=xTrainNoNa, y=yTrainNoNa)


In [None]:
# Best pipeline found by the grid search and the parameters that produced it;
# predictions are made for the NaN-free test split.
bestModel = grid.best_estimator_
bestParams = grid.best_params_
yPred = bestModel.predict(xTestNoNa)
print(f"Best Parameters (Grid Search): {bestParams}")

# GridSearchCV fits clones of the pipeline, so the notebook-level `rfe` object is
# never fitted and has no `support_` attribute. Read the selection mask from the
# fitted RFE step inside the best estimator and report the chosen column names.
fittedRfe = bestModel.named_steps['rfe']
selectedFeatures = xTestNoNa.columns[fittedRfe.support_].tolist()
print(f"Selected Features: {selectedFeatures}")

In [None]:
# yPred was produced from xTestNoNa, so it has to be scored against yTestNoNa.
# Scoring against yTest (the split of the full frame) compares mismatched rows or
# fails on a length mismatch.
mse = mean_squared_error(yTestNoNa, yPred)
mae = mean_absolute_error(yTestNoNa, yPred)
r2 = r2_score(yTestNoNa, yPred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")