<a href="https://colab.research.google.com/github/IsaiahHanna/Career-Paths/blob/main/ModelSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Set Up Path and Directory

In [None]:
!git clone https://github.com/IsaiahHanna/Career-Paths.git
%cd Career-Paths

Cloning into 'Career-Paths'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 45 (delta 15), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (45/45), 13.97 MiB | 10.51 MiB/s, done.
Resolving deltas: 100% (15/15), done.
/content/Career-Paths/Career-Paths/Career-Paths/Career-Paths/Career-Paths/Career-Paths/Career-Paths/Career-Paths/Career-Paths/Career-Paths/Career-Paths


# Model Selection

#### Import Modules

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.preprocessing import MultiLabelBinarizer



#### Import Dataset

In [None]:
# Load the dataset from the working directory (the cloned repository root).
DATA_FILE = "CareerPaths.csv"
df = pd.read_csv(DATA_FILE)

In [None]:
# Columns that are one-hot encoded below.
stringCols = [
    'CREDENTIAL',
    'GROUPBY',
    'INSTITUTION',
    'INSTITUTION_NAME',
    'PROGRAM_AREA_NAME',
    'PROGRAM_NAME',
]

# Replace each listed column with its indicator columns. The new columns are
# prefixed with the source column name and appended on the right of the frame.
for col in stringCols:
    dummies = pd.get_dummies(df[col], prefix=col)
    df = df.drop(columns=[col]).join(dummies)

# Cast every boolean column to integers so the frame is numeric 0/1 throughout.
boolCols = [col for col in df.columns.tolist() if df[col].dtype == 'bool']
df = df.astype({col: int for col in boolCols})

##### Split into training and testing sets

In [None]:
# Target is SALARY_WAGE; features are every remaining column except PROGRAM_AREA.
y = df['SALARY_WAGE']
x = df.drop(["SALARY_WAGE", 'PROGRAM_AREA'], axis=1)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### Test different models to see which is the best

#### Decision Tree

In [None]:
# Baseline: an untuned decision tree fitted on the training split
# (fit() returns the estimator itself, so the calls can be chained).
regressor = DecisionTreeRegressor(random_state=4).fit(xTrain, yTrain)

# Predictions for the held-out test rows, scored in the next cell.
yPred = regressor.predict(xTest)

In [None]:
# Score the decision-tree predictions against the held-out targets.
mse, mae, r2 = (
    mean_squared_error(yTest, yPred),
    mean_absolute_error(yTest, yPred),
    r2_score(yTest, yPred),
)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 48.98635320663116
Mean Absolute Error: 4.22561384736243
R-squared: 0.2911158545838586


#### Random Forest

In [None]:
# Random forest baseline with an out-of-bag (OOB) score.
# The OOB estimate for a row uses only the trees whose bootstrap sample left
# that row out. With just 10 trees some rows are in-bag for every tree and get
# no OOB prediction, and scikit-learn warns that the estimate is unreliable.
# 100 trees (the library default) gives every row OOB coverage in practice.
# n_jobs=-1 builds the trees in parallel; results are still fixed by random_state.
regressor = RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    n_jobs=-1,
    random_state=4,
)
regressor.fit(xTrain, yTrain)

# Predictions for the held-out test rows, scored in the next cell.
yPred = regressor.predict(xTest)

  warn(


In [None]:
# Gather the forest's out-of-bag score together with the test-set metrics.
oob_score = regressor.oob_score_
mse, mae, r2 = (
    mean_squared_error(yTest, yPred),
    mean_absolute_error(yTest, yPred),
    r2_score(yTest, yPred),
)

print(f"Out-of-Bag Score: {oob_score}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")


Out-of-Bag Score: 0.41519586454757296
Mean Squared Error: 25.781607750377916
Mean Absolute Error: 3.296822125334026
R-squared: 0.6269129710372721


#### HistGradientBoostingRegressor

In [None]:
# Histogram-based gradient boosting with default hyperparameters and a fixed seed
# (fit() returns the estimator itself, so the calls can be chained).
hist = HistGradientBoostingRegressor(random_state=4).fit(xTrain, yTrain)

# Predictions for the held-out test rows, scored in the next cell.
yPred = hist.predict(xTest)

In [None]:
# Score the gradient-boosting predictions against the held-out targets.
r2 = r2_score(yTest, yPred)
mae = mean_absolute_error(yTest, yPred)
mse = mean_squared_error(yTest, yPred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 24.675458878726484
Mean Absolute Error: 3.1410955723336675
R-squared: 0.6429201107048461


#### Fine Tuning the Decision Tree Model

In [None]:
# Hyperparameter grid for the decision tree: 4 * 3 * 3 * 4 = 144 combinations.
parameters = dict(
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
)

# Exhaustive 5-fold cross-validated search over the grid (720 fits in total).
# The scorer is negated MSE because GridSearchCV picks the highest score.
regressor = DecisionTreeRegressor(random_state=4)
grid = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# NaN-free data for the grid search.
# Splitting df.dropna() on its own would produce a partition unrelated to the
# xTrain/xTest split made earlier, so rows of xTest could end up in the tuning
# training set and the tuned tree would be scored on data it was fitted on.
# Instead, keep the existing partition and drop incomplete rows within each
# side: the NaN-free training rows are a subset of xTrain and never touch xTest.
# x and y are deliberately left untouched so the earlier split stays reproducible.
noNa = df.dropna()

# Look the rows up in df so that "complete" is judged on all of df's columns.
trainNoNa = df.loc[xTrain.index].dropna()
testNoNa = df.loc[xTest.index].dropna()

xTrainNoNa = trainNoNa.drop(columns=["SALARY_WAGE", 'PROGRAM_AREA'])
yTrainNoNa = trainNoNa['SALARY_WAGE']
xTestNoNa = testNoNa.drop(columns=["SALARY_WAGE", 'PROGRAM_AREA'])
yTestNoNa = testNoNa['SALARY_WAGE']

In [None]:
# Run the grid search on the NaN-free training rows and keep the winning tree.
# fit() returns the search object itself, so best_estimator_ can be read directly.
bestregressor = grid.fit(xTrainNoNa, yTrainNoNa).best_estimator_


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Parameters (Random Search): {'criterion': 'squared_error', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [None]:
# Report the winning hyperparameters and predict with the tuned tree.
# Predictions are made on xTest (the test rows of the first split), so the
# scores in the next cell are computed against yTest like the other models.
bestParams = grid.best_params_
yPred = bestregressor.predict(xTest)

print(f"Best Parameters (Grid Search): {bestParams}")

Best Parameters (Grid Search): {'criterion': 'squared_error', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [None]:
# Score the tuned decision tree's predictions against the held-out targets.
mse, mae, r2 = (
    mean_squared_error(yTest, yPred),
    mean_absolute_error(yTest, yPred),
    r2_score(yTest, yPred),
)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 33.43054028046824
Mean Absolute Error: 3.3491955680573238
R-squared: 0.5162248580220576
