<a href="https://colab.research.google.com/github/IsaiahHanna/Career-Paths/blob/main/ModelSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Set Up Path and Directory

In [1]:
!git clone https://github.com/IsaiahHanna/Career-Paths.git
%cd Career-Paths

Cloning into 'Career-Paths'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (58/58), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 58 (delta 23), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (58/58), 14.08 MiB | 11.09 MiB/s, done.
Resolving deltas: 100% (23/23), done.
/content/Career-Paths


# Model Selection

#### Import Modules

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor,HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.preprocessing import MultiLabelBinarizer,StandardScaler
from sklearn.pipeline import Pipeline





#### Import Dataset

In [3]:
df = pd.read_csv("CareerPaths.csv")

In [4]:
stringCols = ['CREDENTIAL','GROUPBY','INSTITUTION','INSTITUTION_NAME','PROGRAM_AREA_NAME','PROGRAM_NAME']
for col in stringCols:
  if col == 'CREDENTIAL':
    encoded = pd.get_dummies(df[col], prefix=col)
    df = df.drop(col, axis=1)
    df = df.join(encoded)
  else:
    means = df.groupby(col)['SALARY_WAGE'].mean()
    df[col] = df[col].map(means)

for col in df.columns.tolist():
  if df[col].dtype == 'bool':
    df[col] = df[col].astype(int)

scaler = StandardScaler()

##### Split to testing and training sets

In [5]:
x = df.drop(columns = ["SALARY_WAGE",'PROGRAM_AREA'])
y = df['SALARY_WAGE']

In [6]:
xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=0.2,random_state = 42)

In [7]:
noNa = df.dropna()
x = noNa.drop(columns = ["SALARY_WAGE",'PROGRAM_AREA'])
y = noNa['SALARY_WAGE']
xTrainNoNa,xTestNoNa,yTrainNoNa,yTestNoNa = train_test_split(x,y,test_size=0.2,random_state = 42)
xTrainNoNa = scaler.fit_transform(xTrainNoNa)
xTestNoNa = scaler.transform(xTestNoNa)

### Test different models to see which is the best

#### Linear Regression

In [8]:
ln = LinearRegression()
ln.fit(xTrainNoNa,yTrainNoNa)
yPred = ln.predict(xTestNoNa)

In [9]:
mse = mean_squared_error(yTestNoNa,yPred)
mae = mean_absolute_error(yTestNoNa,yPred)
r2 = r2_score(yTestNoNa,yPred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 10.208633461725793
Mean Absolute Error: 2.4407418117071225
R-squared: 0.7890143098210977


#### Lasso Regression

In [10]:
lasso = Lasso(alpha=0.1)
lasso.fit(xTrainNoNa,yTrainNoNa)
yPred = lasso.predict(xTestNoNa)

In [11]:
mse = mean_squared_error(yTestNoNa,yPred)
mae = mean_absolute_error(yTestNoNa,yPred)
r2 = r2_score(yTestNoNa,yPred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 10.119444520265814
Mean Absolute Error: 2.394675503297902
R-squared: 0.7908576114187904


#### Ridge Regression

In [12]:
ridge = Ridge(alpha=0.1)
ridge.fit(xTrainNoNa,yTrainNoNa)
yPred = ridge.predict(xTestNoNa)

In [13]:
mse = mean_squared_error(yTestNoNa,yPred)
mae = mean_absolute_error(yTestNoNa,yPred)
r2 = r2_score(yTestNoNa,yPred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 10.20913721291095
Mean Absolute Error: 2.44088669648748
R-squared: 0.7890038986047603


#### Decision Tree

In [14]:
regressor = DecisionTreeRegressor(random_state = 4)
regressor.fit(xTrain,yTrain)
yPred = regressor.predict(xTest)

In [15]:
mse = mean_squared_error(yTest,yPred)
mae = mean_absolute_error(yTest,yPred)
r2 = r2_score(yTest,yPred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 14.463666115222205
Mean Absolute Error: 2.151268169660531
R-squared: 0.7906955116576067


##### Random Forest

In [16]:
regressor = RandomForestRegressor(n_estimators=10,oob_score=True,random_state=4)
regressor.fit(xTrain,yTrain)
yPred = regressor.predict(xTest)

  warn(


In [17]:
# Collect all relevant scores
oob_score = regressor.oob_score_
mse = mean_squared_error(yTest,yPred)
mae = mean_absolute_error(yTest,yPred)
r2 = r2_score(yTest,yPred)

print(f"Out-of-Bag Score: {oob_score}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")


Out-of-Bag Score: 0.696405879276258
Mean Squared Error: 8.162811892970302
Mean Absolute Error: 1.6959348497872133
R-squared: 0.8818755111544481


##### HistGradientBoostingRegressor

In [18]:
hist = HistGradientBoostingRegressor(random_state = 4)
hist.fit(xTrain,yTrain)
yPred = hist.predict(xTest)

In [19]:
mse = mean_squared_error(yTest,yPred)
mae = mean_absolute_error(yTest,yPred)
r2 = r2_score(yTest,yPred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 7.3841707356699215
Mean Absolute Error: 1.7067176641767332
R-squared: 0.8931432691165574


#### Fine Tuning the Decision Tree Model


In [20]:
regressor = DecisionTreeRegressor(random_state = 42)

parameters = {
    'max_depth': [2, 4, 6, 8, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid = GridSearchCV(estimator=regressor, param_grid=parameters,
                           cv=5, scoring='neg_mean_squared_error', n_jobs=-1)


In [21]:
grid.fit(xTrain,yTrain)

  _data = np.array(data, dtype=dtype, copy=copy,


In [22]:
bestmodel = grid.best_estimator_
bestParams = grid.best_params_
yPred = bestmodel.predict(xTest)

In [23]:
print(f"Best Parameters (Grid Search): {bestParams}")

mse = mean_squared_error(yTest,yPred)
mae = mean_absolute_error(yTest,yPred)
r2 = r2_score(yTest,yPred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Best Parameters (Grid Search): {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Mean Squared Error: 10.407957548194371
Mean Absolute Error: 1.8900851541166508
R-squared: 0.8493858879235676


#### Fine Tuning the Ridge Regression Model


In [24]:
ridge = Ridge()
rfe = RFE(estimator=ridge, n_features_to_select=10)
xRFE = rfe.fit_transform(xTrainNoNa,yTrainNoNa)



In [25]:
parameters = {
    'alpha': [0.001,0.01, 0.1, 1, 10, 100]
}

grid = GridSearchCV(estimator = ridge,param_grid = parameters,cv = 5)

In [26]:
grid.fit(xRFE,yTrainNoNa)


In [27]:
bestModel = grid.best_estimator_
bestParams = grid.best_params_
yPred = bestModel.predict(rfe.transform(xTestNoNa))
selectedFeatureNames = [feature for feature, selected in zip(df.columns.tolist(), rfe.support_) if selected]

print(f"Best Parameters (Grid Search): {bestParams}")
print(f"Selected Features: {selectedFeatureNames}")

Best Parameters (Grid Search): {'alpha': 0.001}
Selected Features: ['PROGRAM_AREA', 'PROGRAM_AREA_NAME', 'RESPONSE_RATE', 'AGE', 'FEMALE', 'EMPLOY_FULL_TIME', 'CREDENTIAL_Apprenticeship', 'CREDENTIAL_Associate Degree', "CREDENTIAL_Bachelor's Degree", 'CREDENTIAL_Certificate']


In [151]:
mse = mean_squared_error(yTestNoNa,yPred)
mae = mean_absolute_error(yTestNoNa,yPred)
r2 = r2_score(yTestNoNa,yPred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

Mean Squared Error: 7.189333143890943
Mean Absolute Error: 1.757206820536756
R-squared: 0.8514153318387961
