## Good Resource
- https://github.com/solegalli/feature-engineering-for-machine-learning/tree/main/Section-08-Categorical-Encoding-Basic

I have several other files that preprocess the Canadian data from the Stack Overflow surveys, which I use to train the model. However, I will not share them: that work was done as part of a college course, and I am concerned that other students might copy it.

In [334]:
import pandas as pd
import math
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import joblib

In [335]:
# Directory that holds CanadaData.csv; the trained model (Canada.joblib) is written here too.
# NOTE(review): paths are built as f"{basePath}/...", so an empty string resolves to the
# filesystem root (e.g. "/CanadaData.csv") — set this to the real data directory before running.
basePath = ""

In [336]:
# Load the Canadian salary rows used for training; the bare `data` on the
# last line renders the frame as the cell output.
csv_path = f"{basePath}/CanadaData.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,Company Size,Industry,Experience,Title,Country,City,Salary
0,10 to 19 employees,"Information Services, IT, Software Development...",5 to 9 years,Data scientist or machine learning specialist,Canada,Halifax,52046.0
1,100 to 499 employees,"Manufacturing, Transportation, or Supply Chain",10 or more years,"Developer, full-stack",Canada,Montreal,64686.0
2,20 to 99 employees,"Information Services, IT, Software Development...",2 to 4 years,"Developer, full-stack",Canada,Hamilton–Niagara Peninsula,59481.0
3,20 to 99 employees,"Manufacturing, Transportation, or Supply Chain",5 to 9 years,Data or business analyst,Canada,Halifax,63199.0
4,100 to 499 employees,"Information Services, IT, Software Development...",5 to 9 years,"Developer, front-end",Canada,Montreal,53533.0
...,...,...,...,...,...,...,...
7033,100 to 499 employees,"Information Services, IT, Software Development...",10 or more years,"Developer, front-end",Canada,Toronto,90000.0
7034,20 to 99 employees,"Information Services, IT, Software Development...",10 or more years,"Developer, desktop or enterprise applications",Canada,Toronto,90000.0
7035,100 to 499 employees,"Information Services, IT, Software Development...",10 or more years,"Developer, front-end",Canada,Toronto,90000.0
7036,20 to 99 employees,"Information Services, IT, Software Development...",10 or more years,"Developer, front-end",Canada,Winnipeg,70000.0


In [337]:
# Look at the recorded salaries for the City / Title / Experience combination
# used by the first row of the custom test input defined further down.
in_montreal = data["City"] == "Montreal"
is_full_stack = data["Title"] == "Developer, full-stack"
is_junior = data["Experience"] == "0 to 1 years"
data[in_montreal & is_full_stack & is_junior]

Unnamed: 0,Company Size,Industry,Experience,Title,Country,City,Salary
2878,100 to 499 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, full-stack",Canada,Montreal,31750.0
4353,20 to 99 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, full-stack",Canada,Montreal,32001.0
4542,20 to 99 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, full-stack",Canada,Montreal,32062.0
6117,20 to 99 employees,Healthcare,0 to 1 years,"Developer, full-stack",Canada,Montreal,32209.0


## Testing Input for Models

In [338]:
# Hand-built input for sanity-checking the models: one row per experience
# bracket, with every other feature held constant so that only the effect of
# experience shows up in the predictions.
Experience = ['0 to 1 years', '2 to 4 years', '5 to 9 years', '10 or more years']
row_count = len(Experience)

Company_Size = ['500 to 999 employees'] * row_count
Industry = ['Information Services, IT, Software Development, or other Technology'] * row_count
Title = ['Developer, full-stack'] * row_count
City = ['Montreal'] * row_count

# Key order fixes the column order of the resulting frame.
testData = {
    'Company Size': Company_Size,
    'Experience': Experience,
    'Industry': Industry,
    'Title': Title,
    'City': City,
}

testDF = pd.DataFrame(testData)
testDF

Unnamed: 0,Company Size,Experience,Industry,Title,City
0,500 to 999 employees,0 to 1 years,"Information Services, IT, Software Development...","Developer, full-stack",Montreal
1,500 to 999 employees,2 to 4 years,"Information Services, IT, Software Development...","Developer, full-stack",Montreal
2,500 to 999 employees,5 to 9 years,"Information Services, IT, Software Development...","Developer, full-stack",Montreal
3,500 to 999 employees,10 or more years,"Information Services, IT, Software Development...","Developer, full-stack",Montreal


## Creating the training and testing dataset

In [339]:
# Drop the Country column: all of the data is from Canada, which makes this column redundant.
# errors="ignore" keeps the cell re-runnable — once the column is gone, running the
# cell again is a no-op instead of raising a KeyError.

data = data.drop(columns=["Country"], errors="ignore")

In [340]:
def GetTrainingData(df=None, target="Salary", test_size=0.2, random_state=42):
  """Split a frame into train/test predictors and targets.

  Called with no arguments it behaves exactly as before: the notebook-level
  ``data`` frame is split 80/20 on the "Salary" column with a fixed seed.

  Parameters
  ----------
  df : pandas.DataFrame, optional
      Frame to split. When omitted, the notebook-level ``data`` frame is used
      (looked up at call time, so later changes to ``data`` are picked up).
  target : str, default "Salary"
      Name of the column to predict; every other column is a predictor.
  test_size : float, default 0.2
      Fraction of the observations placed in the test set.
  random_state : int, default 42
      Seed passed to ``train_test_split`` so the split is reproducible.

  Returns
  -------
  tuple
      ``(X_train, X_test, y_train, y_test)`` as produced by ``train_test_split``.
  """
  if df is None:
    df = data  # fall back to the frame loaded earlier in the notebook

  X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[target]),  # predictors
    df[target],  # target
    test_size=test_size,  # percentage of obs in test set
    random_state=random_state,  # seed to ensure reproducibility
  )

  return (X_train, X_test, y_train, y_test)

### Column Transformer

In [341]:
# Ordered categories, listed from smallest to largest, so the ordinal codes
# follow the natural ranking of each feature.
CompanySizeOrderedCategories = [
    '2 to 9 employees',
    '10 to 19 employees',
    '20 to 99 employees',
    '100 to 499 employees',
    '500 to 999 employees',
    '1,000 to 4,999 employees',
    '5,000 to 9,999 employees',
    '10,000 or more employees',
]
ExperienceOrderedCategories = [
    '0 to 1 years',
    '2 to 4 years',
    '5 to 9 years',
    '10 or more years',
]

# Industry, Title and City have no ranking, so they are one-hot encoded.
nominal_encoder = OneHotEncoder(
    categories="auto",
    drop=None,
    sparse_output=False,
    handle_unknown="infrequent_if_exist",
)

# Any column not listed below is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('oe_CompanySize', OrdinalEncoder(categories=[CompanySizeOrderedCategories]), ['Company Size']),
        ('oe_Experience', OrdinalEncoder(categories=[ExperienceOrderedCategories]), ['Experience']),
        ('categorical', nominal_encoder, ["Industry", "Title", "City"]),
    ],
    remainder="passthrough",
)

# Ask for pandas output from transform.
transformer.set_output(transform="pandas")

## Linear Regression: Pipeline Model Training - Without Scaling

### Get Training Data

In [None]:
# GetTrainingData uses a fixed random_state, so every model section works on the same split.
X_train, X_test, y_train, y_test = GetTrainingData()

### Column Transformer

In [None]:
# Same definition as the "Column Transformer" cell earlier in the notebook;
# rebuilding it here gives this section a fresh, unfitted transformer.

# Ordered categories, listed from smallest to largest, so the ordinal codes
# follow the natural ranking of each feature.
CompanySizeOrderedCategories = [
    '2 to 9 employees',
    '10 to 19 employees',
    '20 to 99 employees',
    '100 to 499 employees',
    '500 to 999 employees',
    '1,000 to 4,999 employees',
    '5,000 to 9,999 employees',
    '10,000 or more employees',
]
ExperienceOrderedCategories = [
    '0 to 1 years',
    '2 to 4 years',
    '5 to 9 years',
    '10 or more years',
]

# Industry, Title and City have no ranking, so they are one-hot encoded.
nominal_encoder = OneHotEncoder(
    categories="auto",
    drop=None,
    sparse_output=False,
    handle_unknown="infrequent_if_exist",
)

# Any column not listed below is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('oe_CompanySize', OrdinalEncoder(categories=[CompanySizeOrderedCategories]), ['Company Size']),
        ('oe_Experience', OrdinalEncoder(categories=[ExperienceOrderedCategories]), ['Experience']),
        ('categorical', nominal_encoder, ["Industry", "Title", "City"]),
    ],
    remainder="passthrough",
)

# Ask for pandas output from transform.
transformer.set_output(transform="pandas")

In [None]:
# Chain the column transformer and the estimator so raw feature frames can be
# handed straight to fit / predict.
pipeline = Pipeline(
    steps=[
        ('preprocessor', transformer),
        ('regressor', LinearRegression()),
    ]
)

In [None]:
# Fit the preprocessor and the regressor on the training split (raw salaries as target).
pipeline.fit(X_train, y_train)

In [None]:
# Predicted salaries for the held-out test predictors.
predictions = pipeline.predict(X_test)

In [None]:
# Put actual and predicted salaries next to the test predictors, on a copy so
# X_test itself stays untouched.
Copy_X_test = X_test.copy()
Copy_X_test["Salary"] = y_test
Copy_X_test["Predicted Salary"] = predictions

# Combine both conditions into a single mask. Chaining df[mask1][mask2] applies
# the second mask (built on the full frame) to the already-filtered frame, which
# makes pandas re-align it and warn "Boolean Series key will be reindexed".
is_junior = Copy_X_test["Experience"] == "0 to 1 years"
in_montreal = Copy_X_test["City"] == "Montreal"
Copy_X_test.loc[is_junior & in_montreal]

  Copy_X_test[Copy_X_test["Experience"]=="0 to 1 years"][Copy_X_test["City"]=="Montreal"]


Unnamed: 0,Company Size,Industry,Experience,Title,City,Salary,Predicted Salary
2876,100 to 499 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, back-end",Montreal,31750.0,19596.0
381,"10,000 or more employees",Healthcare,0 to 1 years,System administrator,Montreal,37176.0,24880.0
6632,100 to 499 employees,"Information Services, IT, Software Development...",0 to 1 years,Data or business analyst,Montreal,35000.0,20116.0
4353,20 to 99 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, full-stack",Montreal,32001.0,19160.0
3528,2 to 9 employees,"Manufacturing, Transportation, or Supply Chain",0 to 1 years,"Developer, front-end",Montreal,31765.0,16020.0
5780,"10,000 or more employees","Information Services, IT, Software Development...",0 to 1 years,"Developer, game or graphics",Montreal,36235.0,25588.0


### Analysing the Model Accuracy

In [None]:
# Root mean squared error on the test split (square root of the MSE).
mse = mean_squared_error(y_test, predictions)
math.sqrt(mse)

8273.561794749328

In [None]:
# Mean absolute error on the test split.
mae = mean_absolute_error(y_test, predictions)
mae

6247.286867252067

### Testing the Model With Custom Input

In [None]:
# Predictions for the four hand-built rows in testDF (one per experience bracket).
pipeline.predict(testDF)

array([19868., 35072., 50272., 65472.])

In [None]:
# NOTE(review): 1.34 looks like a currency conversion factor and 12*4*5*8 (= 1920) like
# working hours per year (12 months x 4 weeks x 5 days x 8 hours), i.e. this turns a
# yearly salary into an hourly rate — confirm both assumptions.
(pipeline.predict(testDF)*1.34)/(12*4*5*8)

array([13.86620833, 24.47733333, 35.08566667, 45.694     ])

## Linear Regression: Pipeline Model Training - With Standardization

### Get Training Data

In [None]:
# GetTrainingData uses a fixed random_state, so this is the same split as in the other sections.
X_train, X_test, y_train, y_test = GetTrainingData()

In [None]:
# Standardize the target (salary) only; the predictors are left to the pipeline.
scaler = StandardScaler()

# The scaler works on 2-D input, so each salary Series becomes a single column.
y_train_column = y_train.values.reshape(-1, 1)
y_test_column = y_test.values.reshape(-1, 1)

# Learn the scaling from the training salaries ...
y_train_scaled = scaler.fit_transform(y_train_column)

# ... and apply that same fitted scaler to the test salaries (no refit).
y_test_scaled = scaler.transform(y_test_column)
y_train_scaled

array([[ 0.69037459],
       [-0.19544127],
       [-1.30216224],
       ...,
       [-1.06646392],
       [-1.77711343],
       [-0.74017673]])

In [None]:
# Same preprocessing + linear regression chain as before; only the target
# passed to fit differs in this section.
pipeline = Pipeline(
    steps=[
        ('preprocessor', transformer),
        ('regressor', LinearRegression()),
    ]
)

In [None]:
# The target is the standardized single-column salary array, so the model predicts in scaled units.
pipeline.fit(X_train, y_train_scaled)

In [None]:
# Predict on the test predictors; the outputs are in the standardized target space
y_pred_scaled = pipeline.predict(X_test)

# Inverse transform the predictions to get them back to the original salary scale
y_pred = scaler.inverse_transform(y_pred_scaled)

In [None]:
# Put actual and predicted salaries next to the test predictors, on a copy so
# X_test itself stays untouched.
Copy_X_test = X_test.copy()
Copy_X_test["Salary"] = y_test
# y_pred comes out of scaler.inverse_transform as a single-column 2-D array;
# flatten it so the new column receives one value per row explicitly.
Copy_X_test["Predicted Salary"] = y_pred.ravel()

# Combine both conditions into a single mask. Chaining df[mask1][mask2] applies
# the second mask (built on the full frame) to the already-filtered frame, which
# makes pandas re-align it and warn "Boolean Series key will be reindexed".
is_junior = Copy_X_test["Experience"] == "0 to 1 years"
in_montreal = Copy_X_test["City"] == "Montreal"
Copy_X_test.loc[is_junior & in_montreal]

  Copy_X_test[Copy_X_test["Experience"]=="0 to 1 years"][Copy_X_test["City"]=="Montreal"]


Unnamed: 0,Company Size,Industry,Experience,Title,City,Salary,Predicted Salary
2876,100 to 499 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, back-end",Montreal,31750.0,19622.254576
381,"10,000 or more employees",Healthcare,0 to 1 years,System administrator,Montreal,37176.0,24899.92474
6632,100 to 499 employees,"Information Services, IT, Software Development...",0 to 1 years,Data or business analyst,Montreal,35000.0,20121.998565
4353,20 to 99 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, full-stack",Montreal,32001.0,19159.874624
3528,2 to 9 employees,"Manufacturing, Transportation, or Supply Chain",0 to 1 years,"Developer, front-end",Montreal,31765.0,16021.295553
5780,"10,000 or more employees","Information Services, IT, Software Development...",0 to 1 years,"Developer, game or graphics",Montreal,36235.0,25563.136389


### Analysing the Model Accuracy

In [None]:
# Root mean squared error on the test split, computed on the inverse-transformed predictions.
mse = mean_squared_error(y_test, y_pred)
math.sqrt(mse)

8274.63960297443

In [None]:
# Mean absolute error on the test split, computed on the inverse-transformed predictions.
mae = mean_absolute_error(y_test, y_pred)
mae

6247.812950748694

### Testing the Model With Custom Input

In [None]:
# Predict the four testDF rows and map the scaled outputs back through the fitted scaler.
scaler.inverse_transform(pipeline.predict(testDF))

array([[19893.1438412 ],
       [35086.29520713],
       [50284.11707763],
       [65477.26844356]])

In [None]:
# NOTE(review): 1.34 looks like a currency conversion factor and 12*4*5*8 (= 1920) like
# working hours per year (12 months x 4 weeks x 5 days x 8 hours), i.e. this turns a
# yearly salary into an hourly rate — confirm both assumptions.
(scaler.inverse_transform(pipeline.predict(testDF))*1.34)/(12*4*5*8)

array([[13.88375664],
       [24.4873102 ],
       [35.09412338],
       [45.69767693]])

## Linear Regression: Pipeline Model Training - With MinMax

### Get Training Data

In [None]:
# GetTrainingData uses a fixed random_state, so this is the same split as in the other sections.
X_train, X_test, y_train, y_test = GetTrainingData()

In [None]:
# Initialize the MinMaxScaler (used on the salary target only)
scaler = MinMaxScaler()

# Fit on the training salaries and transform them; reshape(-1, 1) turns the Series into a single column
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))

# Transform the test salaries using the same fitted scaler (no refit)
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))
y_train_scaled

array([[0.67441884],
       [0.50072773],
       [0.28372145],
       ...,
       [0.32993727],
       [0.19059284],
       [0.39391579]])

In [None]:
# Same preprocessing + linear regression chain as before; only the target
# passed to fit differs in this section.
pipeline = Pipeline(
    steps=[
        ('preprocessor', transformer),
        ('regressor', LinearRegression()),
    ]
)

In [None]:
# The target is the min-max scaled single-column salary array, so the model predicts in scaled units.
pipeline.fit(X_train, y_train_scaled)

In [None]:
# Predict on the test predictors; the outputs are in the min-max scaled target space
y_pred_scaled = pipeline.predict(X_test)

# Inverse transform the predictions to get them back to the original salary scale
y_pred = scaler.inverse_transform(y_pred_scaled)

In [None]:
# Put actual and predicted salaries next to the test predictors, on a copy so
# X_test itself stays untouched.
Copy_X_test = X_test.copy()
Copy_X_test["Salary"] = y_test
# y_pred comes out of scaler.inverse_transform as a single-column 2-D array;
# flatten it so the new column receives one value per row explicitly.
Copy_X_test["Predicted Salary"] = y_pred.ravel()

# Combine both conditions into a single mask. Chaining df[mask1][mask2] applies
# the second mask (built on the full frame) to the already-filtered frame, which
# makes pandas re-align it and warn "Boolean Series key will be reindexed".
is_junior = Copy_X_test["Experience"] == "0 to 1 years"
in_montreal = Copy_X_test["City"] == "Montreal"
Copy_X_test.loc[is_junior & in_montreal]

  Copy_X_test[Copy_X_test["Experience"]=="0 to 1 years"][Copy_X_test["City"]=="Montreal"]


Unnamed: 0,Company Size,Industry,Experience,Title,City,Salary,Predicted Salary
2876,100 to 499 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, back-end",Montreal,31750.0,19616.472656
381,"10,000 or more employees",Healthcare,0 to 1 years,System administrator,Montreal,37176.0,24886.500732
6632,100 to 499 employees,"Information Services, IT, Software Development...",0 to 1 years,Data or business analyst,Montreal,35000.0,20122.633545
4353,20 to 99 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, full-stack",Montreal,32001.0,19151.995605
3528,2 to 9 employees,"Manufacturing, Transportation, or Supply Chain",0 to 1 years,"Developer, front-end",Montreal,31765.0,16025.707764
5780,"10,000 or more employees","Information Services, IT, Software Development...",0 to 1 years,"Developer, game or graphics",Montreal,36235.0,25577.261475


### Analysing the Model Accuracy

In [None]:
# Root mean squared error on the test split, computed on the inverse-transformed predictions.
mse = mean_squared_error(y_test, y_pred)
math.sqrt(mse)

8274.833738640984

In [None]:
# Mean absolute error on the test split, computed on the inverse-transformed predictions.
mae = mean_absolute_error(y_test, y_pred)
mae

6248.57634919537

### Testing the Model With Custom Input

In [None]:
# Predict the four testDF rows and map the scaled outputs back through the fitted scaler.
scaler.inverse_transform(pipeline.predict(testDF))

array([[19884.44018555],
       [35081.17651367],
       [50277.9128418 ],
       [65474.64916992]])

In [None]:
# NOTE(review): 1.34 looks like a currency conversion factor and 12*4*5*8 (= 1920) like
# working hours per year (12 months x 4 weeks x 5 days x 8 hours), i.e. this turns a
# yearly salary into an hourly rate — confirm both assumptions.
(scaler.inverse_transform(pipeline.predict(testDF))*1.34)/(12*4*5*8)

array([[13.87768221],
       [24.48373778],
       [35.08979334],
       [45.6958489 ]])

## Random Forest: Pipeline Model Training - Without Scaling

### Get Training Data

In [342]:
# GetTrainingData uses a fixed random_state, so this is the same split as in the other sections.
X_train, X_test, y_train, y_test = GetTrainingData()

In [343]:
# Same preprocessing as the linear models, followed by a random forest.
# random_state is fixed so the fitted forest is reproducible.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(
    steps=[
        ('preprocessor', transformer),
        ('regressor', forest),
    ]
)

In [344]:
# Fit the preprocessor and the random forest on the training split (raw salaries as target).
pipeline.fit(X_train, y_train)

In [345]:
# Predicted salaries for the held-out test predictors.
predictions = pipeline.predict(X_test)

In [347]:
# Put actual and predicted salaries next to the test predictors, on a copy so
# X_test itself stays untouched.
Copy_X_test = X_test.copy()
Copy_X_test["Salary"] = y_test
Copy_X_test["Predicted Salary"] = predictions

# Combine both conditions into a single mask. Chaining df[mask1][mask2] applies
# the second mask (built on the full frame) to the already-filtered frame, which
# makes pandas re-align it and warn "Boolean Series key will be reindexed".
# This section looks at Toronto rows (the linear-regression sections filter on Montreal).
is_junior = Copy_X_test["Experience"] == "0 to 1 years"
in_toronto = Copy_X_test["City"] == "Toronto"
Copy_X_test.loc[is_junior & in_toronto]

  Copy_X_test[Copy_X_test["Experience"]=="0 to 1 years"][Copy_X_test["City"]=="Toronto"]


Unnamed: 0,Company Size,Industry,Experience,Title,City,Salary,Predicted Salary
6191,2 to 9 employees,Advertising Services,0 to 1 years,"Developer, mobile",Toronto,29390.0,28621.94
5632,20 to 99 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, full-stack",Toronto,28182.0,28955.262
3328,10 to 19 employees,"Information Services, IT, Software Development...",0 to 1 years,Academic researcher,Toronto,28128.0,36809.75
3297,"1,000 to 4,999 employees","Information Services, IT, Software Development...",0 to 1 years,"Developer, full-stack",Toronto,28320.0,29714.512
5313,2 to 9 employees,"Information Services, IT, Software Development...",0 to 1 years,"Developer, front-end",Toronto,28182.0,27835.61
1452,2 to 9 employees,"Manufacturing, Transportation, or Supply Chain",0 to 1 years,"Developer, back-end",Toronto,28116.0,29204.43
904,2 to 9 employees,"Information Services, IT, Software Development...",0 to 1 years,System administrator,Toronto,46850.0,28968.89
1244,"5,000 to 9,999 employees","Information Services, IT, Software Development...",0 to 1 years,Scientist,Toronto,35138.0,38218.56


### Analysing the Model Accuracy

In [348]:
# Root mean squared error on the test split (square root of the MSE).
mse = mean_squared_error(y_test, predictions)
math.sqrt(mse)

6397.078874817183

In [349]:
# Mean absolute error on the test split.
mae = mean_absolute_error(y_test, predictions)
mae

4365.896695222123

### Testing the Model With Custom Input

In [350]:
# Predictions for the four hand-built rows in testDF (one per experience bracket).
pipeline.predict(testDF)

array([32172.69      , 41072.76860714, 50590.59674334, 59146.80536233])

In [351]:
# NOTE(review): 1.34 looks like a currency conversion factor and 12*4*5*8 (= 1920) like
# working hours per year (12 months x 4 weeks x 5 days x 8 hours), i.e. this turns a
# yearly salary into an hourly rate — confirm both assumptions.
(pipeline.predict(testDF)*1.34)/(12*4*5*8)

array([22.45385656, 28.66536976, 35.30802064, 41.27954124])

## Output

In [None]:
# Persist whatever is currently bound to `pipeline`; when the notebook is run
# top to bottom that is the random-forest pipeline fitted in the last section.
model_path = f"{basePath}/Canada.joblib"
joblib.dump(pipeline, model_path)