## Good Resource
- https://github.com/solegalli/feature-engineering-for-machine-learning/tree/main/Section-08-Categorical-Encoding-Basic

I have several other files that preprocess the Canadian data from the Stack Overflow surveys, which I use to train the model. However, I will not share them: the work was done for a college course, and I am concerned that publishing them could enable cheating.

In [334]:
import math
import os

import joblib
import pandas as pd
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,MinMaxScaler

In [335]:
# Directory prefix for the input CSV that is read below and for the model
# file that is written at the end of the notebook.
basePath = ""

In [None]:
# Load the dataset that every model in this notebook is trained on.
# os.path.join is used instead of string interpolation so that an empty
# basePath resolves to the working directory rather than to
# "/CanadaData.csv" at the filesystem root.
data = pd.read_csv(os.path.join(basePath, "CanadaData.csv"))
# Bare expression: shows the frame as the cell output.
data

In [None]:
# Preview the rows for Montreal full-stack developers in the lowest
# experience bracket.
in_montreal = data["City"] == "Montreal"
is_full_stack = data["Title"] == "Developer, full-stack"
is_entry_level = data["Experience"] == "0 to 1 years"
data[in_montreal & is_full_stack & is_entry_level]

## Testing Input for Models

In [None]:
# Hand-built probe rows: the same company size, industry, title and city in
# every row, with one row per experience bracket.
Experience = ['0 to 1 years', '2 to 4 years', '5 to 9 years', '10 or more years']
row_count = len(Experience)

Company_Size = ['500 to 999 employees' for _ in range(row_count)]
Industry = [
    'Information Services, IT, Software Development, or other Technology'
    for _ in range(row_count)
]
Title = ['Developer, full-stack' for _ in range(row_count)]
City = ['Montreal' for _ in range(row_count)]

# Column order matches the feature columns the models are trained on.
testData = dict(
    zip(
        ['Company Size', 'Experience', 'Industry', 'Title', 'City'],
        [Company_Size, Experience, Industry, Title, City],
    )
)

testDF = pd.DataFrame(data=testData)
testDF

## Creating the training and testing dataset

In [339]:
# Drop the Country column: every row is from Canada, which makes the column redundant.

data = data.drop(columns=["Country"])

In [340]:
def GetTrainingData(test_size=0.2, random_state=42):
    """Split the module-level ``data`` frame into training and test sets.

    Args:
        test_size: Fraction of the rows held out for the test set.
        random_state: Seed forwarded to ``train_test_split`` so the split
            is reproducible.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``; the X parts hold
        every column except "Salary" and the y parts hold "Salary".
    """
    predictors = data.drop(columns=["Salary"])
    target = data["Salary"]
    X_train, X_test, y_train, y_test = train_test_split(
        predictors,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return (X_train, X_test, y_train, y_test)

### Column Transformer

In [None]:
# Category lists written from smallest to largest so the ordinal codes
# follow the natural ranking of each column.
CompanySizeOrderedCategories = [
    '2 to 9 employees',
    '10 to 19 employees',
    '20 to 99 employees',
    '100 to 499 employees',
    '500 to 999 employees',
    '1,000 to 4,999 employees',
    '5,000 to 9,999 employees',
    '10,000 or more employees',
]
ExperienceOrderedCategories = [
    '0 to 1 years',
    '2 to 4 years',
    '5 to 9 years',
    '10 or more years',
]

# One encoder per column group: ordinal for the two ranked columns,
# one-hot for the nominal ones.
company_size_encoder = OrdinalEncoder(categories=[CompanySizeOrderedCategories])
experience_encoder = OrdinalEncoder(categories=[ExperienceOrderedCategories])
nominal_encoder = OneHotEncoder(
    categories="auto",
    drop=None,
    sparse_output=False,
    handle_unknown="infrequent_if_exist",
)

# Columns not named here are passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('oe_CompanySize', company_size_encoder, ['Company Size']),
        ('oe_Experience', experience_encoder, ['Experience']),
        ('categorical', nominal_encoder, ["Industry", "Title", "City"]),
    ],
    remainder="passthrough",
)

# Have transform() return DataFrames; as the last expression of the cell this
# also displays the transformer.
transformer.set_output(transform="pandas")

## Linear Regression: Pipeline Model Training - Without Scaling

### Get Training Data

In [None]:
# Build the train/test split used by this section.
X_train, X_test, y_train, y_test = GetTrainingData()

### Column Transformer

In [None]:
# NOTE: this cell repeats, token for token, the transformer definition from
# the earlier "Column Transformer" cell; running either one is sufficient.

# Category lists ordered from smallest to largest so the ordinal codes follow
# the natural ranking of each column.
CompanySizeOrderedCategories = ['2 to 9 employees', '10 to 19 employees', '20 to 99 employees',
     '100 to 499 employees', '500 to 999 employees', '1,000 to 4,999 employees',
     '5,000 to 9,999 employees', '10,000 or more employees']

ExperienceOrderedCategories = ['0 to 1 years', '2 to 4 years', '5 to 9 years','10 or more years']
# Ordinal-encode the two ranked columns, one-hot encode the nominal columns,
# and pass every other column through unchanged.
# NOTE(review): no min_frequency/max_categories is set, so per the
# scikit-learn docs "infrequent_if_exist" should behave like
# handle_unknown="ignore" here - confirm against the installed version.
transformer = ColumnTransformer(
    transformers=[
        ('oe_CompanySize', OrdinalEncoder(categories=[CompanySizeOrderedCategories]),['Company Size']),
        ('oe_Experience', OrdinalEncoder(categories=[ExperienceOrderedCategories]), ['Experience']),
        ('categorical', OneHotEncoder(categories="auto",drop=None,sparse_output=False,handle_unknown="infrequent_if_exist"), ["Industry","Title","City"])
    ],remainder="passthrough")

# Have transform() return DataFrames; as the last expression of the cell this
# also displays the transformer.
transformer.set_output(transform="pandas")

In [None]:
# Preprocessing followed by a plain linear regression on the raw salary.
model_steps = [
    ('preprocessor', transformer),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=model_steps)

In [None]:
# Fit the encoders and the regressor on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predict salaries for the held-out test rows.
predictions = pipeline.predict(X_test)

In [None]:
# Put the true and the predicted salary next to the test features so they
# can be inspected side by side.
Copy_X_test = X_test.copy()
Copy_X_test["Salary"] = y_test
Copy_X_test["Predicted Salary"] = predictions
# Use one combined mask: chaining two boolean selections makes pandas
# re-align the second mask against the already-filtered frame and emit a
# "Boolean Series key will be reindexed" UserWarning.
Copy_X_test[(Copy_X_test["Experience"] == "0 to 1 years") & (Copy_X_test["City"] == "Montreal")]

### Analysing the Model Accuracy

In [None]:
# Root-mean-squared error on the test split (square root of the MSE).
mse = mean_squared_error(y_test, predictions)
math.sqrt(mse)

In [None]:
# Mean absolute error on the test split.
mae = mean_absolute_error(y_test, predictions)
mae

### Testing the Model With Custom Input

In [None]:
# Predictions for the hand-built rows in testDF (one per experience bracket).
pipeline.predict(testDF)

In [None]:
# NOTE(review): 1.34 looks like a currency conversion factor and
# 12*4*5*8 (= 1920) like months * weeks * days * hours, which would turn a
# yearly salary into an hourly rate - confirm. 12*4 counts 48 weeks per
# year, not 52.
(pipeline.predict(testDF)*1.34)/(12*4*5*8)

## Linear Regression: Pipeline Model Training - With Standardization

### Get Training Data

In [None]:
# Build the train/test split used by this section.
X_train, X_test, y_train, y_test = GetTrainingData()

In [None]:
# Initialize the StandardScaler; it is applied to the target (salary), not to
# the features.
scaler = StandardScaler()

# Fit on the training target only, reshaped to a single column because the
# scaler expects 2-D input.
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))

# Transform the test target with the statistics learned from the training
# target. NOTE: y_test_scaled is not used by the cells below.
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))
y_train_scaled

In [None]:
# Same preprocessing + linear regression pipeline; only the target it is
# fitted on differs in this section.
model_steps = [
    ('preprocessor', transformer),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=model_steps)

In [None]:
# Fit against the scaled target instead of the raw salary.
pipeline.fit(X_train, y_train_scaled)

In [None]:
# Predict on the test features; the model was fitted on the scaled target, so
# its output is in scaled units.
y_pred_scaled = pipeline.predict(X_test)

# Inverse transform the predictions to get them back to the original scale
y_pred = scaler.inverse_transform(y_pred_scaled)

In [None]:
# Put the true and the predicted salary next to the test features so they
# can be inspected side by side.
Copy_X_test = X_test.copy()
Copy_X_test["Salary"] = y_test
Copy_X_test["Predicted Salary"] = y_pred
# Use one combined mask: chaining two boolean selections makes pandas
# re-align the second mask against the already-filtered frame and emit a
# "Boolean Series key will be reindexed" UserWarning.
Copy_X_test[(Copy_X_test["Experience"] == "0 to 1 years") & (Copy_X_test["City"] == "Montreal")]

### Analysing the Model Accuracy

In [None]:
# Root-mean-squared error on the test split, computed after mapping the
# predictions back to the original salary scale.
mse = mean_squared_error(y_test, y_pred)
math.sqrt(mse)

In [None]:
# Mean absolute error on the test split, in original salary units.
mae = mean_absolute_error(y_test, y_pred)
mae

### Testing the Model With Custom Input

In [None]:
# Predict for the hand-built rows in testDF and map the scaled output back to
# the original salary scale.
scaler.inverse_transform(pipeline.predict(testDF))

In [None]:
# NOTE(review): 1.34 looks like a currency conversion factor and
# 12*4*5*8 (= 1920) like months * weeks * days * hours, which would turn a
# yearly salary into an hourly rate - confirm. 12*4 counts 48 weeks per
# year, not 52.
(scaler.inverse_transform(pipeline.predict(testDF))*1.34)/(12*4*5*8)

## Linear Regression: Pipeline Model Training - With MinMax

### Get Training Data

In [None]:
# Build the train/test split used by this section.
X_train, X_test, y_train, y_test = GetTrainingData()

In [None]:
# Initialize the MinMaxScaler; it is applied to the target (salary), not to
# the features.
scaler = MinMaxScaler()

# Fit on the training target only, reshaped to a single column because the
# scaler expects 2-D input.
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))

# Transform the test target with the range learned from the training target.
# NOTE: y_test_scaled is not used by the cells below.
y_test_scaled = scaler.transform(y_test.values.reshape(-1, 1))
y_train_scaled

In [None]:
# Same preprocessing + linear regression pipeline; only the target it is
# fitted on differs in this section.
model_steps = [
    ('preprocessor', transformer),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=model_steps)

In [None]:
# Fit against the min-max scaled target instead of the raw salary.
pipeline.fit(X_train, y_train_scaled)

In [None]:
# Predict on the test features; the model was fitted on the min-max scaled
# target, so its output is in scaled units.
y_pred_scaled = pipeline.predict(X_test)

# Inverse transform the predictions to get them back to the original scale
y_pred = scaler.inverse_transform(y_pred_scaled)

In [None]:
# Put the true and the predicted salary next to the test features so they
# can be inspected side by side.
Copy_X_test = X_test.copy()
Copy_X_test["Salary"] = y_test
Copy_X_test["Predicted Salary"] = y_pred
# Use one combined mask: chaining two boolean selections makes pandas
# re-align the second mask against the already-filtered frame and emit a
# "Boolean Series key will be reindexed" UserWarning.
Copy_X_test[(Copy_X_test["Experience"] == "0 to 1 years") & (Copy_X_test["City"] == "Montreal")]

### Analysing the Model Accuracy

In [None]:
# Root-mean-squared error on the test split, computed after mapping the
# predictions back to the original salary scale.
mse = mean_squared_error(y_test, y_pred)
math.sqrt(mse)

In [None]:
# Mean absolute error on the test split, in original salary units.
mae = mean_absolute_error(y_test, y_pred)
mae

### Testing the Model With Custom Input

In [None]:
# Predict for the hand-built rows in testDF and map the scaled output back to
# the original salary scale.
scaler.inverse_transform(pipeline.predict(testDF))

In [None]:
# NOTE(review): 1.34 looks like a currency conversion factor and
# 12*4*5*8 (= 1920) like months * weeks * days * hours, which would turn a
# yearly salary into an hourly rate - confirm. 12*4 counts 48 weeks per
# year, not 52.
(scaler.inverse_transform(pipeline.predict(testDF))*1.34)/(12*4*5*8)

## Random Forest: Pipeline Model Training - Without Scaling

### Get Training Data

In [342]:
# Build the train/test split used by this section.
X_train, X_test, y_train, y_test = GetTrainingData()

In [343]:
# Preprocessing followed by a 100-tree random forest; the fixed seed keeps
# the fitted forest reproducible.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model_steps = [
    ('preprocessor', transformer),
    ('regressor', forest),
]
pipeline = Pipeline(steps=model_steps)

In [None]:
# Fit the encoders and the random forest on the training split (raw salary
# target, no scaling).
pipeline.fit(X_train, y_train)

In [345]:
# Predict salaries for the held-out test rows.
predictions = pipeline.predict(X_test)

In [None]:
# Put the true and the predicted salary next to the test features so they
# can be inspected side by side.
Copy_X_test = X_test.copy()
Copy_X_test["Salary"] = y_test
Copy_X_test["Predicted Salary"] = predictions
# Use one combined mask: chaining two boolean selections makes pandas
# re-align the second mask against the already-filtered frame and emit a
# "Boolean Series key will be reindexed" UserWarning.
Copy_X_test[(Copy_X_test["Experience"] == "0 to 1 years") & (Copy_X_test["City"] == "Toronto")]

### Analysing the Model Accuracy

In [None]:
# Root-mean-squared error on the test split (square root of the MSE).
mse = mean_squared_error(y_test, predictions)
math.sqrt(mse)

In [None]:
# Mean absolute error on the test split.
mae = mean_absolute_error(y_test, predictions)
mae

### Testing the Model With Custom Input

In [None]:
# Predictions for the hand-built rows in testDF (one per experience bracket).
pipeline.predict(testDF)

In [None]:
# NOTE(review): 1.34 looks like a currency conversion factor and
# 12*4*5*8 (= 1920) like months * weeks * days * hours, which would turn a
# yearly salary into an hourly rate - confirm. 12*4 counts 48 weeks per
# year, not 52.
(pipeline.predict(testDF)*1.34)/(12*4*5*8)

## Output

In [None]:
# Persist the pipeline most recently bound to `pipeline` (the random-forest
# one when the notebook is run top to bottom). os.path.join keeps an empty
# basePath pointing at the working directory instead of producing
# "/Canada.joblib" at the filesystem root.
joblib.dump(pipeline, os.path.join(basePath, "Canada.joblib"))