In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests
import json
import time
import joblib

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV
from sklearn import tree

#from forex_python.converter import CurrencyRates

###RandomizedSearchCV

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [16]:
def evaluate_regression_model(model, X_test, y_test):
    """
    Evaluates the performance of a regression model and provides a summary.

    Parameters:
        model: A trained regression model from scikit-learn.
        X_test: Test features.
        y_test: Test labels.
    """
    # Predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)  # RMSE
    r2 = r2_score(y_test, y_pred)
    
    # Display results
    print("Regression Model Evaluation:")
    print("===========================")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"R-squared (R2): {r2:.2f}")

In [20]:
df = pd.read_csv('https://raw.githubusercontent.com/KKozakiewicz/predict_salaries/main/df_pln.csv')

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1274 entries, 0 to 1273
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   title                         1274 non-null   object 
 1   street                        1274 non-null   object 
 2   city                          1274 non-null   object 
 3   address_text                  1274 non-null   object 
 4   marker_icon                   1274 non-null   object 
 5   company_name                  1274 non-null   object 
 6   company_url                   1274 non-null   object 
 7   company_size                  1274 non-null   int64  
 8   experience_level              1274 non-null   int64  
 9   latitude                      1274 non-null   float64
 10  longitude                     1274 non-null   float64
 11  published_at                  1274 non-null   object 
 12  remote_interview              1274 non-null   bool   
 13  ope

In [22]:
#remove columns that won't be taken into consideration
df=df.drop(columns = ['marker_icon','title','company_size', 'city','street','address_text','company_name','company_url', 'latitude','longitude', 'published_at','remote_interview', 'open_to_hire_ukrainians', 'id','display_offer', 'company_logo_url', 'skills','multilocation','way_of_apply', 'currency'  ])

In [23]:
#extract rows with no salary information
df_no_salaries = df[df['to'].isna()].reset_index()
df_no_salaries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   index                         499 non-null    int64  
 1   experience_level              499 non-null    int64  
 2   remote                        499 non-null    bool   
 3   from                          0 non-null      float64
 4   to                            0 non-null      float64
 5   AI                            499 non-null    float64
 6   AWS                           499 non-null    float64
 7   Analytical Thinking           499 non-null    float64
 8   BI                            499 non-null    float64
 9   Big Data                      499 non-null    float64
 10  DWH                           499 non-null    float64
 11  Data                          499 non-null    float64
 12  Databases                     499 non-null    float64
 13  ETL  

In [24]:
#extract rows with salary information
df = df[df['to'].notna()].reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 775 entries, 0 to 774
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   index                         775 non-null    int64  
 1   experience_level              775 non-null    int64  
 2   remote                        775 non-null    bool   
 3   from                          775 non-null    float64
 4   to                            775 non-null    float64
 5   AI                            775 non-null    float64
 6   AWS                           775 non-null    float64
 7   Analytical Thinking           775 non-null    float64
 8   BI                            775 non-null    float64
 9   Big Data                      775 non-null    float64
 10  DWH                           775 non-null    float64
 11  Data                          775 non-null    float64
 12  Databases                     775 non-null    float64
 13  ETL  

In [25]:
X = df.drop(columns=['from','to'])
y = df['from']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## K-Nearest Neighbour(KNN)

In [30]:
from sklearn.neighbors import KNeighborsRegressor
# Initialize the KNeighborsRegressor model
model = KNeighborsRegressor()

# Define the parameter grid for the grid search
param_grid = {
    'n_neighbors': range(1, 7, 2),
    'weights': ['uniform', 'distance']
}

# Create a GridSearchCV object
gs = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_absolute_error')

# Fit the GridSearchCV object to the training data
gs.fit(X_train, y_train)

# Get the best model with tuned hyperparameters
best_model = gs.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_train)

# Calculate mean absolute error (MAE)
mae = mean_absolute_error(y_train, y_pred)
print("Best Model:", best_model)
print("Best Parameters:", gs.best_params_)
print("Mean Absolute Error:", mae)

Best Model: KNeighborsRegressor(n_neighbors=3, weights='distance')
Best Parameters: {'n_neighbors': 3, 'weights': 'distance'}
Mean Absolute Error: 0.0


In [32]:
evaluate_regression_model(best_model, X_test, y_test)

Regression Model Evaluation:
Mean Absolute Error (MAE): 1034.21
Mean Squared Error (MSE): 5576553.31
Root Mean Squared Error (RMSE): 2361.47
R-squared (R2): 0.88


## Random Forest

In [36]:
# Initialize the RandomForestRegressor model
model = RandomForestRegressor(random_state=42)

# Define the parameter grid for the grid search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Create a GridSearchCV object
gs = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_absolute_error')

# Fit the GridSearchCV object to the training data
gs.fit(X_train, y_train)

# Get the best model with tuned hyperparameters
best_model = gs.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate mean absolute error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print("Best Model:", best_model)
print("Best Parameters:", gs.best_params_)
print("Mean Absolute Error:", mae)

Best Model: RandomForestRegressor(max_depth=20, random_state=42)
Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Mean Absolute Error: 359.70374193548383


In [37]:
evaluate_regression_model(best_model, X_test, y_test)

Regression Model Evaluation:
Mean Absolute Error (MAE): 359.70
Mean Squared Error (MSE): 1387206.29
Root Mean Squared Error (RMSE): 1177.80
R-squared (R2): 0.97


In [39]:
joblib.dump(model, 'trained_random_forest_model.joblib')

['trained_random_forest_model.joblib']

In [None]:
#loaded_model = joblib.load('trained_model.joblib')