In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('data_cleaned.csv')

In [3]:
# Take a look of data
df.head()

Unnamed: 0,Job Title,Location,Job Description,Rating,Size,Founded,Type of ownership,Industry,Sector,Revenue,...,job_simp,employee_expriece,desc_len,python,excel,sql,tableau,spark,machine learning,aws
0,Business Analyst,"Plano, TX","Hello,\r\nWe are hiring for Business System An...",5.0,Unknown,-1,Company - Public,Unknown,Unknown,Unknown / Non-Applicable,...,business analyst,na,1459,0,1,1,0,0,0,0
1,Business Systems Analyst,"Irvine, CA",-1,5.0,Unknown,-1,Unknown,Unknown,Unknown,Unknown / Non-Applicable,...,na,na,2,0,0,0,0,0,0,0
2,Business/Database Analyst,"Newark, NJ",JSR has an immediate opening for their direct ...,5.0,51 to 200 Employees,2015,Company - Private,Information Technology Support Services,Information Technology,$5 to $25 million (USD),...,na,na,4188,0,1,1,0,0,0,1
3,Data Analyst,"New York, NY",Are you someone who loves crunching numbers an...,3.5,201 to 500 Employees,1972,Nonprofit Organization,Education & Training Services,Education,$5 to $25 million (USD),...,data analyst,na,3518,0,1,0,0,0,0,0
4,Junior Business/Data Analyst,"Washington, DC","Company Overview:\r\nThe Kenific Group, Inc. (...",2.6,51 to 200 Employees,-1,Company - Private,Business Consulting,Management & Consulting,Less than $1 million (USD),...,data analyst,junior_emp,3353,0,1,1,0,0,0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 563 entries, 0 to 562
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          563 non-null    object 
 1   Location           563 non-null    object 
 2   Job Description    563 non-null    object 
 3   Rating             563 non-null    float64
 4   Size               563 non-null    object 
 5   Founded            563 non-null    int64  
 6   Type of ownership  563 non-null    object 
 7   Industry           563 non-null    object 
 8   Sector             563 non-null    object 
 9   Revenue            563 non-null    object 
 10  hourly             563 non-null    int64  
 11  employer_est       563 non-null    int64  
 12  glassdoor_est      563 non-null    int64  
 13  min_salary         563 non-null    float64
 14  max_salary         563 non-null    float64
 15  avg_salary         563 non-null    float64
 16  age                563 non

In [5]:
df.columns

Index(['Job Title', 'Location', 'Job Description', 'Rating', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue', 'hourly',
       'employer_est', 'glassdoor_est', 'min_salary', 'max_salary',
       'avg_salary', 'age', 'company_txt', 'State', 'job_simp',
       'employee_expriece', 'desc_len', 'python', 'excel', 'sql', 'tableau',
       'spark', 'machine learning', 'aws'],
      dtype='object')

In [6]:
df.describe()

Unnamed: 0,Rating,Founded,hourly,employer_est,glassdoor_est,min_salary,max_salary,avg_salary,age,desc_len,python,excel,sql,tableau,spark,machine learning,aws
count,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0,563.0
mean,3.670515,1457.793961,0.143872,0.577265,0.422735,98.753428,138.145471,118.449449,36.476021,3984.326821,0.461812,0.490231,0.523979,0.149201,0.17762,0.296625,0.273535
std,1.275076,869.017208,0.351272,0.494433,0.494433,39.176321,58.739585,47.370053,48.582623,2167.914802,0.498983,0.500349,0.499869,0.356603,0.382532,0.457176,0.446169
min,-1.0,-1.0,0.0,0.0,0.0,4.0,6.0,5.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.6,-1.0,0.0,0.0,0.0,70.0,98.5,83.5,-1.0,2410.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.9,1971.0,0.0,1.0,0.0,90.0,127.0,109.0,18.0,4092.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,4.3,2004.0,0.0,1.0,1.0,120.0,165.5,141.25,50.0,5391.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
max,5.0,2022.0,1.0,1.0,1.0,300.0,520.0,400.0,277.0,13226.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

# Data Preparation
# Selecting relevant features
features = ['Rating','Size','Type of ownership','Industry','Sector','Revenue','State','age','python','excel','aws','sql','tableau','spark','machine learning',
            'job_simp','employee_expriece','desc_len']
target = 'avg_salary'

In [8]:
# Handle missing values by filling with the mode for categorical columns and mean for numeric columns
for feature in features:
    if df[feature].dtype == 'object':
        df[feature].fillna(df[feature].mode()[0], inplace=True)
    else:
        df[feature].fillna(df[feature].mean(), inplace=True)

# Convert categorical variables using one-hot encoding
data_encoded = pd.get_dummies(df[features], drop_first=True)

# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data_encoded, df[target], test_size=0.2, random_state=42)

In [9]:
# Model Selection and Training - Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluation
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

mae, rmse

(34.65127282158755, 57.730682065496545)

In [10]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# compare different models
models = {
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# Train and evaluate each model
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    
    results.append({
        'Model': name,
        'MAE': mae,
        'RMSE': rmse
    })

results_df = pd.DataFrame(results).sort_values(by='MAE')
results_df


Unnamed: 0,Model,MAE,RMSE
3,Random Forest,27.817089,54.676578
4,Gradient Boosting,28.540923,53.294219
1,Lasso Regression,30.365199,54.818178
0,Ridge Regression,31.292065,54.803365
2,Decision Tree,36.563097,68.209328


In [11]:
from sklearn.model_selection import GridSearchCV

# Hyperparameters grid for Random Forest
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search with cross-validation for Random Forest
rf_grid = GridSearchCV(RandomForestRegressor(), rf_params, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
rf_grid.fit(X_train, y_train)

# Best parameters and score for Random Forest
rf_best_params = rf_grid.best_params_
rf_best_score = -rf_grid.best_score_  # Convert negative MAE to positive

rf_best_params, rf_best_score


({'max_depth': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 200},
 19.465840333333333)

In [12]:
# Hyperparameters grid for Gradient Boosting
gb_params = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7]
}

# Grid search with cross-validation for Gradient Boosting
gb_grid = GridSearchCV(GradientBoostingRegressor(), gb_params, 
                       cv=3, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)
gb_grid.fit(X_train, y_train)

# Best parameters and score for Gradient Boosting
gb_best_params = gb_grid.best_params_
gb_best_score = -gb_grid.best_score_  # Convert negative MAE to positive

gb_best_params, gb_best_score

Fitting 3 folds for each of 18 candidates, totalling 54 fits


({'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 100},
 19.101420865505876)

In [13]:
# Predict using the tuned Random Forest model
rf_best_model = rf_grid.best_estimator_
rf_test_predictions = rf_best_model.predict(X_test)
rf_mae_test = mean_absolute_error(y_test, rf_test_predictions)
rf_rmse_test = mean_squared_error(y_test, rf_test_predictions, squared=False)

# Predict using the tuned Gradient Boosting model
gb_best_model = gb_grid.best_estimator_
gb_test_predictions = gb_best_model.predict(X_test)
gb_mae_test = mean_absolute_error(y_test, gb_test_predictions)
gb_rmse_test = mean_squared_error(y_test, gb_test_predictions, squared=False)

results_test = pd.DataFrame({
    'Model': ['Random Forest', 'Gradient Boosting'],
    'Test MAE': [rf_mae_test, gb_mae_test],
    'Test RMSE': [rf_rmse_test, gb_rmse_test]
})

results_test

Unnamed: 0,Model,Test MAE,Test RMSE
0,Random Forest,27.840767,53.933533
1,Gradient Boosting,29.769306,56.912239


In [14]:
# pickle model for prediction
import pickle
pickl = {'model': rf_best_model}
pickle.dump( pickl, open( 'model_file' + ".p", "wb" ) )

file_name = "model_file.p"
with open(file_name, 'rb') as pickled:
    data = pickle.load(pickled)
    model = data['model']

In [15]:
model.predict(np.array(list(X_test.iloc[1,:])).reshape(1,-1))[0]



127.2525

In [16]:
list(X_test.iloc[1,:])

[4.2,
 47.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 4392.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0]