In [13]:
import pandas as pd
import numpy as np

In [188]:
file_path = "gs://cloud-ai-platform-5734b953-00ea-4818-b5b7-3420cd51a628/salaries_new.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,data engineer,196000,USD,196000,US,0,US,M
1,2023,SE,FT,data engineer,121000,USD,121000,US,0,US,M
2,2023,MI,FT,data analyst,65000,USD,65000,US,100,US,M
3,2023,MI,FT,data analyst,55000,USD,55000,US,100,US,M
4,2023,SE,FT,machine learning engineer,220000,USD,220000,US,100,US,M


# Data Preprocessing

In [245]:

columns_to_drop = ['salary', 'salary_currency','employee_residence','company_location']
new_df=df.drop(columns=columns_to_drop)
new_df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,remote_ratio,company_size
0,2023,SE,FT,data engineer,196000,0,M
1,2023,SE,FT,data engineer,121000,0,M
2,2023,MI,FT,data analyst,65000,100,M
3,2023,MI,FT,data analyst,55000,100,M
4,2023,SE,FT,machine learning engineer,220000,100,M


In [250]:
def remove_outliers(df, column_name, threshold=1.5):
    Q1 = new_df[column_name].quantile(0.25)
    Q3 = new_df[column_name].quantile(0.75)
    
    IQR = Q3 - Q1
    
    lower_bound = Q1 - threshold * IQR
    upper_bound = Q3 + threshold * IQR
    
    filtered_df = df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]
    
    return filtered_df
new_df = remove_outliers(new_df, 'salary_in_usd')
new_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3331 entries, 0 to 3377
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   work_year         3331 non-null   int64 
 1   experience_level  3331 non-null   object
 2   employment_type   3331 non-null   object
 3   job_title         3331 non-null   object
 4   salary_in_usd     3331 non-null   int64 
 5   remote_ratio      3331 non-null   int64 
 6   company_size      3331 non-null   object
dtypes: int64(3), object(4)
memory usage: 208.2+ KB


## Determining Salary Ranges


In [275]:
max_salary = new_df['salary_in_usd'].max()
min_salary = new_df['salary_in_usd'].min()

num_subranges = 8

subranges = np.linspace(min_salary, max_salary, num=num_subranges+1, endpoint=True)
range_labels = []
for i in range(len(subranges)-1):
     subrange_min = int(subranges[i])
     subrange_max = int(subranges[i+1])
     range_label = f"{subrange_min:,} - {subrange_max:,}"
     range_labels.append(range_label)

range_labels

['15,000 - 48,875',
 '48,875 - 82,750',
 '82,750 - 116,625',
 '116,625 - 150,500',
 '150,500 - 184,375',
 '184,375 - 218,250',
 '218,250 - 252,125',
 '252,125 - 286,000']

In [252]:

categorical_cols=['experience_level','employment_type','job_title','company_size']
encoded_df = pd.get_dummies(new_df[categorical_cols], prefix=categorical_cols, prefix_sep='_')
df_encoded = pd.concat([new_df.drop(categorical_cols, axis=1), encoded_df], axis=1)
df_encoded.head()

Unnamed: 0,work_year,salary_in_usd,remote_ratio,experience_level_EN,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_CT,employment_type_FL,employment_type_FT,employment_type_PT,job_title_data analyst,job_title_data engineer,job_title_data scientist,job_title_machine learning engineer,company_size_L,company_size_M,company_size_S
0,2023,196000,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0
1,2023,121000,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0
2,2023,65000,100,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0
3,2023,55000,100,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0
4,2023,220000,100,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0


In [253]:
df_encoded_ = pd.get_dummies(df_encoded['work_year'], prefix='year')
df_encoded_f = pd.concat([df_encoded, df_encoded_], axis=1)
df_encoded_f.head()

Unnamed: 0,work_year,salary_in_usd,remote_ratio,experience_level_EN,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_CT,employment_type_FL,employment_type_FT,...,job_title_data engineer,job_title_data scientist,job_title_machine learning engineer,company_size_L,company_size_M,company_size_S,year_2020,year_2021,year_2022,year_2023
0,2023,196000,0,0,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,0,1
1,2023,121000,0,0,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,0,1
2,2023,65000,100,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
3,2023,55000,100,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
4,2023,220000,100,0,0,0,1,0,0,1,...,0,0,1,0,1,0,0,0,0,1


In [254]:
df_encoded_f=df_encoded_f.drop('work_year',axis=1)

df_encoded_f

Unnamed: 0,salary_in_usd,remote_ratio,experience_level_EN,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_CT,employment_type_FL,employment_type_FT,employment_type_PT,...,job_title_data engineer,job_title_data scientist,job_title_machine learning engineer,company_size_L,company_size_M,company_size_S,year_2020,year_2021,year_2022,year_2023
0,196000,0,0,0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
1,121000,0,0,0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,0,0,1
2,65000,100,0,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
3,55000,100,0,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
4,220000,100,0,0,0,1,0,0,1,0,...,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3372,130800,100,0,0,1,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,0
3373,45618,100,0,0,0,1,0,0,1,0,...,0,0,1,0,0,1,1,0,0,0
3374,119059,100,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
3375,28369,50,0,0,1,0,0,0,1,0,...,1,0,0,1,0,0,0,1,0,0


# Model Fitting

In [255]:

from sklearn.model_selection import train_test_split

X = df_encoded_f.drop('salary_in_usd', axis=1)
y = df_encoded_f['salary_in_usd']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

## Linear Regression

In [256]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 1914131205.0099947
Mean Absolute Error (MAE): 35183.91979547365
R-squared (R2) Score: 0.34115863333603047


## Ridge Regression

In [257]:
rig = Ridge(alpha=0.0001)
rig.fit(X_train, y_train)
y_pred = rig.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)
     

Mean Squared Error (MSE): 1914130489.3609307
Mean Absolute Error (MAE): 35183.914789180526
R-squared (R2) Score: 0.3411588796614685


## Gradient Boosting

In [258]:
from sklearn.ensemble import GradientBoostingRegressor
GBreg = GradientBoostingRegressor(n_estimators = 100, random_state = 1, learning_rate=0.1, max_depth=3)
GBreg.fit(X_train, y_train)
y_pred = GBreg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 2050348482.798056
Mean Absolute Error (MAE): 35839.16544650461
R-squared (R2) Score: 0.2942728309280064


## Kernel Ridge 

In [259]:
from sklearn.kernel_ridge import KernelRidge
KernelRidge_model = KernelRidge(alpha=0.0001)
KernelRidge_model.fit(X_train, y_train)
y_pred = KernelRidge_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)


Mean Squared Error (MSE): 1924136545.6369514
Mean Absolute Error (MAE): 35270.83986286495
R-squared (R2) Score: 0.3377148086518347


## Random Forest

In [260]:
from sklearn.ensemble import RandomForestRegressor
rand_forest_model = RandomForestRegressor()
rand_forest_model.fit(X_train, y_train)
y_pred = rand_forest_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 2093708529.5242317
Mean Absolute Error (MAE): 36321.8840348104
R-squared (R2) Score: 0.2793483616079745


## XG Boost

In [261]:
import xgboost as xgb
xgb_model = xgb.XGBRegressor()
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001]
}
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

best_xgb_model.fit(X_train, y_train)
y_pred = best_xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 2233756204.27552
Mean Absolute Error (MAE): 37163.38132569457
R-squared (R2) Score: 0.23114414175630138


## LightGBM

In [262]:
import lightgbm as lgb
lgb_model = lgb.LGBMRegressor()
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001]
}
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_lgb_model = grid_search.best_estimator_
y_pred = best_lgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)

Mean Squared Error (MSE): 1959887392.5811155
Mean Absolute Error (MAE): 35175.853265416634
R-squared (R2) Score: 0.3254094155845061


In [265]:
model=best_lgb_model
model

In [266]:
feature_names = X_train.columns
feature_names

Index(['remote_ratio', 'experience_level_EN', 'experience_level_EX',
       'experience_level_MI', 'experience_level_SE', 'employment_type_CT',
       'employment_type_FL', 'employment_type_FT', 'employment_type_PT',
       'job_title_data analyst', 'job_title_data engineer',
       'job_title_data scientist', 'job_title_machine learning engineer',
       'company_size_L', 'company_size_M', 'company_size_S', 'year_2020',
       'year_2021', 'year_2022', 'year_2023'],
      dtype='object')

## Making Predictions

In [276]:
import joblib
import numpy as np

def make_prediction(feature_values):
    feature_names = ['remote_ratio','experience_level_EN', 'experience_level_EX', 'experience_level_MI',
                     'experience_level_SE','employment_type_CT', 'employment_type_FL', 'employment_type_FT',
                     'employment_type_PT', 'job_title_data engineer','job_title_data analyst',
                     'job_title_data scientist', 'job_title_machine learning engineer',
                     'company_size_M', 'company_size_S', 'company_size_L','year_2020', 'year_2021',
                     'year_2022', 'year_2023']

    # Create a numpy array for the input data
    input_data = np.array([[feature_values['remote_ratio'] ,
                            feature_values['experience_level'] == 'EN',
                            feature_values['experience_level'] == 'EX',
                            feature_values['experience_level'] == 'MI',
                            feature_values['experience_level'] == 'SE',
                            feature_values['employment_type'] == 'CT',
                            feature_values['employment_type'] == 'FL',
                            feature_values['employment_type'] == 'FT',
                            feature_values['employment_type'] == 'PT',
                            feature_values['job_title'] == 'data analyst',
                            feature_values['job_title'] == 'data engineer',
                            feature_values['job_title'] == 'data scientist',
                            feature_values['job_title'] == 'machine learning engineer',
                            feature_values['company_size'] == 'M',
                            feature_values['company_size'] =='S',
                            feature_values['company_size'] =='L',
                            feature_values['year'] == 2020,
                            feature_values['year'] == 2021,
                            feature_values['year'] == 2022,
                            feature_values['year'] == 2023]])
    prediction = model.predict(input_data)

    ranges = [(15000 , 48875),
              (48875 , 82750),
              (82750 , 116625),
              (116625 , 150500),
              (150500 , 184375),
              (184375 , 218250),
              (218250 , 252125),
              (252125 , 286000)]
    prediction_range = None
    for range_min, range_max in ranges:
        if range_min <= prediction < range_max:
            prediction_range = f"{range_min:,} - {range_max:,}"
            break

    return prediction_range


input_features = {
    'remote_ratio': 100,
    'experience_level': 'SE',
    'employment_type': 'FL',
    'job_title': 'data analyst',
    'year': 2023,
    'company_size': 'L'
}

prediction = make_prediction( input_features)

print("Prediction:", prediction)


Prediction: 82,750 - 116,625


In [273]:
from google.cloud import storage
import pickle

# Create a storage client
storage_client = storage.Client()

# Replace 'your-bucket-name' with the name of your Cloud Storage bucket
bucket = storage_client.bucket('cloud-ai-platform-5734b953-00ea-4818-b5b7-3420cd51a628')

# Replace 'model.pkl' with the desired name for the saved model file
blob = bucket.blob('sal_model_V2.pkl')

# Save the trained model to a local file in pickle format
with open('sal_model_V2.pkl', 'wb') as f:
    pickle.dump(model, f)

# Upload the saved model file to Cloud Storage
blob.upload_from_filename('sal_model_V2.pkl')


In [274]:
from google.cloud import storage

# Replace 'your-bucket-name' with the name of your Cloud Storage bucket
BUCKET_NAME = 'cloud-ai-platform-5734b953-00ea-4818-b5b7-3420cd51a628'

# Replace 'notebook.ipynb' with the name of your Jupyter notebook
NOTEBOOK_NAME = 'salary_pred_model.ipynb'

# Create a storage client
storage_client = storage.Client()

# Get a reference to your Cloud Storage bucket
bucket = storage_client.bucket(BUCKET_NAME)

# Get a reference to a blob in the bucket
blob = bucket.blob(NOTEBOOK_NAME)

# Upload the Jupyter notebook to Cloud Storage
blob.upload_from_filename(NOTEBOOK_NAME)
