# Data Salary Prediction Model

## Package Imports

In [1]:
import pandas as pd
import tensorflow_decision_forests as tfdf

## Prepare Dataset

In [2]:
dataset = pd.read_csv('jobs_in_data.csv')

### Filter Fields

In [3]:
df = dataset[dataset['employee_residence'] == 'United States'].loc[:, ~dataset.columns.isin(['salary_currency', 'salary','job_category'])]
print(df)

      work_year                 job_title  salary_in_usd employee_residence  \
1          2023            Data Architect         186000      United States   
2          2023            Data Architect          81800      United States   
3          2023            Data Scientist         212000      United States   
4          2023            Data Scientist          93300      United States   
5          2023            Data Scientist         130000      United States   
...         ...                       ...            ...                ...   
9350       2021           Data Specialist         165000      United States   
9351       2020            Data Scientist         412000      United States   
9352       2021  Principal Data Scientist         151000      United States   
9353       2020            Data Scientist         105000      United States   
9354       2020     Business Data Analyst         100000      United States   

     experience_level employment_type work_setting 

### Test the Dataset

In [4]:
jobTypeMean = df.groupby("experience_level")["salary_in_usd"].mean().round(2)
print(jobTypeMean)

experience_level
Entry-level    104637.16
Executive      195731.13
Mid-level      130431.59
Senior         166284.88
Name: salary_in_usd, dtype: float64


### Split Dataset into Train and Test Dataframes

In [5]:
size = round(df.shape[0]/2)
train_df = df.iloc[:size,:]
test_df = df.iloc[size:,:]

### Covert to TensorFlow Datasets

In [6]:
label = 'salary_in_usd'
# Will assign each label to an integer value and convert the dataframes to TensorFlow datasets
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label, task = tfdf.keras.Task.REGRESSION)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label=label, task = tfdf.keras.Task.REGRESSION)

## Train and Fit the Model

### Using Random Forest Regression Model

In [7]:
model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)
model.fit(train_ds)

Use /var/folders/3z/2gcw_n2549qdw21w2lvvh6hh0000gn/T/tmpz7usxbk7 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:01.394581. Found 4043 examples.
Training model...
Model trained in 0:00:00.213397
Compiling model...


[INFO 24-06-15 21:57:54.6956 EDT kernel.cc:1233] Loading model from path /var/folders/3z/2gcw_n2549qdw21w2lvvh6hh0000gn/T/tmpz7usxbk7/model/ with prefix 87818e5e2ec64d3b
[INFO 24-06-15 21:57:54.7793 EDT decision_forest.cc:734] Model loaded with 300 root(s), 64490 node(s), and 6 input feature(s).
[INFO 24-06-15 21:57:54.7793 EDT abstract_model.cc:1344] Engine "RandomForestGeneric" built
[INFO 24-06-15 21:57:54.7793 EDT kernel.cc:1061] Use fast generic engine


Model compiled.


<tf_keras.src.callbacks.History at 0x1771aa2d0>

### Model Summary

In [8]:
model.summary()

Model: "random_forest_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "RANDOM_FOREST"
Task: REGRESSION
Label: "__LABEL"

Input Features (8):
	company_location
	company_size
	employee_residence
	employment_type
	experience_level
	job_title
	work_setting
	work_year

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.        "job_title"  0.978026 ################
    2. "experience_level"  0.318301 ###
    3.     "work_setting"  0.157241 
    4.     "company_size"  0.138268 
    5.  "employment_type"  0.115738 
    6.        "work_year"  0.115678 

Variable Importance: NUM_AS_ROOT:
    1.        "job_title" 294.000000 ################
    2. "experience_level"  6.000000 

Variable Importance: NUM_NODES:
    1.        "job_titl

## Compute Model Accuracy

In [9]:
model.compile(metrics=['MAPE'])
model.evaluate(train_ds, return_dict=True)
model.evaluate(test_ds, return_dict=True)



{'loss': 0.0, 'MAPE': 28.81723403930664}

## Save the Model

In [10]:
model.save('models/')

INFO:tensorflow:Assets written to: models/assets


INFO:tensorflow:Assets written to: models/assets
