In [1]:
pip install dash plotly

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
os.getcwd()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Load the dataset from the file named 'Alpha' (no extension), resolved
# relative to the current working directory.
df = pd.read_csv('Alpha')

In [6]:
# Removing outliers of salary

In [8]:
# Keep only the rows whose medium_salary is at or below the 83000 cut-off.
salary_within_cutoff = df['medium_salary'].le(83000)
df = df[salary_within_cutoff]

In [14]:
# Save the filtered dataframe.
# index=False keeps the row index out of the file; with the default
# (index=True) the index is written as an unnamed first column, which shows
# up as an extra "Unnamed: 0"-style column when the file is read back.
df.to_csv('my_data.csv', index=False)

In [16]:
# Drop the stray 'Unnamed: 0' column (presumably an index written by an
# earlier to_csv call - confirm against the source file).
# errors='ignore' plus plain reassignment makes this cell safe to re-run:
# the in-place version raised KeyError once the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [18]:
# Show each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22715 entries, 0 to 22722
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   job_id                      22715 non-null  int64  
 1   location                    22715 non-null  object 
 2   company_id                  22715 non-null  float64
 3   formatted_experience_level  22715 non-null  object 
 4   maximum_salary              19322 non-null  float64
 5   medium_salary               22715 non-null  int64  
 6   minimum_salary              19322 non-null  float64
 7   employee_count              22715 non-null  float64
 8   state                       22712 non-null  object 
 9   city                        22715 non-null  object 
 10  industry_id                 22715 non-null  float64
 11  group_industry              22715 non-null  object 
 12  job_title                   22715 non-null  object 
 13  category                    22715 no

In [20]:
# Count the distinct values per column to gauge the cardinality of each feature.
df.nunique()

job_id                        22715
location                       3409
company_id                     6358
formatted_experience_level        6
maximum_salary                 3685
medium_salary                  4356
minimum_salary                 3227
employee_count                 2861
state                           322
city                           1511
industry_id                     276
group_industry                   46
job_title                     15380
category                         39
description                   21457
dtype: int64

In [22]:
# creating dataframe for decision tree regression

In [24]:
# Columns left out of the modelling frame. Only formatted_experience_level,
# medium_salary, group_industry and category remain after dropping these.
cols_to_drop = [
    'job_id',          # distinct for every row
    'company_id',
    'maximum_salary',  # has missing values
    'minimum_salary',  # has missing values
    'state',
    'city',
    'industry_id',
    'job_title',       # free text, very high cardinality
    'description',     # free text, very high cardinality
    'location',
    'employee_count',
]

In [26]:
# Remove the columns listed in cols_to_drop, keeping only the model inputs
# and the salary target.
df = df.drop(labels=cols_to_drop, axis='columns')

In [28]:
# Confirm which columns are left after the drop, with their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22715 entries, 0 to 22722
Data columns (total 4 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   formatted_experience_level  22715 non-null  object
 1   medium_salary               22715 non-null  int64 
 2   group_industry              22715 non-null  object
 3   category                    22715 non-null  object
dtypes: int64(1), object(3)
memory usage: 887.3+ KB


In [30]:
# Rank the experience levels with an ordinal encoding so the regression sees a meaningful order instead of an arbitrary assignment

In [32]:
# Ordinal rank for each experience level label (1 = Internship ... 6 = Executive)
experience_level_mapping = {
    'Internship': 1,
    'Entry level': 2,
    'Associate': 3,
    'Mid-Senior level': 4,
    'Director': 5,
    'Executive': 6
}

# Apply the mapping only while the column still holds the text labels.
# Series.map turns every value that is not a key of the dict into NaN, so
# re-running this cell on the already-encoded integers would wipe the column.
if not pd.api.types.is_numeric_dtype(df['formatted_experience_level']):
    df['formatted_experience_level'] = df['formatted_experience_level'].map(experience_level_mapping)

# Verify the changes
print(df['formatted_experience_level'].head())

0    4
1    3
2    2
3    3
4    4
Name: formatted_experience_level, dtype: int64


In [34]:
# One-hot encode the remaining text columns; numeric columns pass through unchanged.
# drop_first=True omits the first level of every encoded column, so that
# baseline level is represented by all of its sibling dummy columns being False.
df_dummies = pd.get_dummies(df, drop_first=True)

In [36]:
# List the encoded feature columns and their dtypes.
df_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22715 entries, 0 to 22722
Data columns (total 85 columns):
 #   Column                                                              Non-Null Count  Dtype
---  ------                                                              --------------  -----
 0   formatted_experience_level                                          22715 non-null  int64
 1   medium_salary                                                       22715 non-null  int64
 2   group_industry_Administrative and Support Services                  22715 non-null  bool 
 3   group_industry_Apparel & Fashion                                    22715 non-null  bool 
 4   group_industry_Automotive                                           22715 non-null  bool 
 5   group_industry_Aviation & Aerospace                                 22715 non-null  bool 
 6   group_industry_Biotechnology                                        22715 non-null  bool 
 7   group_industry_Building Materials   

In [38]:
# Separate the salary target from the encoded feature matrix.
target_column = 'medium_salary'
y = df_dummies[target_column]
X = df_dummies.drop(columns=[target_column])

In [40]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1000,
)

# Build and fit the decision tree in one step (fit returns the estimator itself).
regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Persist the fitted model so it can be loaded again later in the notebook.
joblib.dump(regressor, 'decision_tree_regressor.pkl')

# Score the held-out rows.
y_pred = regressor.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 13760992.476827439
R^2 Score: 0.3916856962902746


In [42]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import pandas as pd
import joblib

# Load the trained model
# NOTE: joblib.load unpickles the file - only load model files created by this notebook.
model = joblib.load('decision_tree_regressor.pkl')

# Initialize the Dash app
app = dash.Dash(__name__)

# df['formatted_experience_level'] already holds the ordinal codes (1-6) at
# this point, so building the options straight from the column would show
# bare numbers to the user. Use the readable level names as labels, keep the
# codes as values (the callbacks compare against the encoded column), and
# offer only the levels that actually occur in the data.
_levels_in_data = set(df['formatted_experience_level'].unique())
experience_level_options = [
    {'label': level_name, 'value': level_code}
    for level_name, level_code in experience_level_mapping.items()
    if level_code in _levels_in_data
]

# Define the layout
app.layout = html.Div([
    html.H1("Salary Prediction Dashboard"),
    html.Div([
        html.Label("Experience Level"),
        dcc.Dropdown(
            id='formatted_experience_level_dropdown',
            options=experience_level_options,
            placeholder='Select Experience Level'
        ),
        html.Label("Industry Group"),
        # Options are filled in by a callback once an experience level is chosen
        dcc.Dropdown(
            id='group_industry_dropdown',
            placeholder='Select Industry',
        ),
        html.Label("Category"),
        # Options are filled in by a callback once level and industry are chosen
        dcc.Dropdown(
            id='category_dropdown',
            placeholder='Select Category',
        ),
        html.Button('Predict Salary', id='predict_button', n_clicks=0),
        html.Div(id='output_salary')
    ], style={'width': '50%', 'display': 'inline-block'}),
    # NOTE(review): no callback visible in this notebook writes to this graph - confirm it is intended
    dcc.Graph(id='salary_graph')
])


In [44]:
@app.callback(
    Output('group_industry_dropdown', 'options'),
    [Input('formatted_experience_level_dropdown', 'value')]
)
def set_industry_options(selected_experience_level):
    """Build the industry dropdown options for the chosen experience level.

    Returns one ``{'label', 'value'}`` dict per distinct ``group_industry``
    found among the rows of ``df`` with that experience level, or an empty
    list while no level is selected.
    """
    if selected_experience_level:
        level_mask = df['formatted_experience_level'] == selected_experience_level
        return [
            {'label': name, 'value': name}
            for name in df.loc[level_mask, 'group_industry'].unique()
        ]
    return []

@app.callback(
    Output('category_dropdown', 'options'),
    [Input('group_industry_dropdown', 'value'),
     Input('formatted_experience_level_dropdown', 'value')]
)
def set_category_options(selected_industry, selected_experience_level):
    """Build the category dropdown options for the chosen level and industry.

    Returns one ``{'label', 'value'}`` dict per distinct ``category`` among
    the rows of ``df`` matching both selections, or an empty list while
    either selection is missing.
    """
    if not (selected_experience_level and selected_industry):
        return []

    level_matches = df['formatted_experience_level'].eq(selected_experience_level)
    industry_matches = df['group_industry'].eq(selected_industry)
    matching_rows = df[level_matches & industry_matches]

    options = []
    for name in matching_rows['category'].unique():
        options.append({'label': name, 'value': name})
    return options

@app.callback(
    Output('output_salary', 'children'),
    [Input('predict_button', 'n_clicks')],
    [dash.dependencies.State('formatted_experience_level_dropdown', 'value'),
     dash.dependencies.State('group_industry_dropdown', 'value'),
     dash.dependencies.State('category_dropdown', 'value')]
)
def update_output(n_clicks, experience_level, industry, category):
    """Predict the salary for the current dropdown selections.

    Args:
        n_clicks: Click count of the predict button (0 or None before the first click).
        experience_level: Encoded experience level chosen in the dropdown.
        industry: Selected ``group_industry`` value.
        category: Selected ``category`` value.

    Returns:
        The formatted prediction text, or an empty string until the button
        has been clicked and all three selections are made.
    """
    if not n_clicks or not (experience_level and industry and category):
        return ""

    # Switch on the dummy columns for the chosen industry and category by name.
    # Calling pd.get_dummies(..., drop_first=True) on a one-row frame drops the
    # only level it sees and yields no columns at all, so the selections would
    # never reach the model.
    selected_features = {
        'formatted_experience_level': experience_level,
        f'group_industry_{industry}': 1,
        f'category_{category}': 1,
    }

    # Align with the training columns: every other dummy becomes 0, and a
    # selection that is the dropped baseline level (it has no column in X)
    # is discarded, which leaves all of its sibling dummies at 0 as in training.
    input_data = pd.DataFrame([selected_features]).reindex(columns=X.columns, fill_value=0)

    # Predict the salary using the loaded model
    predicted_salary = model.predict(input_data)[0]

    return f"Predicted Monthly Salary: ${predicted_salary:.2f}"

# Run the app
# app.run replaces the deprecated app.run_server, which Dash 3 no longer
# accepts; the unpinned install at the top of this notebook can pull that in.
if __name__ == '__main__':
    app.run(debug=True, port=8060)
