In [5]:
import numpy as np
import pandas as pd
from pprint import pprint
import plotly.express as px
import dash_bootstrap_components as dbc
from dash import html,dcc,Input,Output,Dash,dash_table
from jupyter_dash import JupyterDash
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,multilabel_confusion_matrix,mean_squared_error
from xgboost import XGBRegressor,plot_importance,XGBClassifier
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,GradientBoostingClassifier,RandomForestClassifier

# 讀取資料集

In [6]:
# Load the HR training data and discard every row that has a missing value.
df = pd.read_csv('./dataset/hr_train.csv').dropna()
df.info()

# Independent copy of the leading rows, used by the dashboard preview table.
demo_df = df.head().copy()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48660 entries, 0 to 54807
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           48660 non-null  int64  
 1   department            48660 non-null  object 
 2   region                48660 non-null  object 
 3   education             48660 non-null  object 
 4   gender                48660 non-null  object 
 5   recruitment_channel   48660 non-null  object 
 6   no_of_trainings       48660 non-null  int64  
 7   age                   48660 non-null  int64  
 8   previous_year_rating  48660 non-null  float64
 9   length_of_service     48660 non-null  int64  
 10  awards_won?           48660 non-null  int64  
 11  avg_training_score    48660 non-null  int64  
 12  is_promoted           48660 non-null  int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 5.2+ MB


# Dash配置

In [10]:
# Dash application object, loaded with the Bootstrap theme stylesheet that the
# dbc.* layout components are built for.
# NOTE(review): the jupyter_dash package is deprecated upstream in favour of the
# notebook support built into recent Dash releases -- confirm the installed
# version before migrating.
app = JupyterDash(
    __name__,
    external_stylesheets=[dbc.themes.BOOTSTRAP],
)

# Inline CSS for the chart-type tabs.
# Applied to the dcc.Tabs container.
tabs_styles = dict(height='44px')

# Applied to each tab while it is not selected.
tab_style = dict(
    borderBottom='1px solid #d6d6d6',
    padding='6px',
    fontWeight='bold',
)

# Applied to the tab that is currently selected.
tab_selected_style = dict(
    borderTop='1px solid #d6d6d6',
    borderBottom='1px solid #d6d6d6',
    backgroundColor='#119DFF',
    color='white',
    padding='6px',
)

# Page layout, top to bottom: title, preview table of the sample rows, the
# category / numeric column selectors, the chart-type tabs, and the single
# graph ('tabs_output') that the callback fills in.
app.layout = dbc.Container(
    [
        dbc.Row([dbc.Col(html.H2('HR Analytics Employee Promotion Data'))]),
        html.Br(),
        dbc.Row(
            [
                html.H5('Dataset Demo:'),
                dbc.Col(
                    dash_table.DataTable(
                        data=demo_df.to_dict('records'),
                        columns=[{"name": col, "id": col} for col in demo_df.columns],
                        fixed_rows={'headers': True},
                        style_header={
                            'backgroundColor': '#305D91',
                            'padding': '10px',
                            'color': '#FFFFFF',
                        },
                        style_table={'overflowX': 'auto'},
                        # Fixed-width cells; longer text is clipped with an ellipsis.
                        style_cell={
                            'overflow': 'hidden',
                            'textOverflow': 'ellipsis',
                            'minWidth': '250px',
                            'width': '250px',
                            'maxWidth': '250px',
                        },
                        filter_action="native",
                        sort_action="native",
                        fill_width=False,
                    )
                ),
            ]
        ),
        dbc.Row(
            [
                dbc.Col(
                    [
                        html.H5('Category Col:'),
                        # Columns offered as the grouping variable.
                        dcc.Dropdown(
                            id='category_dropdown_input',
                            options=[
                                {'label': col, 'value': col}
                                for col in (
                                    'department',
                                    'region',
                                    'education',
                                    'gender',
                                    'recruitment_channel',
                                    'awards_won?',
                                    'is_promoted',
                                )
                            ],
                        ),
                    ],
                    md=3,
                ),
                dbc.Col(
                    [
                        html.H5('Numeric Col:'),
                        # Columns offered as the measured variable
                        # ('no_of_trainings' is currently disabled).
                        dcc.RadioItems(
                            id='numeric_radio_input',
                            options=[
                                {'label': col, 'value': col}
                                for col in (
                                    'age',
                                    'previous_year_rating',
                                    'length_of_service',
                                    'avg_training_score',
                                )
                            ],
                        ),
                    ],
                    md=9,
                ),
            ]
        ),
        html.Br(),
        dcc.Tabs(
            id="tabs_input",
            value='tab-1',
            children=[
                dcc.Tab(label='Box Plot', value='tab-1',
                        style=tab_style, selected_style=tab_selected_style),
                dcc.Tab(label='Histogram', value='tab-2',
                        style=tab_style, selected_style=tab_selected_style),
                dcc.Tab(label='Parallel Categories Diagram', value='tab-3',
                        style=tab_style, selected_style=tab_selected_style),
            ],
            style=tabs_styles,
        ),
        dbc.Row([dbc.Col(dcc.Graph(id='tabs_output'))]),
        # Empty trailing row kept from the original layout.
        dbc.Row(),
    ],
    # Dash style dictionaries use camelCase keys (backgroundColor, not
    # background-color).
    style={'backgroundColor': '#F5F5F5'},
)


@app.callback(Output('tabs_output', 'figure'),
              Input('tabs_input', 'value'),
              Input('category_dropdown_input', 'value'),
              Input('numeric_radio_input', 'value'))
def render_tabs_content(tab, category, numeric):
    """Build the figure for the selected tab from the selected columns.

    Args:
        tab: Value of the active tab ('tab-1' box plot, 'tab-2' histogram,
            'tab-3' parallel categories diagram).
        category: Column chosen in the category dropdown, or None when the
            dropdown is empty. Only used by the box plot, where it is optional.
        numeric: Column chosen in the numeric radio group, or None when nothing
            is selected. Required by the box plot and the histogram.

    Returns:
        A Plotly figure for the requested chart, or a plain figure dictionary
        carrying a short message when the chart cannot be drawn yet.
    """
    background = 'rgb(243, 243, 243)'

    def _placeholder(message):
        # Empty figure with a centred note, styled like the real charts.
        return {
            'data': [],
            'layout': {
                'paper_bgcolor': background,
                'plot_bgcolor': background,
                'xaxis': {'visible': False},
                'yaxis': {'visible': False},
                'annotations': [{
                    'text': message,
                    'xref': 'paper',
                    'yref': 'paper',
                    'x': 0.5,
                    'y': 0.5,
                    'showarrow': False,
                }],
            },
        }

    if tab == 'tab-3':
        # This chart uses fixed columns, so it does not depend on the selectors.
        fig = px.parallel_categories(
            df,
            dimensions=['education', 'gender', 'department'],
            color='previous_year_rating',
        )
    elif tab in ('tab-1', 'tab-2'):
        # Neither selector has a default, so the first invocation arrives with
        # None. Without a column Plotly Express switches to wide-form mode over
        # the whole frame, which fails on its mixed column types.
        if numeric is None:
            return _placeholder('Select a numeric column to draw this chart.')
        if tab == 'tab-1':
            fig = px.box(df, x=category, y=numeric, color=category)
        else:
            fig = px.histogram(df, x=numeric, histnorm='probability')
    else:
        return _placeholder('Unknown tab.')

    fig.update_layout(paper_bgcolor=background, plot_bgcolor=background)
    return fig
    


# Launch the dashboard only when this code runs as the entry point.
if __name__ == '__main__':
    # mode accepts 'external' or 'inline'
    app.run_server(
        mode="external",
        debug=True,
        port=8000,
    )

In [3]:
# df.groupby(by=["gender",'recruitment_channel']).size().reset_index(name="counts")

In [100]:
# px.bar(df.groupby(by=["gender",'education']).size().reset_index(name="counts"),x='gender',y='counts',color='education',barmode='group')

In [11]:
# df['awards_won?'].value_counts()

In [68]:
# px.parallel_categories(df,['education','gender','department'],color='previous_year_rating')

In [14]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73,0


In [79]:
# Preview the first five rows of the one-hot encoded modelling frame.
# NOTE(review): in the visible part of the notebook `ml_df` is only assigned in
# a later cell, so this cell would fail on Restart & Run All, and the captured
# output lists different columns than that cell builds -- confirm the intended
# cell order and re-run.
ml_df.head(n=5)

Unnamed: 0,avg_training_score,department_Analytics,department_Finance,department_HR,department_Legal,department_Operations,department_Procurement,department_R&D,department_Sales & Marketing,department_Technology,education_Bachelor's,education_Below Secondary,education_Master's & above,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
0,49,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1
1,60,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0
2,50,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1
3,50,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0
4,73,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0


In [3]:
from sklearn.model_selection import GridSearchCV

# Drop the employee_id column, then one-hot encode the remaining string columns.
ml_df = pd.get_dummies(df.drop(columns=['employee_id']))

# avg_training_score is the regression target; every other column is a feature.
y = ml_df["avg_training_score"]
X = ml_df.drop(columns=["avg_training_score"])

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=2022,
)

def evaluate(model, test_features, test_labels):
    """Print and return the MAPE-based accuracy of a fitted regressor.

    Args:
        model: Fitted estimator exposing ``predict(features)``.
        test_features: Feature matrix passed unchanged to ``model.predict``.
        test_labels: True target values, one per row of ``test_features``.
            They must be non-zero because each error is divided by its label.

    Returns:
        ``100 - MAPE``, where MAPE is the mean absolute percentage error of the
        predictions expressed in percent.
    """
    # Plain arrays make the subtraction positional, whatever index the labels carry.
    predictions = np.asarray(model.predict(test_features), dtype=float)
    labels = np.asarray(test_labels, dtype=float)

    errors = np.abs(predictions - labels)
    mape = 100 * np.mean(errors / labels)
    accuracy = 100 - mape

    print('Model Performance')
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

# Baseline: a small random forest with a fixed seed, scored on the held-out split.
# NOTE(review): relies on an `evaluate(model, features, labels)` helper being
# defined before this cell runs.
base_model = RandomForestRegressor(n_estimators=10, random_state=2022)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

# feature_importances_ has one entry per column of the matrix the model was
# fitted on, so the sorted positions must be looked up in X_train.columns.
# ml_df.columns still contains the target column and would shift the names.
# argsort is ascending: the least important feature is listed first.
importance_index = base_model.feature_importances_.argsort()
pprint(X_train.columns[importance_index])


NameError: name 'df' is not defined

In [82]:
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }

# rf = RandomForestRegressor(random_state=2022)
# grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
#                           cv = 3, n_jobs = -1, verbose = 1)

# grid_search.fit(X_train, y_train)
# pprint(grid_search.best_params_)

# best_grid = grid_search.best_estimator_
# grid_accuracy = evaluate(best_grid, X_test, y_test)
# pprint('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))


In [39]:
# base_model=RandomForestRegressor(bootstrap=True,max_depth=80,
#                       min_samples_leaf=3,min_samples_split=8,n_estimators=200)

# base_model.fit(X_train, y_train)
# base_accuracy = evaluate(base_model, X_test, y_test)