In [43]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [44]:
# Read the HR employee attrition CSV from the working directory and show it.
raw_csv_path = "WA_Fn-UseC_-HR-Employee-Attrition.csv"
data = pd.read_csv(raw_csv_path)
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [45]:
# Map the original CamelCase headers to snake_case names.
# Keys that are absent from the frame are ignored by DataFrame.rename.
column_renames = {
    'Age': 'age',
    'Attrition': 'attrition',
    'BusinessTravel': 'business_travel',
    'DailyRate': 'daily_rate',  # previously mislabelled as 'daily_travel'
    'Department': 'department',
    'DistanceFromHome': 'distance_from_home',
    'Education': 'education',
    'EducationField': 'education_field',
    'EmployeeCount': 'employee_count',
    'EmployeeNumber': 'employee_number',
    'EnvironmentSatisfaction': 'environment_satisfaction',
    'Gender': 'gender',
    'HourlyRate': 'hourly_rate',
    'JobInvolvement': 'job_involvement',
    'JobLevel': 'job_level',
    'JobRole': 'job_role',
    'JobSatisfaction': 'job_satisfaction',
    'MaritalStatus': 'marital_status',
    'MonthlyIncome': 'monthly_income',
    'MonthlyRate': 'monthly_rate',
    'NumCompaniesWorked': 'num_companies_worked',
    'Over18': 'over_18',
    'OverTime': 'over_time',
    'PercentSalaryHike': 'percent_salary_hike',
    'PerformanceRating': 'performance_rating',
    'RelationshipSatisfaction': 'relationship_satisfaction',
    'StandardHours': 'standard_hours',
    'StockOptionLevel': 'stock_option_level',
    'TotalWorkingYears': 'total_working_years',
    'TrainingTimesLastYear': 'training_times_last_year',
    'WorkLifeBalance': 'work_life_balance',
    'YearsAtCompany': 'years_at_company',
    'YearsInCurrentRole': 'years_in_current_role',
    'YearsSinceLastPromotion': 'years_since_last_promotion',
    'YearsWithCurrManager': 'years_with_curr_manager',
}
data = data.rename(columns=column_renames)

# we can drop employee_number as it has no use in the model
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a strict drop would raise KeyError.
data = data.drop(columns='employee_number', errors='ignore')

# select certain columns that are the most important for the streamlit prediction
selected_columns = [
    'attrition',
    'years_since_last_promotion',
    'years_at_company',
    'performance_rating',
    'monthly_income',
    'job_satisfaction',
    'job_level',
    'distance_from_home',
    'age',
]
# .copy() makes the subset an independent frame rather than a slice of `data`,
# so later edits to it cannot trigger SettingWithCopyWarning.
streamlit = data[selected_columns].copy()

In [46]:
# Display the column subset chosen for the streamlit prediction.
streamlit

Unnamed: 0,attrition,years_since_last_promotion,years_at_company,performance_rating,monthly_income,job_satisfaction,job_level,distance_from_home,age
0,Yes,0,6,3,5993,4,2,1,41
1,No,1,10,4,5130,2,2,8,49
2,Yes,0,0,3,2090,3,1,2,37
3,No,3,8,3,2909,3,1,3,33
4,No,2,2,3,3468,2,1,2,27
...,...,...,...,...,...,...,...,...,...
1465,No,0,5,3,2571,4,2,23,36
1466,No,1,7,3,9991,1,3,6,39
1467,No,0,6,4,6142,2,2,4,27
1468,No,0,9,3,5390,2,2,2,49


In [47]:
# Partition the selected columns by dtype: numeric features vs. object (text) columns

data_num = streamlit.select_dtypes(include=["number"])
data_cat = streamlit.select_dtypes(include=["object"])

In [48]:
# scale numerical features with a min-max scaler fitted on the same data

transformer = MinMaxScaler().fit(data_num)
data_num_minmax = transformer.transform(data_num)
# transform() returns a bare array, so the row labels are lost. Re-attach the
# source index as well as the column names; otherwise the later axis=1 concat
# with the encoded target would align on a fresh 0..n-1 index and could
# mismatch rows whenever the input index is not a default RangeIndex.
data_num_norm = pd.DataFrame(
    data_num_minmax,
    columns=data_num.columns,
    index=data_num.index,
)
data_num_norm.head()

Unnamed: 0,years_since_last_promotion,years_at_company,performance_rating,monthly_income,job_satisfaction,job_level,distance_from_home,age
0,0.0,0.15,0.0,0.262454,1.0,0.25,0.0,0.547619
1,0.066667,0.25,1.0,0.217009,0.333333,0.25,0.25,0.738095
2,0.0,0.0,0.0,0.056925,0.666667,0.0,0.035714,0.452381
3,0.2,0.2,0.0,0.100053,0.666667,0.0,0.071429,0.357143
4,0.133333,0.05,0.0,0.129489,0.333333,0.0,0.035714,0.214286


In [49]:
# encode the categorical feature

# drop_first=True drops the first level of each categorical column, so the
# two-level target yields the single 'attrition_Yes' indicator column.
# dtype is pinned because the default dummy dtype differs between pandas
# versions (uint8 before 2.0, bool from 2.0 on); uint8 reproduces the 0/1
# values shown in this notebook on any version.
data_cat_dumm = pd.get_dummies(data_cat, drop_first=True, dtype=np.uint8)
data_cat_dumm.head()

Unnamed: 0,attrition_Yes
0,1
1,0
2,1
3,0
4,0


In [50]:
# Give the encoded indicator column the plain target name again.
target_rename = {'attrition_Yes': 'attrition'}
data_cat_dumm = data_cat_dumm.rename(target_rename, axis="columns")
data_cat_dumm

Unnamed: 0,attrition
0,1
1,0
2,1
3,0
4,0
...,...
1465,0
1466,0
1467,0
1468,0


In [51]:
# Put the scaled numeric features and the encoded target side by side

frames_to_join = [data_num_norm, data_cat_dumm]
concatenated_data = pd.concat(frames_to_join, axis="columns")
concatenated_data.head()

Unnamed: 0,years_since_last_promotion,years_at_company,performance_rating,monthly_income,job_satisfaction,job_level,distance_from_home,age,attrition
0,0.0,0.15,0.0,0.262454,1.0,0.25,0.0,0.547619,1
1,0.066667,0.25,1.0,0.217009,0.333333,0.25,0.25,0.738095,0
2,0.0,0.0,0.0,0.056925,0.666667,0.0,0.035714,0.452381,1
3,0.2,0.2,0.0,0.100053,0.666667,0.0,0.071429,0.357143,0
4,0.133333,0.05,0.0,0.129489,0.333333,0.0,0.035714,0.214286,0


In [52]:
# Split into model inputs (X) and the prediction target (y = attrition)

y = concatenated_data.loc[:, 'attrition']
X = concatenated_data.drop(columns='attrition')

In [53]:
# Write the selected-columns frame to disk as CSV, leaving out the row index
output_file = 'output_dataframe.csv'
streamlit.to_csv(path_or_buf=output_file, index=False)