In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<url-encoded download URL>" entries; the download loop
# splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240512T171902Z and
# X-Goog-Expires=259200, so it has presumably expired — regenerate it before re-running.
DATA_SOURCE_MAPPING = 'ibm-hr-analytics-attrition-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1067%2F1925%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240512%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240512T171902Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5653670513254e7cacec732608dd196410dceb7c4f80597ac934958e94ae0ef47c9bd838fa68024e306b5a2c41d14829b993ed30e696d81f1ee90e40612fdb28cd7f63b9e8ed0608a8168d2272c6b98ff90b9beb7ef955ae1da1f000764649769a30292bf6d79f35783e2226471db630b97f1125a5cecf63ae2493486f3162bbc7fcbc12b696d1f69966007bcf0bdde67a1f88c710086f7a7932ea6a32db9f19bbc8c0c8a616c006e604f5b23f1617118476a78a49c06dda47c34e83f931a8a425bfbd374c9319764b4084ebf11677f44ca1ab09a4dfdeb3e50d8b954e26e99b02375abc2c874704faf167f45c6cc4c4fe222b0ad701bb102989c9d6ade0ba1a'

# Directories that hold the downloaded dataset and the notebook's working files.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by any of the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining entries are still tried.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an unencoded ':' later in the entry cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header lookup yields None when the server sends no Content-Length;
            # in that case only the byte counter is shown and the bar stays empty.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            chunk = fileres.read(CHUNK_SIZE)
            while len(chunk) > 0:
                dl += len(chunk)
                tfile.write(chunk)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name and
            # would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # replace the imported module for the rest of the notebook.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede the OSError handler: HTTPError is an OSError subclass.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# Dataset Exploration

Kelompok 6 - LF01
* Abygael Adrianty Putri (2602242271)
* Axell Prita Aurelie Atmojo (2602143201)
* Madeline Andrea Sofian (2602169371)
* Ni Putu Ayu Sekar Pradnya Dewi (2602161160)

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# CSV location under /kaggle/input/<dataset directory>, where the download cell unpacks the archive.
DATA_PATH = "/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"

In [None]:
# Load the CSV into the DataFrame that every later cell reads and modifies in place.
data = pd.read_csv(DATA_PATH)

# Data Understanding

In [None]:
# Preview the first rows to check that the file loaded with the expected columns.
data.head()

In [None]:
# Column names, non-null counts and dtypes of the raw data.
data.info()

The dataset has 26 numerical columns and 9 categorical columns.

In [None]:
# Describing Numerical Values
# (summary statistics such as count, mean, std, min, quartiles and max per numeric column)
data.describe()

From the numerical description above, columns like Age, DailyRate, and Education have a balanced distribution of values. There is no significant skew (no extreme outliers or unusually high/low values).

In [None]:
# Finding Outliers with Interquartile Range (IQR)
# Rows whose YearsAtCompany lies more than 1.5 * IQR outside the quartiles are listed.
years_at_company = data['YearsAtCompany']
q1, q3 = years_at_company.quantile(0.25), years_at_company.quantile(0.75)
iqr = q3 - q1

# The fences stay at module level because a later cell reads them.
lower_bound, upper_bound = q1 - (1.5 * iqr), q3 + (1.5 * iqr)

is_outlier = (years_at_company < lower_bound) | (years_at_company > upper_bound)
data[is_outlier]

However, columns with large differences between mean and median, such as TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, and YearsWithCurrManager, have outliers; the code above demonstrates this for YearsAtCompany.

In [None]:
# Describing Categorical Values
# (count, number of unique values, most frequent value and its frequency per object column)
data.describe(include = 'object')

From the categorical description, we can conclude points such as:
* Most of the values in Attrition are No, meaning most employees did not leave the company
* The most common job role is Sales Executive
* There are more Male employees compared to Female employees
* The most common marital status is Married

In [None]:
# Check for Null Data
# (number of missing values in each column)
data.isnull().sum()

This means the dataset has no null values.

# Data Preparation & Pre-Processing

**Data Cleaning**

In [None]:
# Drop Outlier Rows
# Each column is judged against its OWN 1.5 * IQR fences. The previous version reused
# the lower_bound/upper_bound computed for YearsAtCompany for all five columns, which
# applied the wrong thresholds to the other four and made this cell depend on an
# earlier cell's leftover variables.
# NOTE(review): this changes which rows are removed, so correlation figures quoted in
# the narrative further down should be re-checked after re-running.
outlier_columns = [
    'TotalWorkingYears',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]


def _iqr_outlier_mask(frame, columns, k=1.5):
    """Return a boolean Series marking rows that are IQR outliers in any of `columns`.

    A value is an outlier when it lies below Q1 - k * IQR or above Q3 + k * IQR,
    with the quartiles computed separately for each column of `frame`.
    """
    values = frame[columns]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    below = values < (first_quartile - k * spread)
    above = values > (third_quartile + k * spread)
    return (below | above).any(axis=1)


# All fences are computed on the data as it is before any row is removed, and the
# flagged rows are dropped in a single step.
# Re-running this cell recomputes the fences on the already trimmed data and may
# remove further rows.
data.drop(data[_iqr_outlier_mask(data, outlier_columns)].index, inplace=True)

Additionally, the columns 'EmployeeCount', 'Over18', and 'StandardHours' contain only one unique value (all rows share the same value), so they will not be useful later on. These columns will also be dropped, together with 'EmployeeNumber', which the code below removes as well.

In [None]:
# Drop the columns described above as single-valued, plus EmployeeNumber.
# NOTE(review): EmployeeNumber is presumably a per-row identifier rather than a
# single-valued column — confirm.
columns_to_drop = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
data.drop(columns=columns_to_drop, inplace=True)

**Data Transformation**

Transforming categorical values into numerical values

In [None]:
# Encode the two Yes/No columns as 1/0 integers.
mapping = {"Yes": 1, "No": 0}
for yes_no_column in ("Attrition", "OverTime"):
    data[yes_no_column] = data[yes_no_column].replace(mapping).astype("int64")

In [None]:
# Encode BusinessTravel as integer codes 0-2 using the label order given below.
mapping2 = {
    "Non-Travel": 0,
    "Travel_Rarely": 1,
    "Travel_Frequently": 2,
}
data["BusinessTravel"] = data["BusinessTravel"].replace(mapping2).astype("int64")

In [None]:
# Encode Department as integer codes.
# NOTE(review): the numbering imposes an order on the labels that later correlation
# cells will treat as numeric — confirm this is intended.
mapping3 = {
    "Research & Development": 0,
    "Sales": 1,
    "Human Resources": 2,
}
data["Department"] = data["Department"].replace(mapping3).astype("int64")

In [None]:
# Encode EducationField as integer codes.
mapping4 = {
    "Life Sciences": 0,
    "Medical": 1,
    "Marketing": 2,
    "Technical Degree": 3,
    "Human Resources": 4,
    "Other": 5,
}
data["EducationField"] = data["EducationField"].replace(mapping4).astype("int64")

In [None]:
# Encode Gender as a 0/1 integer column (Female = 0, Male = 1).
mapping5 = {
    "Male": 1,
    "Female": 0,
}
data["Gender"] = data["Gender"].replace(mapping5).astype("int64")

In [None]:
# Encode JobRole as integer codes 0-8.
# NOTE(review): the numbering imposes an order on the roles that later correlation
# cells will treat as numeric — confirm this is intended.
mapping6 = {
    "Human Resources": 8,
    "Manager": 7,
    "Healthcare Representative": 6,
    "Manufacturing Director": 5,
    "Laboratory Technician": 4,
    "Sales Representative": 3,
    "Sales Executive": 2,
    "Research Director": 1,
    "Research Scientist": 0,
}
data["JobRole"] = data["JobRole"].replace(mapping6).astype("int64")

In [None]:
# Encode MaritalStatus as integer codes (Single = 0, Married = 1, Divorced = 2).
mapping7 = {
    "Divorced": 2,
    "Married": 1,
    "Single": 0,
}
data["MaritalStatus"] = data["MaritalStatus"].replace(mapping7).astype("int64")

In [None]:
# Re-inspect the dtypes to confirm the encoded columns are now int64.
data.info()

# Data Visualization

1. Which columns have the top 5 highest correlations with each other?
2. Which factor influences the employees' attrition rate the most?

In [None]:
# Pairwise correlation between all columns of the encoded data.
data.corr()

In [None]:
# Correlation Matrix (all columns)
# Drawn on an explicitly created 30x30 figure with every cell annotated to two decimals.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    annot_kws={"size": 15},
    ax=ax,
)

**Top 5 Highest Correlations**
* JobLevel is correlated with MonthlyIncome (0.88)
* YearsInCurrentRole is correlated with YearsAtCompany (0.86)
* YearsAtCompany is correlated with YearsWithCurrManager (0.85)
* PerformanceRating is correlated with PercentSalaryHike (0.77)
* YearsInCurrentRole is correlated with YearsWithCurrManager (0.74)

In [None]:
# Correlation Matrix in Regards to Attrition

# Pick the 10 columns with the largest correlation to Attrition (Attrition itself included).
# NOTE(review): nlargest ranks signed values, so strongly negative correlations are
# left out of this selection.
attrition_corr = data.corr()["Attrition"]
col = attrition_corr.nlargest(10).index

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    data[col].corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    annot_kws={"size": 15},
    ax=ax,
)

OverTime has the highest correlation with Attrition (0.27).