## Knowing your dataset - Pandas Python Library


- Data types
- Size of dataset
- Statistical Summary
- Unique values
- Concatenating datasets
- Changing data types
- Conditional filtering
- Dropping/renaming columns


In [1]:
import pandas as pd

Let's load the data

In [4]:
# Read both sheets in a single pass over the workbook instead of opening and
# parsing the file once per sheet. Passing a list to `sheet_name` makes
# read_excel return a dict of DataFrames keyed by sheet name.
employee_sheets = pd.read_excel(
    'employeeAttrition.xlsx',
    sheet_name=['CurrentEmployees', 'ChurnedEmployees'],
)
currentEmp = employee_sheets['CurrentEmployees']
churnedEmp = employee_sheets['ChurnedEmployees']

In [64]:
# Preview the first rows of the current-employee frame (head() shows 5 rows by default).
currentEmp.head()

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
0,2001,0.58,0.74,4,215,3,0,0,sales,low
1,2002,0.82,0.67,2,202,3,0,0,sales,low
2,2003,0.45,0.69,5,193,3,0,0,sales,low
3,2004,0.78,0.82,5,247,3,0,0,sales,low
4,2005,0.49,0.6,3,214,2,0,0,sales,low


In [5]:
# Preview the first 10 rows of the churned-employee frame.
churnedEmp.head(10)

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
0,1,0.38,0.53,2,157,3,0,0,sales,low
1,2,0.8,0.86,5,262,6,0,0,sales,medium
2,3,0.11,0.88,7,272,4,0,0,sales,medium
3,4,0.72,0.87,5,223,5,0,0,sales,low
4,5,0.37,0.52,2,159,3,0,0,sales,low
5,6,0.41,0.5,2,153,3,0,0,sales,low
6,7,0.1,0.77,6,247,4,0,0,sales,low
7,8,0.92,0.85,5,259,5,0,0,sales,low
8,9,0.89,1.0,5,224,5,0,0,sales,low
9,10,0.42,0.53,2,142,3,0,0,sales,low


## Data types of the datasets

In [6]:
# Data type of every column in the churned-employee frame.
churnedEmp.dtypes

Emp ID                     int64
satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
promotion_last_5years      int64
dept                      object
salary                    object
dtype: object

In [7]:
# Data type of every column in the current-employee frame.
currentEmp.dtypes

Emp ID                     int64
satisfaction_level       float64
last_evaluation          float64
number_project             int64
average_montly_hours       int64
time_spend_company         int64
Work_accident              int64
promotion_last_5years      int64
dept                      object
salary                    object
dtype: object

## Size of Dataset

In [8]:
# (number of rows, number of columns) of the current-employee frame.
currentEmp.shape

(11428, 10)

In [9]:
# (number of rows, number of columns) of the churned-employee frame.
churnedEmp.shape

(3571, 10)

## Statistical Summary

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the churned-employee frame.
churnedEmp.describe()

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
count,3571.0,3571.0,3571.0,3571.0,3571.0,3571.0,3571.0,3571.0
mean,6500.439653,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,0.005321
std,6266.484705,0.263933,0.197673,1.818165,61.202825,0.977698,0.212364,0.072759
min,1.0,0.09,0.45,2.0,126.0,2.0,0.0,0.0
25%,893.5,0.13,0.52,2.0,146.0,3.0,0.0,0.0
50%,1786.0,0.41,0.79,4.0,224.0,4.0,0.0,0.0
75%,12678.5,0.73,0.9,6.0,262.0,5.0,0.0,0.0
max,14999.0,0.92,1.0,7.0,310.0,6.0,1.0,1.0


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the current-employee frame.
currentEmp.describe()

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
count,11428.0,11428.0,11428.0,11428.0,11428.0,11428.0,11428.0,11428.0
mean,7812.340742,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251
std,3453.947461,0.217104,0.162005,0.979884,45.682731,1.562348,0.379991,0.159889
min,2001.0,0.12,0.36,2.0,96.0,2.0,0.0,0.0
25%,4857.75,0.54,0.58,3.0,162.0,2.0,0.0,0.0
50%,7714.5,0.69,0.71,4.0,198.0,3.0,0.0,0.0
75%,10571.25,0.84,0.85,4.0,238.0,4.0,0.0,0.0
max,14211.0,1.0,1.0,6.0,287.0,10.0,1.0,1.0


## Unique Values

In [12]:
# Number of distinct values in each column of the churned-employee frame.
churnedEmp.nunique()

Emp ID                   3571
satisfaction_level         81
last_evaluation            54
number_project              6
average_montly_hours      164
time_spend_company          5
Work_accident               2
promotion_last_5years       2
dept                       10
salary                      3
dtype: int64

In [13]:
# Number of distinct values in a single column, selected by name.
churnedEmp['last_evaluation'].nunique()

54

In [14]:
# Number of distinct values in each column of the current-employee frame.
currentEmp.nunique()

Emp ID                   11428
satisfaction_level          89
last_evaluation             65
number_project               5
average_montly_hours       192
time_spend_company           8
Work_accident                2
promotion_last_5years        2
dept                        10
salary                       3
dtype: int64

## Concatenate Datasets

In [15]:
# Add a 'churned' label column to both frames: 1 for employees who left,
# 0 for current employees. Assigning a scalar broadcasts it to every row,
# so there is no need to build a Python list as long as the frame.
churnedEmp['churned'] = 1
currentEmp['churned'] = 0

In [17]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to stack the two frames row-wise.
# ignore_index=True renumbers the rows 0..n-1 in the combined frame.
dataset = pd.concat([currentEmp, churnedEmp], ignore_index=True, sort=False)
dataset

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary,churned
0,2001,0.58,0.74,4,215,3,0,0,sales,low,0
1,2002,0.82,0.67,2,202,3,0,0,sales,low,0
2,2003,0.45,0.69,5,193,3,0,0,sales,low,0
3,2004,0.78,0.82,5,247,3,0,0,sales,low,0
4,2005,0.49,0.60,3,214,2,0,0,sales,low,0
...,...,...,...,...,...,...,...,...,...,...,...
14994,14995,0.40,0.57,2,151,3,0,0,support,low,1
14995,14996,0.37,0.48,2,160,3,0,0,support,low,1
14996,14997,0.37,0.53,2,143,3,0,0,support,low,1
14997,14998,0.11,0.96,6,280,4,0,0,support,low,1


## Changing data type

In [18]:
# Cast time_spend_company and number_project to object dtype so that they can
# be used for a bar plot.
columns_to_object = ['time_spend_company', 'number_project']
dataset = dataset.astype({column: 'object' for column in columns_to_object})

In [19]:
# Confirm the dtypes of the combined frame after the cast above.
dataset.dtypes

Emp ID                     int64
satisfaction_level       float64
last_evaluation          float64
number_project            object
average_montly_hours       int64
time_spend_company        object
Work_accident              int64
promotion_last_5years      int64
dept                      object
salary                    object
churned                    int64
dtype: object

## Conditional Filtering

In [20]:
# Select rows in dataset for which 'satisfaction_level' column contains values strictly less
# than 0.6

dataset[dataset['satisfaction_level'] < 0.6]

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary,churned
0,2001,0.58,0.74,4,215,3,0,0,sales,low,0
2,2003,0.45,0.69,5,193,3,0,0,sales,low,0
4,2005,0.49,0.60,3,214,2,0,0,sales,low,0
5,2006,0.36,0.95,3,206,4,0,0,sales,low,0
6,2007,0.54,0.37,2,176,2,0,0,sales,low,0
...,...,...,...,...,...,...,...,...,...,...,...
14994,14995,0.40,0.57,2,151,3,0,0,support,low,1
14995,14996,0.37,0.48,2,160,3,0,0,support,low,1
14996,14997,0.37,0.53,2,143,3,0,0,support,low,1
14997,14998,0.11,0.96,6,280,4,0,0,support,low,1


In [75]:
# Keep only the rows whose 'number_project' value is exactly 5, using a named
# boolean mask.
has_five_projects = dataset['number_project'] == 5
dataset.loc[has_five_projects]

Unnamed: 0,Emp ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary,churned
2,2003,0.45,0.69,5,193,3,0,0,sales,low,0
3,2004,0.78,0.82,5,247,3,0,0,sales,low,0
7,2008,0.99,0.91,5,136,4,0,0,sales,low,0
12,2013,0.48,0.94,5,255,6,0,0,accounting,medium,0
20,2021,0.79,0.97,5,266,2,0,0,technical,medium,0
...,...,...,...,...,...,...,...,...,...,...,...
14981,14982,0.73,0.93,5,162,4,0,0,technical,low,1
14983,14984,0.72,0.84,5,257,5,0,0,technical,medium,1
14985,14986,0.91,0.99,5,254,5,0,0,technical,medium,1
14987,14988,0.90,0.70,5,206,4,0,0,technical,low,1


## Dropping columns

In [21]:
# Drop columns by passing a list of column names via the `columns` keyword;
# the result is a new frame and `dataset` itself is left unchanged.
Data_reduced = dataset.drop(columns=['Emp ID'])

In [79]:
# Preview the reduced frame to confirm that the 'Emp ID' column is gone.
Data_reduced.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary,churned
0,0.58,0.74,4,215,3,0,0,sales,low,0
1,0.82,0.67,2,202,3,0,0,sales,low,0
2,0.45,0.69,5,193,3,0,0,sales,low,0
3,0.78,0.82,5,247,3,0,0,sales,low,0
4,0.49,0.6,3,214,2,0,0,sales,low,0


In [78]:
# (number of rows, number of columns) of the reduced frame.
Data_reduced.shape

(14999, 10)

## Renaming Columns

In [22]:
# General pattern: df = df.rename(columns={'old column name': 'new column name'})
# The result is only displayed here, not assigned, so `dataset` keeps 'Emp ID'.
column_renames = {'Emp ID': 'ID'}
dataset.rename(column_renames, axis='columns')

Unnamed: 0,ID,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary,churned
0,2001,0.58,0.74,4,215,3,0,0,sales,low,0
1,2002,0.82,0.67,2,202,3,0,0,sales,low,0
2,2003,0.45,0.69,5,193,3,0,0,sales,low,0
3,2004,0.78,0.82,5,247,3,0,0,sales,low,0
4,2005,0.49,0.60,3,214,2,0,0,sales,low,0
...,...,...,...,...,...,...,...,...,...,...,...
14994,14995,0.40,0.57,2,151,3,0,0,support,low,1
14995,14996,0.37,0.48,2,160,3,0,0,support,low,1
14996,14997,0.37,0.53,2,143,3,0,0,support,low,1
14997,14998,0.11,0.96,6,280,4,0,0,support,low,1


## Reference

https://thispointer.com/