In [1]:
#importing libraries
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import plotly_express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from plotly.offline import plot, iplot,init_notebook_mode

In [2]:
# Load the garment-worker productivity dataset.
# The path was previously an empty string, which makes read_csv raise
# FileNotFoundError; it is now the Kaggle input location of the CSV.
DATA_PATH = '../input/productivity-prediction-of-garment-employees/garments_worker_productivity.csv'
data = pd.read_csv(DATA_PATH)
data.head()

FileNotFoundError: [Errno 2] No such file or directory: '../input/productivity-prediction-of-garment-employees/garments_worker_productivity.csv'

In [None]:
# Work on a copy so the cleaning steps below do not modify the loaded `data` frame.
df = data.copy()

In [None]:
# Display the (rows, columns) dimensions of the working frame.
df.shape

In [None]:
# Print the column names, non-null counts and dtypes of the working frame.
df.info()

# Data Description
1. The dataset contains 1197 rows and 15 columns
 
Attribute Information:

1. date : Date in MM-DD-YYYY
2. day : Day of the Week
3. quarter : A portion of the month. A month was divided into four quarters
4. department : Associated department with the instance
5. team : Associated team number with the instance
6. no_of_workers : Number of workers in each team 
7. no_of_style_change : Number of changes in the style of a particular product
8. targeted_productivity : Targeted productivity set by the Authority for each team for each day.
9. smv : Standard Minute Value, it is the allocated time for a task 
10. wip : Work in progress. Includes the number of unfinished items for products 
11. over_time : Represents the amount of overtime by each team in minutes
12. incentive : Represents the amount of financial incentive (in BDT) that enables or motivates a particular course of action.
13. idle_time : The amount of time when the production was interrupted due to several reasons 
14. idle_men : The number of workers who were idle due to production interruption
15. actual_productivity : The actual % of productivity that was delivered by the workers. It ranges from 0-1.

The date, quarter, department and day columns are object datatypes; the rest are int or float types.

In [None]:
# Count the missing values in every column.
df.isnull().sum()

# Data Cleaning - Feature engineering

In [None]:
# Parse the raw date column into datetimes, then derive the month's name from it.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['month_name'] = parsed_dates.dt.month_name()

In [None]:
# New feature: overtime expressed in hours (the over_time column divided by 60).
df['overtime_in_hours'] = df['over_time'].div(60)

In [None]:
# Number of records falling in each quarter label.
df.loc[:, 'quarter'].value_counts()

In [None]:
# Inspect the raw department labels. An extra space produces three categories
# instead of two; that and the spelling of "sewing" are fixed in the next step.
list(df['department'].value_counts().index)

In [None]:
# Collapse the department labels to exactly two values: 'finishing' and 'sewing'.
# Whitespace is stripped first so that 'finishing ' and 'finishing' are treated
# alike; every other label is mapped to 'sewing', as before.
#
# The previous test `x == ('finishing ' or 'finishing')` only ever compared against
# 'finishing ' (the `or` of two non-empty strings yields the first one), so rows
# labelled exactly 'finishing' were wrongly relabelled as 'sewing'.
department_stripped = df['department'].str.strip()
df['department'] = department_stripped.where(department_stripped == 'finishing', 'sewing')

In [None]:
# Re-check the department labels after cleaning.
df['department'].value_counts().index.tolist()

In [None]:
# Number of records per weekday.
df['day'].value_counts() # Friday is not a working day

In [None]:
# Two-column table: department label and its number of records.
# The columns are named explicitly (rename_axis + reset_index(name=...)) instead of
# renaming 'index'/'department' afterwards: since pandas 2.0 value_counts() names its
# result 'count' and its index after the column, so the old rename no longer matched
# and `dept.total_num` did not exist.
dept = (
    df['department']
    .value_counts()
    .rename_axis('department')
    .reset_index(name='total_num')
)
dept

# Univariate analysis of categorical variables

In [None]:
# Pie chart of the number of records per department.
department_pie = go.Pie(
    labels=dept.department.to_list(),
    values=dept.total_num.to_list(),
)
slice_style = {
    'colors': ['green', 'yellow'],
    'line': {'color': '#000000', 'width': 2},
}
fig = go.Figure(data=[department_pie])
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='value',
    textfont_size=20,
    marker=slice_style,
)
fig.show()

In [None]:
# Donut chart of the number of records per quarter, with the last slice pulled out.
# The columns are named explicitly (rename_axis + reset_index(name=...)) so the cell
# works on pandas >= 2.0 as well, where value_counts() names its result 'count' and
# the old rename of 'index'/'quarter' no longer applies.
qdf = (
    df['quarter']
    .value_counts()
    .rename_axis('quarters')
    .reset_index(name='days_in_each_quarter')
)
# Pull out only the last (smallest) slice; built from the data so it matches the
# number of quarters present instead of assuming exactly five.
slice_pull = [0] * max(len(qdf) - 1, 0) + [0.4] * min(len(qdf), 1)
fig = go.Figure(data=[go.Pie(labels=qdf.quarters.to_list(),
                             values=qdf.days_in_each_quarter.to_list(),
                             pull=slice_pull, hole=.3)])
fig.show()

# Univariate analysis of continuous variables

**Histogram is representation of the distribution of numerical data**

In [None]:
# Histogram of the targeted productivity values (20 bins, dark theme).
fig = px.histogram(
    data_frame=df,
    x="targeted_productivity",
    nbins=20,
    template='plotly_dark',
)
title_layout = {'title': 'Distribution of Targeted productivity', 'title_x': 0.5}
fig.update_layout(**title_layout)
fig.show()

**A violin plot is a method of plotting numeric data. It is similar to a box plot, with the addition of a rotated kernel density plot on each side**

In [None]:
# Violin plot (with inner box and all points drawn) of actual productivity.
fig = px.violin(
    data_frame=df,
    y="actual_productivity",
    box=True,
    points='all',
)
title_layout = {'title': 'Distribution of actual_productivity', 'title_x': 0.5}
fig.update_layout(**title_layout)
fig.show()

In [None]:
# Histogram of smv with a violin drawn in the margin.
fig = px.histogram(
    data_frame=df,
    x="smv",
    marginal="violin",  # marginal may also be "rug", "box" or "histogram"
    template='ggplot2',
)
title_layout = {'title': 'Distribution of smv(standard minute value)', 'title_x': 0.5}
fig.update_layout(**title_layout)

fig.show()

In [None]:
# Two side-by-side bar charts over month name:
# left panel = no_of_workers, right panel = incentive.
months = df['month_name']
fig = make_subplots(rows=1, cols=2)
for panel_col, column_name in enumerate(('no_of_workers', 'incentive'), start=1):
    fig.add_trace(go.Bar(x=months, y=df[column_name]), row=1, col=panel_col)

fig.update_layout(
    title='Plots of number of workers and incentive paid in each month',
    title_x=0.5,
)

fig.show()

# Although fewer people worked in March, a high incentive was paid, which suggests they were working overtime.

In [None]:
# Scatter of idle time over the dates, coloured by team.
fig = px.scatter(
    data_frame=df,
    x="date",
    y="idle_time",
    color="team",
    template="plotly_dark",
)
title_layout = {'title': 'Idle time spent by Teams', 'title_x': 0.5}
fig.update_layout(**title_layout)
fig.show()

# Teams 7 and 8 had the most idle time

In [None]:
# One scatter panel per team showing the number of idle men over the dates.
fig = px.scatter(
    data_frame=df,
    x="date",
    y="idle_men",
    color="idle_men",
    facet_col="team",
    title="Number of Idle_men in Teams",
    template="plotly_dark",
)
# This title replaces the one passed to px.scatter above.
title_layout = {'title': 'Count of Idle men in Teams', 'title_x': 0.5}
fig.update_layout(**title_layout)
fig.update_xaxes(showgrid=False)

fig.show()

# Bivariate Analysis

# Actual productivity w.r.t departments

In [None]:
# Violin plot of actual productivity, one violin per department.
fig = px.violin(
    data_frame=df,
    y="actual_productivity",
    color='department',
    box=True,
    points='all',
)
fig.show()

In [None]:
# Actual against targeted productivity, coloured by department.
fig = px.scatter(
    data_frame=df,
    x="targeted_productivity",
    y="actual_productivity",
    color='department',
)
layout_font = {
    'family': "Courier New, monospace",
    'size': 18,
    'color': "rosybrown",
}
fig.update_layout(
    title_text='Actual vs Targeted Productivity',
    title_x=0.5,
    font=layout_font,
)

fig.show()

Both departments are exceeding targets

In [None]:
# Bars of actual productivity per department, one facet column per quarter.
fig = px.bar(
    data_frame=df,
    x="department",
    y="actual_productivity",
    color="department",
    facet_col="quarter",
    barmode="group",
)
layout_font = {
    'family': "Courier New, monospace",
    'size': 18,
    'color': "maroon",
}
fig.update_layout(
    title_text='Productivity of departments in each quarter',
    title_x=0.5,
    font=layout_font,
)

fig.show()

# Number of workers in each month

In [None]:
# Stacked horizontal bars of no_of_workers per month, one trace per department.
# Each trace is restricted to its own department's rows. Previously both traces
# plotted the whole frame, so "Sewing" and "Finishing" showed identical data.
# The keys are the department labels produced by the cleaning step earlier in the
# notebook ('sewing' / 'finishing'); values are (legend name, fill colour, edge colour).
trace_styles = {
    'sewing': ('Sewing', 'rgba(246, 78, 139, 0.6)', 'rgba(246, 78, 139, 1.0)'),
    'finishing': ('Finishing', 'rgba(58, 71, 80, 0.6)', 'rgba(58, 71, 80, 1.0)'),
}

fig = go.Figure()
for department_label, (legend_name, fill_colour, edge_colour) in trace_styles.items():
    department_rows = df[df['department'] == department_label]
    fig.add_trace(go.Bar(
        y=department_rows['month_name'].to_list(),
        x=department_rows['no_of_workers'].to_list(),
        name=legend_name,
        orientation='h',
        marker=dict(
            color=fill_colour,
            line=dict(color=edge_colour, width=3)
        )
    ))

fig.update_layout(barmode='stack')
fig.show()

In [None]:
# Mean actual productivity per weekday, highest first, shown as a bar chart.
daydf = (
    df.groupby('day')[['actual_productivity']]
    .mean()
    .sort_values('actual_productivity', ascending=False)
    .reset_index()
)
fig = px.bar(
    data_frame=daydf,
    x='day',
    y='actual_productivity',
    color='actual_productivity',
    hover_data=['day', 'actual_productivity'],
    labels={'actual_productivity': 'Productivity'},
    height=500,
)
fig.show()

**Surprisingly, Saturday is the most productive day, with an average of 0.75. This may be because Friday is a holiday: productivity tends to be higher after a break.**

In [None]:
# Mean actual productivity per team, highest first, shown as a bar chart.
teamdf = (
    df.groupby('team')[['actual_productivity']]
    .mean()
    .sort_values('actual_productivity', ascending=False)
    .reset_index()
)
fig = px.bar(
    data_frame=teamdf,
    x="team",
    y="actual_productivity",
    color='actual_productivity',
)
layout_font = {
    'family': "Courier New, monospace",
    'size': 18,
    'color': "green",
}
fig.update_layout(title_text='Productivity by Teams', title_x=0.5, font=layout_font)
fig.show()

# Team 1 is the most productive followed by Team 3

# Number of workers in each Team

In [None]:
# Bars of no_of_workers per team, coloured by department.
fig = px.bar(
    data_frame=df,
    x="team",
    y="no_of_workers",
    color='department',
    barmode="group",
)
layout_font = {
    'family': "Courier New, monospace",
    'size': 18,
    'color': "red",
}
fig.update_layout(title_text='Team size', title_x=0.5, font=layout_font)

fig.show()

# Incentives paid to different teams

In [None]:
# Mean incentive per team, highest first, shown as a bar chart.
idf = (
    df.groupby('team')[['incentive']]
    .mean()
    .sort_values('incentive', ascending=False)
    .reset_index()
)
fig = px.bar(data_frame=idf, x="team", y="incentive", barmode="group")
layout_font = {
    'family': "Courier New, monospace",
    'size': 18,
    'color': "green",
}
fig.update_layout(title_text='Incentive paid to each team', title_x=0.5, font=layout_font)

fig.show()

**Team 9 gets the highest incentive on average; let us check the box plots to see whether any outliers exist.**

In [None]:
# Box plot of the incentive values for each team.
fig = px.box(data_frame=df, x="team", y="incentive")
layout_font = {
    'family': "Courier New, monospace",
    'size': 10,
    'color': "red",
}
fig.update_layout(
    title_text='Incentive paid to each team - boxplot',
    title_x=0.5,
    font=layout_font,
)
fig.show()

**Because of the outlier, team 9's average incentive value is very high.**

# Only sewing department gets incentive

In [None]:
# Box plot of the incentive values for each department.
box_args = {'x': "department", 'y': "incentive"}
fig = px.box(data_frame=df, **box_args)
fig.show()

# Over-time vs Teams, department

In [None]:
# Box plot of overtime (in hours) for each department.
fig = px.box(df, x="department", y="overtime_in_hours")

# The plotted column is overtime_in_hours, so the y axis is labelled in hours
# (it previously read "Overtime in minutes", contradicting both data and title).
fig.update_layout(title_text='Overtime spent(in hours) vs Departments',xaxis_title = 'Department',yaxis_title='Overtime in hours', title_x=0.5,font=dict(
        family="Courier New, monospace",
        size=18,
        color="mediumvioletred"
    ))

fig.show()

**The median overtime worked by the finishing department is 1 day, and it is not paid any incentive, while the sewing department works 4 days of overtime.**

# Work in progress in departments, teams, quarters

In [None]:
# Box plot of work in progress (wip) for each department.
box_args = {'x': "department", 'y': "wip"}
fig = px.box(data_frame=df, **box_args)
fig.show()

## Work in progress in teams

In [None]:
# Box plot of work in progress (wip) for each team.
box_args = {'x': "team", 'y': "wip"}
fig = px.box(data_frame=df, **box_args)
fig.show()

# Work in progress in quarters

In [None]:
# Box plot of work in progress (wip) for each quarter.
box_args = {'x': "quarter", 'y': "wip"}
fig = px.box(data_frame=df, **box_args)
fig.show()

**This notebook is under construction. If you find it useful, please show your appreciation, and check out this dataset to create visualisations and submit tasks.**
**Thank you**