In [1]:
import warnings
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings("ignore")

In [2]:
# Read the defects CSV from the Kaggle input directory into the working frame.
defects_csv = '/kaggle/input/manufacturing-defects/defects_data.csv'
df = pd.read_csv(defects_csv)

In [3]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   defect_id          1000 non-null   int64  
 1   product_id         1000 non-null   int64  
 2   defect_type        1000 non-null   object 
 3   defect_date        1000 non-null   object 
 4   defect_location    1000 non-null   object 
 5   severity           1000 non-null   object 
 6   inspection_method  1000 non-null   object 
 7   repair_cost        1000 non-null   float64
dtypes: float64(1), int64(2), object(5)
memory usage: 62.6+ KB


In [4]:
# Relabel the raw snake_case columns with display-friendly names.
# The mapping is keyed by the original column name, so the result does not depend
# on the order (or number) of columns in the CSV, unlike a positional assignment
# to df.columns, which would silently mislabel columns if the order changed.
df = df.rename(columns={
  'defect_id': 'Defect ID',
  'product_id': 'Product ID',
  'defect_type': 'Defect Type',
  'defect_date': 'Defect Date',
  'defect_location': 'Defect Location',
  'severity': 'Severity',
  'inspection_method': 'Inspection Method',
  'repair_cost': 'Repair Cost',
})

In [5]:
# Summary statistics of the numeric columns, transposed so each column becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Defect ID,1000.0,500.5,288.819436,1.0,250.75,500.5,750.25,1000.0
Product ID,1000.0,50.837,29.480935,1.0,26.0,51.0,77.0,100.0
Repair Cost,1000.0,507.62715,289.623615,10.22,270.9025,506.43,759.065,999.64


In [6]:
# Parse the date strings once, then express each defect date as a whole-day
# offset from the earliest date in the data.
# The minimum must be taken AFTER parsing: 'Defect Date' holds M/D/YYYY strings,
# and a string minimum is lexicographic (e.g. '1/10/2024' < '1/2/2024'), which
# can pick the wrong reference date and produce negative offsets.
defect_dates = pd.to_datetime(df['Defect Date'])
md = defect_dates.min()
df['Days'] = (defect_dates - md).dt.days

In [7]:
# Preview the first rows, including the derived 'Days' column.
df.head()

Unnamed: 0,Defect ID,Product ID,Defect Type,Defect Date,Defect Location,Severity,Inspection Method,Repair Cost,Days
0,1,15,Structural,6/6/2024,Component,Minor,Visual Inspection,245.47,157
1,2,6,Functional,4/26/2024,Component,Minor,Visual Inspection,26.87,116
2,3,84,Structural,2/15/2024,Internal,Minor,Automated Testing,835.81,45
3,4,10,Functional,3/28/2024,Internal,Critical,Automated Testing,444.47,87
4,5,14,Cosmetic,4/26/2024,Component,Minor,Manual Testing,823.64,116


In [8]:
# Count missing values per column.
df.isnull().sum()

Defect ID            0
Product ID           0
Defect Type          0
Defect Date          0
Defect Location      0
Severity             0
Inspection Method    0
Repair Cost          0
Days                 0
dtype: int64

In [9]:
# True if any row is an exact duplicate of an earlier row.
df.duplicated().any()

False

In [10]:
# Categorical features: every object-dtype column except the raw date string.
object_frame = df.select_dtypes(include='O')
cat_features = object_frame.drop(columns='Defect Date').columns
# The two numeric features that are plotted separately from the categorical ones.
ot_features = ['Days', 'Repair Cost']

In [11]:
# One horizontal count histogram per categorical column (plus the raw date column),
# with the categories on the y axis ordered by total count.
for column in [*cat_features, 'Defect Date']:
  fig = px.histogram(df, y=column, color=column)
  fig.update_yaxes(categoryorder='total ascending')
  centered_title = dict(text=f'Distribution of {column}', y=0.95, x=0.5, xanchor='center', yanchor='top')
  fig.update_layout(title=centered_title)
  fig.show()

In [12]:
# Histogram with a marginal box plot for each numeric feature; every column is
# made available in the hover tooltip.
for feature in ot_features:
  fig = px.histogram(
    df,
    x=feature,
    marginal="box",
    hover_data=df.columns,
  )
  centered_title = dict(text=f'Distribution of {feature}', y=0.95, x=0.5, xanchor='center', yanchor='top')
  fig.update_layout(title=centered_title)
  fig.show()

In [13]:
# Count rows per product, keep the ten products with the highest counts,
# and draw them as a bar chart on a categorical x axis.
rows_per_product = df.groupby(['Product ID'], as_index=False)[['Product ID']].value_counts()
d_top = rows_per_product.nlargest(10, 'count')
fig = px.bar(d_top, x='Product ID', y='count', color='Product ID')
fig.update_xaxes(type='category', categoryorder='total ascending')
centered_title = dict(text='Top 10 Product IDs with The Most Defects', y=0.95, x=0.5, xanchor='center', yanchor='top')
fig.update_layout(title=centered_title)
fig.show()

In [14]:
# Count rows per product, keep the ten products with the lowest counts,
# and draw them as a bar chart on a categorical x axis.
# Only products that appear in the data at least once can be counted here.
rows_per_product = df.groupby(['Product ID'], as_index=False)[['Product ID']].value_counts()
d_bottom = rows_per_product.nsmallest(10, 'count')
fig = px.bar(d_bottom, x='Product ID', y='count', color='Product ID')
fig.update_xaxes(type='category', categoryorder='total ascending')
centered_title = dict(text='Top 10 Product IDs with The Least Defects', y=0.95, x=0.5, xanchor='center', yanchor='top')
fig.update_layout(title=centered_title)
fig.show()

In [15]:
# Box plot of repair cost split by the levels of each categorical feature.
for feature in cat_features:
  fig = px.box(df, x='Repair Cost', y=feature, color=feature)
  centered_title = dict(text=f'Repair Cost and {feature} Relation', y=0.95, x=0.5, xanchor='center', yanchor='top')
  fig.update_layout(title=centered_title)
  fig.show()

In [16]:
# Scatter of repair cost against the day offset, with an OLS trend line,
# a histogram along the x margin and a rug along the y margin.
fig = px.scatter(
  df,
  x='Days',
  y='Repair Cost',
  marginal_x="histogram",
  marginal_y="rug",
  trendline="ols",
)
centered_title = dict(text='Repair Cost and Days Relation', y=0.95, x=0.5, xanchor='center', yanchor='top')
fig.update_layout(title=centered_title)
fig.show()

In [17]:
# Drop the two identifier columns and the raw date string (its information is
# already carried by the derived 'Days' column).
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# lets the cell be re-run after the columns are gone without raising KeyError.
df = df.drop(columns=['Defect ID', 'Product ID', 'Defect Date'], errors='ignore')

In [18]:
# Replace each categorical column with integer codes.
# A separate fitted encoder is kept per column in `encoders`, so the codes can be
# mapped back to their labels later (encoders[col].classes_ lists the labels in
# code order; encoders[col].inverse_transform(...) decodes values). Reusing one
# encoder for all columns would overwrite the mapping on every iteration.
# `encoder` is still bound after the loop to the encoder of the last column.
# NOTE: re-running this cell on already-encoded data refits on the integer codes,
# replacing the stored label mappings.
encoders = {}
for cat in cat_features:
  encoder = LabelEncoder()
  df[cat] = encoder.fit_transform(df[cat])
  encoders[cat] = encoder

In [19]:
# Preview the frame after the identifier/date columns were dropped and the
# categorical columns were replaced by integer label codes.
df.head()

Unnamed: 0,Defect Type,Defect Location,Severity,Inspection Method,Repair Cost,Days
0,2,0,1,2,245.47,157
1,1,0,1,2,26.87,116
2,2,1,1,0,835.81,45
3,1,1,0,0,444.47,87
4,0,0,1,1,823.64,116


In [20]:
# Heat map of the pairwise correlation matrix of all remaining columns.
# NOTE(review): the categorical columns are integer label codes at this point, so
# their correlation values depend on the code assignment — interpret with care.
correlations = df.corr()
fig = px.imshow(correlations)
centered_title = dict(text='Correlation Between Numerical Attributes', y=0.95, x=0.5, xanchor='center', yanchor='top')
fig.update_layout(title=centered_title)
fig.show()