In [3]:
# Data wrangling
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
# Off FutureWarnings
import warnings 
warnings.filterwarnings('ignore')
#Resampling
from imblearn.over_sampling import SMOTENC 
from sklearn.utils import class_weight
#Dimension Reduction
from sklearn.decomposition import PCA
# Preprocessing
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder, OneHotEncoder 
from sklearn.preprocessing import LabelEncoder
# Models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
# Models Pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
# Model evaluation
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import confusion_matrix
# Save model
import pickle

ModuleNotFoundError: No module named 'matplotlib'

### Section 2

In [None]:
## Load the dataset
# Turn off pandas' chained-assignment warning for this session before reading
# the data (the option does not influence how the CSV is parsed).
pd.set_option('mode.chained_assignment', None)
df = pd.read_csv('data.csv')
df

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [None]:
# Count the rows that are exact duplicates of an earlier row
duplicated_rows = df.duplicated()
data_duplicates = duplicated_rows.sum()
print("Total duplicated values: ", data_duplicates)


Total duplicated values:  0


In [None]:
# Count missing cells: first per column, then summed over all columns
nulls_per_column = df.isnull().sum()
data_null = nulls_per_column.sum()
print("Total missing values: ", data_null)


Total missing values:  0


In [None]:
# Inspect the column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

In [None]:
## Numeric features analysis: summary statistics (count, mean, std, quartiles) per numeric column
df.describe()

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,2886.89568,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7500.25,301.5,311.1,1612.0,46.8,162.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Cannot do 2.7 because there is no target variable

In [None]:
## Check the class balance of the "Machine failure" column
df['Machine failure'].value_counts()

Machine failure
0    9661
1     339
Name: count, dtype: int64

### Section 3

In [None]:
# Map the raw headers (which contain spaces and bracketed units) to
# underscore-separated names that are easier to reference in code.
rename_columns = {
    'UDI': 'ID',
    'Air temperature [K]': 'Air_temperature',
    'Process temperature [K]': 'Process_temperature',
    'Rotational speed [rpm]': 'Rotational_speed',
    'Torque [Nm]': 'Torque',
    'Tool wear [min]': 'Tool_wear',
    'Machine failure': 'Machine_failure',
}
# Rename in place; labels missing from the frame are silently skipped.
df.rename(columns=rename_columns, inplace=True)

In [None]:
# Analysing redundant values: count the failures in the "Target" column
# count_failures = (df['Target'] == 1).sum()
# print('Number of failures:', count_failures)
# # Sum of occurrences of 'Failure_type' other than 'No Failure'
# sum_failures = df.loc[(df['Failure_type'] != 'No Failure') & (df['Target'] == 1), 'Failure_type'].count()
# print('Sum of failures different of "No Failure":', sum_failures)

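## TODO: adapt this check to the current schema -- this dataset has 'Machine_failure' and the TWF/HDF/PWF/OSF/RNF flag columns, not 'Target'/'Failure_type'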


In [None]:
# Filter the rows that have 1 in the column "Target" and 'No Failure' in the column 'Failure_type'
# rows_to_remove = df[(df['Target'] == 1) & (df['Failure_type'] == 'No Failure')]
# Remove this filtered row in the main df
# df.drop(rows_to_remove.index, inplace=True)

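# TODO: rework this filter for the current schema ('Machine_failure' plus the TWF/HDF/PWF/OSF/RNF flags); 'Target' and 'Failure_type' do not exist in this dataset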

In [None]:
# List the column names after the rename step
df.columns

Index(['ID', 'Product ID', 'Type', 'Air_temperature', 'Process_temperature',
       'Rotational_speed', 'Torque', 'Tool_wear', 'Machine_failure', 'TWF',
       'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [None]:
# Remove unnecessary columns (the two identifier fields).
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution would otherwise fail with KeyError.
drop_columns = ["ID", "Product ID"]
df.drop(columns=drop_columns, inplace=True, errors='ignore')

In [1]:
# TODO: remove this scratch cell (it contains no code)