In [2]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import IsolationForest    
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

In [2]:
!pip install -r r'C:\Users\julie.ngan\OneDrive - Efrei\M2\pj_transverse\Automated-PdM\4Sight\requirements.txt' 
!pip install pandas
!pip install matplotlib
!pip install scikit-learn       

ERROR: Invalid requirement: '-'


Collecting pandas
  Downloading pandas-1.5.2-cp38-cp38-win_amd64.whl (11.0 MB)
     --------------------------------------- 11.0/11.0 MB 26.2 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Using cached pytz-2022.6-py2.py3-none-any.whl (498 kB)
Collecting numpy>=1.20.3
  Downloading numpy-1.23.5-cp38-cp38-win_amd64.whl (14.7 MB)
     --------------------------------------- 14.7/14.7 MB 25.1 MB/s eta 0:00:00
Installing collected packages: pytz, numpy, pandas
Successfully installed numpy-1.23.5 pandas-1.5.2 pytz-2022.6
Collecting matplotlib
  Using cached matplotlib-3.6.2-cp38-cp38-win_amd64.whl (7.2 MB)
Collecting cycler>=0.10
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting pillow>=6.2.0
  Using cached Pillow-9.3.0-cp38-cp38-win_amd64.whl (2.5 MB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.0.6-cp38-cp38-win_amd64.whl (163 kB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.38.0-py3-none-any.whl (965 kB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisol

## Data Preparation

In [3]:
def read_data(path):  #? date_col is a list

    data = pd.read_csv(path,
                            infer_datetime_format=True,
                            on_bad_lines='warn',
                            skip_blank_lines=True)

    try:
        df = data.sort_index()
    except:
        print("Unexpected error:", sys.exc_info()[0])
    print('\n', df.dtypes)
    return df

In [4]:
def nan(df):
    print("Process Nan...")
    df_numeric = df.select_dtypes(include=[np.number])
    numeric_cols = df_numeric.columns.values
    for col in numeric_cols:
        pct_missing = np.mean(df[col].isnull())
        print('{} - {}%'.format(col, round(pct_missing*100)))
        if (pct_missing < 4):                                             #* if NaN < 4% : replace by median
            med = df[col].median()
            df[col] = df[col].fillna(med)
        if pct_missing >= 20:                                           #* if NaN > 20% : drop features
            df = df.drop(columns=[col])
        if (pct_missing < 20) & (pct_missing >= 4) :                    #* if NaN < 20% & > 4% : drop lines
            df = df.dropna(subset=[col])

    df_non_numeric = df.select_dtypes(exclude=[np.number])              #* Repeat process with non numerics variables
    non_numeric_cols = df_non_numeric.columns.values
    for col in non_numeric_cols:
        pct_missing = np.mean(df[col].isnull())
        print('{} - {}%'.format(col, round(pct_missing*100)))
        if pct_missing < 4:
            med = df[col].median()
            df[col] = df[col].fillna(med)
        if pct_missing >= 20:
            df = df.drop(columns=[col])
        if pct_missing < 20 :
            df = df.dropna(subset=[col])
    print(df.shape)
    return df

def fix_typos(df):
    print("Fixing Typos...")
    obj = [col  for col, dt in df.dtypes.items() if dt == object]
    for col in obj:
        df[obj] = df[obj].str.replace(',', '.')
        df[obj] = df[obj].str.upper()
        df[obj] = df[obj].str.strip()
    print(df.shape)
    return df

def multiple_format(df, mult_var=None):                                 #* mult_var is a list
    print("Encoding categorical varible(s)...")
    if mult_var is not None:
        df = pd.get_dummies(data=df, columns=mult_var)
    print(df.shape)
    return df

def normalization(df):
    scaler = MinMaxScaler()
    scaler.fit_transform(df)
    return df

def suppressOutliers(df):
    clf = IsolationForest(random_state=42)
    param_grid = {'n_estimators': list(range(100, 1000, 10)), 
                'contamination': [0.005, 0.01, 0.02, 0.03, 0.05, 0.06, 0.07, 0.08], 
                'bootstrap': [True, False]}        

    grid_isol = RandomizedSearchCV(clf, 
                                    param_grid,
                                    scoring=custom_silhouette,              #? Davies Bouldin Score     or      Silhouette Score  
                                    refit=True,
                                    cv=3, 
                                    return_train_score=True)
    best_model = grid_isol.fit(df.values)
    custom_silhouette(best_model, df.values)
    custom_DBScrore(best_model, df.values)
    print('Optimum parameters', best_model.best_params_)
    y_pred = best_model.predict(df.values)
    train_clustered = df.assign(Cluster=y_pred)
    train_clustered = train_clustered.replace({-1: "Anomaly", 1: "Regular"})
    train_clustered["Cluster"].value_counts()


    """to_model_columns= train.columns[2:8]
    clf=IsolationForest(n_estimators=100, max_samples='auto', contamination=float(.12), max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
    clf.fit(df[to_model_columns])
    pred = clf.predict(df[to_model_columns])
    df['anomaly']=pred
    outliers=df.loc[df['anomaly']==-1]
    outlier_index=list(outliers.index)
    #print(outlier_index)
    #Find the number of anomalies and normal points here points classified -1 are anomalous
    print(df['anomaly'].value_counts())"""

    #df = df[df['anomaly'] == 1]

    # TO DO return value

def custom_silhouette(estimator, X):
      print("{}   -     ".format(round(silhouette_score(X, estimator.predict(X)), 4)), end = '')
      return np.mean(silhouette_score(X, estimator.predict(X)))

def custom_DBScrore(estimator, X):
      print(round(davies_bouldin_score(X, estimator.predict(X)), 4))
      return np.mean(davies_bouldin_score(X, estimator.predict(X)))


In [5]:
def data_prep(df, mult_var=None):
    #df = df.drop_duplicates(keep='last')            #* Keep only most recent duplicatas
    #df = fix_typos(df)                              #* Set a good typos for categorical features
    print(df)
    df = pd.get_dummies(data=df, columns=mult_var)
    #df = multiple_format(df, mult_var=None)         #* Encode categorical variables
    df = nan(df)                                    #* Process empty values based on several conditions
    df = normalization(df) 
    #df = suppressOutliers(df)
    df = df.convert_dtypes()                        #* Assign good type for the modelling phase
    df = df.select_dtypes(exclude=['object'])       #* Remove Object and String columns who are irrelevant
    #df = df.convert_dtypes()                        #* Assign good type for the modelling phase
                                                    # TODO: Verify order of functions and add Outliers Removal
    #print('\n', df.dtypes)
    return df

In [6]:
from asyncore import read
path = r'../data/raw/Classification/predictive_maintenance.csv'

df = read_data(path).drop(columns=["Product ID", "Failure Type"])
mult_var = ["Type"] #, "Failure Type"]
data_cleaned = data_prep(df,mult_var).set_index('UDI')

pd.set_option('display.max_columns', None)
data_cleaned


 UDI                          int64
Product ID                  object
Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Target                       int64
Failure Type                object
dtype: object
        UDI Type  Air temperature [K]  Process temperature [K]  \
0         1    M                298.1                    308.6   
1         2    L                298.2                    308.7   
2         3    L                298.1                    308.5   
3         4    L                298.2                    308.6   
4         5    L                298.2                    308.7   
...     ...  ...                  ...                      ...   
9995   9996    M                298.8                    308.4   
9996   9997    H                298.9                    308.4   
9997   9998    M                299.0   

Unnamed: 0_level_0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Type_H,Type_L,Type_M
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,298.1,308.6,1551,42.8,0,0,0,0,1
2,298.2,308.7,1408,46.3,3,0,0,1,0
3,298.1,308.5,1498,49.4,5,0,0,1,0
4,298.2,308.6,1433,39.5,7,0,0,1,0
5,298.2,308.7,1408,40.0,9,0,0,1,0
...,...,...,...,...,...,...,...,...,...
9996,298.8,308.4,1604,29.5,14,0,0,0,1
9997,298.9,308.4,1632,31.8,17,0,1,0,0
9998,299.0,308.6,1645,33.4,22,0,0,0,1
9999,299.0,308.7,1408,48.5,25,0,1,0,0


Correlation

In [7]:
corr_rank = data_cleaned.corr()['Target']
corr_rank.head(15)

Air temperature [K]        0.082556
Process temperature [K]    0.035946
Rotational speed [rpm]    -0.044188
Torque [Nm]                0.191321
Tool wear [min]            0.105448
Target                     1.000000
Type_H                    -0.023916
Type_L                     0.035643
Type_M                    -0.022432
Name: Target, dtype: float64

In [12]:
X

Unnamed: 0_level_0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Type_H,Type_L,Type_M
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,298.1,308.6,1551,42.8,0,0,0,1
2,298.2,308.7,1408,46.3,3,0,1,0
3,298.1,308.5,1498,49.4,5,0,1,0
4,298.2,308.6,1433,39.5,7,0,1,0
5,298.2,308.7,1408,40.0,9,0,1,0
...,...,...,...,...,...,...,...,...
9996,298.8,308.4,1604,29.5,14,0,0,1
9997,298.9,308.4,1632,31.8,17,1,0,0
9998,299.0,308.6,1645,33.4,22,0,0,1
9999,299.0,308.7,1408,48.5,25,1,0,0


In [8]:
data_cleaned.to_csv("../data/processed/predictive_maintenance.csv")

In [8]:
df

Unnamed: 0,UDI,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,1,M,298.1,308.6,1551,42.8,0,0
1,2,L,298.2,308.7,1408,46.3,3,0
2,3,L,298.1,308.5,1498,49.4,5,0
3,4,L,298.2,308.6,1433,39.5,7,0
4,5,L,298.2,308.7,1408,40.0,9,0
...,...,...,...,...,...,...,...,...
9995,9996,M,298.8,308.4,1604,29.5,14,0
9996,9997,H,298.9,308.4,1632,31.8,17,0
9997,9998,M,299.0,308.6,1645,33.4,22,0
9998,9999,H,299.0,308.7,1408,48.5,25,0
