## Preprocessing

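1. Strip the bracketed text (`[...]`) from the column names
2. Create dummy variables for the categorical features
3. Create over-sampled, under-sampled, and combined (over- then under-sampled) training sets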

In [87]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

In [101]:
# Load both raw splits (first row is the header) and discard the
# 'Product ID' column straight away.
train = pd.read_csv('train.csv', header=0).drop(columns='Product ID')
test = pd.read_csv('test.csv', header=0).drop(columns='Product ID')

In [102]:
# Remove every bracketed fragment ("[...]") from the headers of both splits.
# Only the brackets and their content are removed, so whitespace that preceded
# a bracket stays in the name (hence the 'Tool wear ' spelling used below).
BRACKETED_TEXT = r'\[.*?\]'
for frame in (train, test):
    frame.columns = frame.columns.str.replace(BRACKETED_TEXT, '', regex=True)
    print(frame.shape)

(136429, 13)
(90954, 12)


In [103]:
# Preview the training frame after the header clean-up; the rendered output is
# abbreviated to the first and last rows.
train

Unnamed: 0,id,Type,Air temperature,Process temperature,Rotational speed,Torque,Tool wear,Machine failure,TWF,HDF,PWF,OSF,RNF
0,0,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
136424,136424,M,300.1,311.4,1530,37.5,210,0,0,0,0,0,0
136425,136425,H,297.5,308.5,1447,49.1,2,0,0,0,0,0,0
136426,136426,L,300.5,311.8,1524,38.5,214,0,0,0,0,0,0
136427,136427,L,301.7,310.9,1447,46.3,42,0,0,0,0,0,0


In [104]:
# Label column.
target = 'Machine failure'

# Columns that get one-hot encoded. 'Tool wear ' carries the trailing space that
# remains once the bracketed part of the raw header has been stripped.
cat_features = ['Type', 'TWF', 'HDF',
                'PWF', 'OSF','RNF', 'Tool wear ']

# Continuous measurement columns.
# - 'Product ID' is intentionally absent: it is dropped when the CSVs are loaded,
#   so listing it would raise a KeyError on first use.
# - The names keep the same trailing space as 'Tool wear ', assuming their raw
#   headers follow the same "name [unit]" layout -- confirm with train.columns.
num_features = ['Air temperature ', 'Process temperature ',
                'Rotational speed ', 'Torque ']

In [105]:
# One-hot encode the categorical columns of each split; remaining NaNs become 0.
train = pd.get_dummies(train, columns= cat_features, dtype=int).fillna(0)
test = pd.get_dummies(test, columns= cat_features, dtype=int).fillna(0)

# Each split is encoded on its own, so a level that occurs in only one of them
# would give the two frames different feature columns (equal column counts do
# not prove equal column sets). Force test onto train's feature layout:
# levels unseen in test become all-zero columns, test-only levels are dropped.
test = test.reindex(columns=train.columns.drop(target), fill_value=0)

print(train.shape)
print(test.shape)

(136429, 265)
(90954, 264)


In [106]:
# DataFrame.to_csv does not create missing folders; make sure the output
# directory exists so the notebook also runs on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)

# Persist the encoded splits without the 'id' column.
train.drop('id', axis=1).to_csv('data/train.csv', index=False)
test.drop('id', axis=1).to_csv('data/test.csv', index=False)

In [107]:
# Separate features from the label. 'id' is removed together with the label:
# it mirrors the row number (see the preview above) and is already excluded from
# the saved CSVs, so it must not reach the resamplers as if it were a feature.
X = train.drop(columns=['Machine failure', 'id'])
y = train['Machine failure'].values

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [108]:
# Over-sample the training split with SMOTE (validation rows are left untouched).
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Re-attach the labels as the last column under the real target name. A bare
# pd.Series would be written with the header "0". Assigning a plain array is
# positional, so it does not depend on the index of the resampled frame.
combined_data_smote = pd.DataFrame(X_train_smote).assign(
    **{target: np.asarray(y_train_smote)}
)

In [109]:
# Randomly under-sample the training split (validation rows are left untouched).
under_sampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = under_sampler.fit_resample(X_train, y_train)

# Re-attach the labels as the last column under the real target name. A bare
# pd.Series would be written with the header "0". Assigning a plain array is
# positional, so it does not depend on the index of the resampled frame.
combined_data_under = pd.DataFrame(X_train_under).assign(
    **{target: np.asarray(y_train_under)}
)

In [110]:
# Combined strategy: SMOTE first, then random under-sampling.
#
# With default settings SMOTE already brings the classes to 1:1, which leaves the
# under-sampler nothing to remove -- the result would just be the SMOTE set again.
# Explicit ratios (minority / majority after each step) make both stages do work.
# The values below are starting points to tune, not validated optima.
# NOTE(review): assumes y_train is binary and its current minority/majority
# ratio is below SMOTE_RATIO; SMOTE raises a ValueError otherwise.
SMOTE_RATIO = 0.1   # grow the minority class to 10% of the majority class
UNDER_RATIO = 0.5   # then shrink the majority class to twice the minority class

pipeline = make_pipeline(
    SMOTE(sampling_strategy=SMOTE_RATIO, random_state=42),
    RandomUnderSampler(sampling_strategy=UNDER_RATIO, random_state=42),
)
X_train_resampled, y_train_resampled = pipeline.fit_resample(X_train, y_train)

# Re-attach the labels as the last column under the real target name. A bare
# pd.Series would be written with the header "0". Assigning a plain array is
# positional, so it does not depend on the index of the resampled frame.
combined_data_resampled = pd.DataFrame(X_train_resampled).assign(
    **{target: np.asarray(y_train_resampled)}
)

In [111]:
# Write each resampled training set to its own CSV, without the index column.
resampled_outputs = [
    (combined_data_smote, 'data/combined_data_smote.csv'),
    (combined_data_under, 'data/combined_data_under.csv'),
    (combined_data_resampled, 'data/combined_data_resampled.csv'),
]
for frame, destination in resampled_outputs:
    frame.to_csv(destination, index=False)