<a href="https://colab.research.google.com/github/Hero0963/1000phone/blob/main/kaggle_binary_classification_of_machine_failures/machine_failures_nn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
files.upload()

In [None]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

rm: cannot remove '/root/.kaggle': No such file or directory


In [None]:
!kaggle competitions download -c playground-series-s3e17

Downloading playground-series-s3e17.zip to /content
  0% 0.00/3.49M [00:00<?, ?B/s]
100% 3.49M/3.49M [00:00<00:00, 220MB/s]


In [None]:
import zipfile
import pandas as pd

# 解壓縮zip檔案
with zipfile.ZipFile('/content/playground-series-s3e17.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/extracted_folder')


In [None]:
df_train = pd.read_csv('/content/extracted_folder/train.csv')

In [None]:
df_test = pd.read_csv('/content/extracted_folder/test.csv')

In [None]:
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
features = df_all.drop('Machine failure', axis=1)
label = df_all['Machine failure']

In [None]:
X = features
y = label

In [None]:
selected_features = ['HDF', 'OSF', 'PWF', 'TWF', 'Torque [Nm]']

In [None]:
X_s = X[selected_features]

#Normalize Features

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


# 標準化
scaler = StandardScaler()
features_normalized_standardized  = X_s.copy()
features_normalized_standardized = scaler.fit_transform(features_normalized_standardized)

In [None]:
df_train_data = df_all[df_all['Machine failure'].notnull()]
df_test_data = df_all[df_all['Machine failure'].isnull()]

In [None]:
X = df_train_data[selected_features]
y = df_train_data['Machine failure']

# OverSampling

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

# 創建Oversampling的實例
# 隨機過採樣（Random Over Sampling）
ros = RandomOverSampler(random_state=42)

# SMOTE過採樣（Synthetic Minority Over-sampling Technique）
smote = SMOTE(random_state=42)

# 使用Oversampling方法處理不平衡數據集
X_ros, y_ros = ros.fit_resample(X, y)  # 使用Random Over Sampling
X_smote, y_smote = smote.fit_resample(X, y)  # 使用SMOTE

# X_ros 和 y_ros 是經過Random Over Sampling後的新數據集
# X_smote 和 y_smote 是經過SMOTE過採樣後的新數據集

In [None]:
X = X_smote
y = y_smote

In [None]:
from sklearn.model_selection import train_test_split


train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

class Model(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_rate):
        super(Model, self).__init__()
        self.dropout_rate = dropout_rate
        self.num_layers = num_layers

        self.input_norm = nn.BatchNorm1d(input_size)

        self.hidden_layers = nn.ModuleList()
        self.hidden_layers.append(nn.Linear(input_size, hidden_size))

        for _ in range(num_layers - 1):
            self.hidden_layers.append(nn.Linear(hidden_size, hidden_size))

        self.predict = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.input_norm(x)

        for i in range(self.num_layers):
            x = self.hidden_layers[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout_rate, training=self.training)

        output = F.softmax(self.predict(x), dim=1)
        return output

# 模型、優化器初始化
input_size = train_X.shape[1]
hidden_size = 32
output_size = 2
num_layers = 3
dropout_rate = 0.1
model = Model(input_size, hidden_size, output_size, num_layers, dropout_rate)

optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.0) # 模型參數優化器
loss_func = torch.nn.BCELoss()

In [None]:
import tensorflow as tf
train_y_onehot = tf.keras.utils.to_categorical(train_y, num_classes=2)

In [None]:
# 資料格式轉成torch專用格式
x_data = torch.tensor(train_X.values, dtype=torch.float32)
y_data = torch.tensor(train_y_onehot, dtype=torch.float32)

In [None]:
batch_size = 128
num_epochs = 100
num_batches = len(train_X) // batch_size
loss_list = []

In [None]:
for epoch in range(num_epochs):
    for i in range(num_batches):
        start = i * batch_size
        end = start + batch_size
        prediction = model(x_data[start:end])
        loss = loss_func(prediction, y_data[start:end])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    loss = loss_func(model(x_data), y_data)
    loss_list.append(loss.item())




print(' ===== done =====')

print(loss_list)

 ===== done =====
[0.28374767303466797, 0.2791767716407776, 0.2769570052623749, 0.27599790692329407, 0.27504998445510864, 0.2747669517993927, 0.274265319108963, 49.94575881958008, 49.980831146240234, 49.936466217041016, 49.982547760009766, 49.8708381652832, 49.865718841552734, 49.96672058105469, 49.92203903198242, 49.902488708496094, 49.94763946533203, 49.8708381652832, 49.98580551147461, 49.8955078125, 49.96997833251953, 49.886199951171875, 49.9848747253418, 49.98440933227539, 49.969512939453125, 49.86013412475586, 49.96858215332031, 49.97742462158203, 49.97044372558594, 49.947174072265625, 50.069583892822266, 50.11845397949219, 50.119693756103516, 50.10356140136719, 50.10681915283203, 50.15010452270508, 50.14870834350586, 50.12776565551758, 50.152462005615234, 50.02583312988281, 50.14265823364258, 50.2236442565918, 50.070980072021484, 50.14731216430664, 50.2174186706543, 50.17198181152344, 50.186439514160156, 50.1445198059082, 50.249244689941406, 50.10356140136719, 50.10681915283203,

In [None]:
# 資料格式轉成torch專用格式
x_data = torch.tensor(test_X.values, dtype=torch.float32)

y_pred = model(x_data)
y_pred = y_pred.argmax(1) # one hot array to int array

In [None]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(test_y, y_pred))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     26879
         1.0       0.50      1.00      0.67     26834

    accuracy                           0.50     53713
   macro avg       0.25      0.50      0.33     53713
weighted avg       0.25      0.50      0.33     53713



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# ref: https://www.kaggle.com/code/kkhandekar/binary-classification-tensorflow-v-s-pytorch