<a href="https://colab.research.google.com/github/HoseinNekouei/PyTorch_Binary-Prediction-of-Poisonous-Mushrooms/blob/main/Torch_binary_prediction_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Dataset**

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so later
# cells can read from and write to paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir /content/data

In [4]:
# Copy the competition archive from the mounted Drive into the working data directory.
# NOTE(review): the recorded run failed with "No such file or directory" -- confirm
# that the archive actually exists at this Drive path before running the notebook.
!cp /content/drive/MyDrive/dataset/playground-series-s4e8_2.zip /content/data

cp: cannot stat '/content/drive/MyDrive/dataset/playground-series-s4e8_2.zip': No such file or directory


In [5]:
!unzip /content/data/playground-series-s4e8_2.zip -d /content/data

unzip:  cannot find or open /content/data/playground-series-s4e8_2.zip, /content/data/playground-series-s4e8_2.zip.zip or /content/data/playground-series-s4e8_2.zip.ZIP.


In [6]:
# !rm /content/test.csv
# !rm /content/train.csv
# !rm /content/sample_submission.csv

# **Import Libraries**

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.metrics import matthews_corrcoef
import joblib  # For saving and loading the encoder

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [9]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

# **Step1: Data**




## **Preprocessing**

### *Load Train set*




In [10]:
# Read the training split and discard its 'id' column in one chained expression.
train_df = pd.read_csv('/content/data/train.csv').drop(columns=['id'])
train_df

FileNotFoundError: [Errno 2] No such file or directory: '/content/data/train.csv'

In [None]:
# Read the test split and discard its 'id' column.
test_df = pd.read_csv('/content/data/test.csv').drop(columns=['id'])

# The test file has no label; add a placeholder 'class' column (value 'z') in
# first position so the frame has a 'class' column like train_df.
test_df.insert(loc=0, column='class', value='z')
test_df.info()

In [None]:
# Print column dtypes and non-null counts of the test frame.
# NOTE(review): this repeats the test_df.info() call that ends the previous cell.
test_df.info()

### *Missing values (NaN)*

In [None]:
# Fraction of missing entries per training column (isnull is an alias of isna).
train_df.isnull().mean()

In [None]:
# Fraction of missing entries per test column (isnull is an alias of isna).
test_df.isnull().mean()

In [None]:
# Columns whose share of missing values exceeds this fraction are discarded.
missing_threshold = 0.85

# Per-column missing share, then keep only the labels above the threshold.
train_missing_share = train_df.isna().mean()
high_missing_train_columns = train_missing_share[train_missing_share > missing_threshold].index

train_df = train_df.drop(columns=high_missing_train_columns)
train_df.head()

In [None]:
# Diagnostic only: which test columns exceed the missing-value threshold.
high_missing_test_columns = test_df.columns[test_df.isna().mean() > missing_threshold]
print(high_missing_test_columns)

# Drop the columns that were selected on the TRAIN set rather than an
# independently computed test list. Train and test must keep identical feature
# columns, otherwise the encoder fitted on train and the model's input width
# no longer match the test frame. errors='ignore' tolerates a column that is
# already absent from the test frame.
test_df = test_df.drop(columns=high_missing_train_columns, errors='ignore')
test_df.head()

In [None]:
# DataFrame.mode() lists the most frequent value(s) per column; its first row
# gives one mode per column, stored here as {column: value}.
train_mode_values = train_df.mode().iloc[0].to_dict()
train_mode_values

In [None]:
# First row of DataFrame.mode() gives one most-frequent value per test column,
# stored here as {column: value}.
test_mode_values = test_df.mode().iloc[0].to_dict()
test_mode_values

In [None]:
# Median of every numeric training column, stored as {column: median}.
train_medians = train_df.median(numeric_only=True)
train_median_values = train_medians.to_dict()
train_median_values

In [None]:
# Median of every numeric test column, stored as {column: median}.
test_medians = test_df.median(numeric_only=True)
test_median_values = test_medians.to_dict()
test_median_values

In [None]:
# Fill missing values column by column: object (categorical) columns get the
# column mode, every other column gets the column median.
for column in train_df.columns[train_df.isna().any()]:
  if train_df[column].dtype == 'object':
    fill_value = train_mode_values[column]
  else:
    fill_value = train_median_values[column]
  train_df[column] = train_df[column].fillna(fill_value)

# Remaining NaN count per column. A count is used instead of the median of the
# NaN mask, because the median is 0 whenever fewer than half the rows are
# missing and would therefore hide leftover gaps.
train_df.isna().sum()

In [None]:
# Fill missing values column by column: object (categorical) columns get the
# column mode, every other column gets the column median.
for column in test_df.columns[test_df.isna().any()]:
  if test_df[column].dtype == 'object':
    fill_value = test_mode_values[column]
  else:
    fill_value = test_median_values[column]
  test_df[column] = test_df[column].fillna(fill_value)

# Remaining NaN count per column. A count is used instead of the median of the
# NaN mask, because the median is 0 whenever fewer than half the rows are
# missing and would therefore hide leftover gaps.
test_df.isna().sum()

### *Split the DataFrame into train and validation sets*

In [None]:
# Row count, feature count (every column except the 'class' target) and the
# number of distinct target values.
num_samples = train_df.shape[0]
num_features = train_df.shape[1] - 1
num_classes = train_df['class'].nunique(dropna=False)
print(f'num_sample: {num_samples}, num_features: {num_features}, num_classes: {num_classes}')

In [None]:
# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; the seed makes the split repeatable.
split_parts = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['class'],
    random_state=42,
)
train_df, val_df = split_parts
print(f'train_df: {train_df.shape}, val_df: {val_df.shape}')

### *Encoding*

In [None]:
# All object-typed columns (this includes the 'class' target) are encoded.
cat_col_name = train_df.select_dtypes(include=['object']).columns
print(cat_col_name)

# Categories never seen during fit are mapped to -1 instead of raising.
ordinal_encoder = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

# Fit on the training part only, then apply the same mapping to train and validation.
ordinal_encoder.fit(train_df[cat_col_name].astype(str))
train_df[cat_col_name] = ordinal_encoder.transform(train_df[cat_col_name].astype(str))
val_df[cat_col_name] = ordinal_encoder.transform(val_df[cat_col_name].astype(str))

print(train_df.head())

In [None]:
# Apply the encoder fitted on the training data to the test frame. The
# placeholder 'class' value 'z' was not seen during fit, so it becomes the
# encoder's unknown_value (-1).
test_df[cat_col_name] = ordinal_encoder.transform(test_df[cat_col_name].astype(str))

# Show the frame that was just transformed (the cell previously printed
# train_df, which this cell does not modify).
test_df.head()

In [None]:
# # Delete unused variable
# del train_df

# # Force  garbage collection
# import gc
# gc.collect()

In [None]:
# Check current RAM usage
# Report current system memory usage in GiB.
import psutil

bytes_per_gib = 1024 ** 3
ram_usage = psutil.virtual_memory()
used_gib = ram_usage.used / bytes_per_gib
available_gib = ram_usage.available / bytes_per_gib
print(f"Used RAM: {used_gib:.2f} GB")
print(f"Available RAM: {available_gib:.2f} GB")

### *Normalization*

In [None]:
# scaler= StandardScaler()
# num_features_list= list(train_median_values.keys())
# num_features_list

# train_df[num_features_list]= scaler.fit_transform(train_df[num_features_list])
# val_df[num_features_list]= scaler.transform(val_df[num_features_list])

# test_df[num_features_list]= scaler.transform(test_df[num_features_list])
# train_df.head(2)
# test_df.head(2)

### *Convert to PyTorch tensors*

In [None]:
# Separate the feature matrix from the 'class' target for both parts; the
# target is reshaped to a column vector of shape (n, 1).
X_train = train_df.drop(columns='class').to_numpy()
y_train = train_df['class'].to_numpy().reshape(-1, 1)

X_val = val_df.drop(columns='class').to_numpy()
y_val = val_df['class'].to_numpy().reshape(-1, 1)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

In [None]:
def _as_float_tensor(array):
  """Copy a NumPy array into a new float32 torch tensor."""
  return torch.tensor(array, dtype=torch.float32)

X_train, y_train = _as_float_tensor(X_train), _as_float_tensor(y_train)
X_val, y_val = _as_float_tensor(X_val), _as_float_tensor(y_val)

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_train[1]

In [None]:
# Feature matrix and (placeholder) target column vector for the test frame.
X_test = test_df.drop(columns='class').to_numpy()
y_test = test_df['class'].to_numpy().reshape(-1, 1)

X_test.shape, y_test.shape

In [None]:
# Convert both test arrays to float32 tensors in one pass.
X_test, y_test = (
    torch.tensor(array, dtype=torch.float32) for array in (X_test, y_test)
)

In [None]:
# Pair every feature tensor with its target tensor.
train_set, val_set, test_set = (
    TensorDataset(features, targets)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

In [None]:
# Mini-batches of 128 rows; only the training loader reshuffles every epoch.
loader_batch_size = 128
train_loader = DataLoader(train_set, shuffle=True, batch_size=loader_batch_size)
val_loader = DataLoader(val_set, batch_size=loader_batch_size)
test_loader = DataLoader(test_set, batch_size=loader_batch_size)

# **Step2: Model**

In [None]:
# Hidden widths are multiples of the input width.
nf_hidden_layer1 = 8 * num_features
nf_hidden_layer2 = 4 * num_features

# Linear+ReLU stack: num_features -> 8x -> 4x -> num_features. Layers are
# created in the same order as they appear in the network.
layer_widths = [num_features, nf_hidden_layer1, nf_hidden_layer2, num_features]
layers = []
for fan_in, fan_out in zip(layer_widths[:-1], layer_widths[1:]):
    layers.append(nn.Linear(in_features=fan_in, out_features=fan_out))
    layers.append(nn.ReLU())

# Output head: num_classes - 1 units squashed to (0, 1) by a sigmoid.
layers.append(nn.Linear(num_features, num_classes - 1))
layers.append(nn.Sigmoid())

model = nn.Sequential(*layers)

model.to(device)

# **Step3: Loss function**

In [None]:
# Binary cross-entropy computed on probabilities. The model defined earlier
# ends in nn.Sigmoid, so its outputs are already in (0, 1) as BCELoss expects.
loss_fn= nn.BCELoss()
loss_fn

# **Step4: Optimizer**

In [None]:
# Stochastic gradient descent with momentum over all model parameters.
opt = torch.optim.SGD(
    params=model.parameters(),
    momentum=0.9,
    lr=0.0001,
)
opt

# **Step5: Train loop**

In [None]:
def matthews_correlation_coefficient(y_true, y_pred):
    """
    Compute the Matthews Correlation Coefficient (MCC) with PyTorch tensor ops.

    MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))

    Args:
        y_true (torch.Tensor): Ground truth labels (binary, 0 or 1).
        y_pred (torch.Tensor): Predicted labels (binary, 0 or 1).

    Returns:
        torch.Tensor: Scalar MCC value; 0.0 (on y_true's device) when the
        denominator is zero.
    """
    # Boolean masks for each side of the confusion matrix
    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    predicted_pos, predicted_neg = (y_pred == 1), (y_pred == 0)

    # Confusion-matrix counts as float scalars
    tp = (actual_pos & predicted_pos).sum().float()
    tn = (actual_neg & predicted_neg).sum().float()
    fp = (actual_neg & predicted_pos).sum().float()
    fn = (actual_pos & predicted_neg).sum().float()

    denom = torch.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    # An empty row or column of the confusion matrix makes the ratio undefined
    if denom == 0:
        return torch.tensor(0.0, device=y_true.device)

    return ((tp * tn) - (fp * fn)) / denom

In [None]:
# Lowest validation loss seen so far; starts at a large sentinel so that the
# first epoch always counts as an improvement.
best_loss_val = 10000

# Per-epoch histories. Each name gets its own list: the previous form unpacked
# three names from only two lists and raised ValueError.
loss_train_hist, loss_val_hist, loss_test_hist = [], [], []
acc_train_hist, acc_val_hist, acc_test_hist = [], [], []

In [None]:
# Number of full passes over the training set.
epochs= 30

# Each epoch: one optimisation pass over train_loader, one gradient-free pass
# over val_loader, a metrics printout, and a checkpoint when the validation
# loss improves on best_loss_val.
for epoch in range(epochs):
  # Running sums for this epoch; turned into per-sample means after each pass.
  mean_loss_train, mean_acc_train, train_mcc= 0, 0, 0
  mean_loss_val, mean_acc_val, val_mcc= 0, 0, 0

  for x_batch, y_batch in train_loader:

    # Move the batch to the same device as the model
    x_batch = x_batch.to(device)
    y_batch= y_batch.to(device)

    # Forward pass: sigmoid outputs in (0, 1)
    y_hat= model(x_batch)

    # Batch-mean binary cross-entropy
    loss= loss_fn(y_hat, y_batch)

    # Back-propagate to populate parameter gradients
    loss.backward()

    # Apply the update, then clear gradients so they do not accumulate
    # into the next batch
    opt.step()
    opt.zero_grad()

    # loss.item() is a batch mean; weight it by the batch size so the epoch
    # mean stays exact even when the last batch is smaller.
    mean_loss_train += loss.item() * len(x_batch)
    # Number of correct predictions after rounding probabilities to 0/1
    mean_acc_train += torch.sum(y_hat.round() == y_batch).item()
    # train_mcc += matthews_correlation_coefficient(y_batch, y_hat.round())

  # Convert running sums into per-sample means and record them
  mean_loss_train /= len(train_set)
  mean_acc_train /= len(train_set)
  # train_mcc /= len(train_set)
  loss_train_hist.append(mean_loss_train)
  acc_train_hist.append(mean_acc_train)


  # Validation pass: no gradients are tracked and no parameters are updated
  with torch.no_grad():
    for x_batch, y_batch in val_loader:

      # Move the batch to the same device as the model
      x_batch = x_batch.to(device)
      y_batch= y_batch.to(device)

      y_hat= model(x_batch)

      loss= loss_fn(y_hat, y_batch)

      mean_loss_val += loss.item() * len(x_batch)
      mean_acc_val += torch.sum(y_hat.round() == y_batch).item()
      # val_mcc += matthews_correlation_coefficient(y_batch, y_hat.round())
      # print(f'Matthews Correlation Coefficient: {val_mcc:.3f}')

    mean_loss_val /= len(val_set)
    mean_acc_val /= len(val_set)
    # val_mcc /= len(val_set)
    loss_val_hist.append(mean_loss_val)
    acc_val_hist.append(mean_acc_val)

  # One summary line per epoch (adjacent f-strings are concatenated)
  print(f'epoch[{epoch}]: '
      f'Train_loss: {mean_loss_train:.3f} ,'
      f'Train_acc: {mean_acc_train:.3f} ,'
      # f'mcc: {train_mcc:.3f} '
      f'val_loss: {mean_loss_val:.3f} ,'
      f'val_acc: {mean_acc_val:.3f} ,'
      # f'mcc: {val_mcc:.3f}'
      )

  # Checkpoint the whole model object whenever the validation loss improves
  if mean_loss_val < best_loss_val:
    best_loss_val = mean_loss_val
    print('model saved!')
    print()
    torch.save(model,'/content/drive/MyDrive/Projects/best_model.pt')

In [None]:
# Loss curves. The x-axis is derived from the recorded history length rather
# than `epochs`: the histories are initialised in a separate cell, so running
# the training cell again appends further entries and a fixed-length x-axis
# would no longer match.
fig, ax = plt.subplots()
ax.plot(range(len(loss_train_hist)), loss_train_hist, label="Train")
ax.plot(range(len(loss_val_hist)), loss_val_hist, label="Valid")
ax.set(xlabel="Epoch", ylabel="Loss", title="Training vs. validation loss")
ax.legend();

In [None]:
# Accuracy curves. The x-axis is derived from the recorded history length
# rather than `epochs`: the histories are initialised in a separate cell, so
# running the training cell again appends further entries and a fixed-length
# x-axis would no longer match.
fig, ax = plt.subplots()
ax.plot(range(len(acc_train_hist)), acc_train_hist, label="Train")
ax.plot(range(len(acc_val_hist)), acc_val_hist, label="Valid")
ax.set(xlabel="Epoch", ylabel="Accuracy", title="Training vs. validation accuracy")
ax.legend();

In [None]:
# Reload the best checkpoint from the path the training loop writes to (the
# previous "best-model.pt" matched no file this notebook saves).
# map_location places the weights on the active device. weights_only=False is
# needed because the whole nn.Module was pickled, not just a state_dict.
# NOTE: full unpickling can execute arbitrary code -- only load checkpoints
# produced by this notebook.
model = torch.load(
    '/content/drive/MyDrive/Projects/best_model.pt',
    map_location=device,
    weights_only=False,
)
model.eval();

In [None]:
predictions = []

# Run inference on whatever device the model parameters live on; feeding CPU
# batches to a CUDA model raises a device-mismatch error.
model_device = next(model.parameters()).device

model.eval()
with torch.no_grad():
  for x_batch, _ in test_loader:
    y_hat = model(x_batch.to(model_device))
    # Flatten (batch, 1) outputs into plain Python floats so the DataFrame
    # column holds scalars instead of 1-element arrays.
    predictions.extend(y_hat.reshape(-1).cpu().tolist())


# Generate IDs starting from 3116945
# NOTE(review): assumes test ids are consecutive from this value; the real
# 'id' column was dropped when test.csv was loaded -- confirm against the file.
start_id = 3116945
ids = list(range(start_id, start_id + len(predictions)))

# Convert predictions to a DataFrame
predictions_df = pd.DataFrame({
    'ID': ids,  # Add ID column
    'Prediction': predictions  # Predicted probability per row
})


In [None]:
import ast

In [None]:
def _probability_to_label(value):
  """Convert one stored prediction into an integer class label (0 or 1).

  Accepts a float, a 1-element array/list, or the string form of either
  (e.g. "[0.73]"). The probability is rounded rather than truncated, matching
  the `y_hat.round()` rule used for accuracy during training; plain `int()`
  would turn every probability below 1.0 into 0.
  """
  if isinstance(value, str):
    value = ast.literal_eval(value)
  probability = float(np.asarray(value, dtype=float).reshape(-1)[0])
  return int(round(probability))

predictions_df['Prediction'] = predictions_df['Prediction'].apply(_probability_to_label)
predictions_df

In [None]:
# Integer class codes back to the competition's letter labels.
label_map = {0: 'e', 1: 'p'}

# Direct dictionary lookup per row; an unexpected code raises KeyError.
predictions_df['Prediction'] = predictions_df['Prediction'].apply(label_map.__getitem__)
predictions_df

In [None]:
# Rename the working column names to the ones used in the saved file.
submission_column_names = {'ID': 'id', 'Prediction': 'class'}
predictions_df = predictions_df.rename(columns=submission_column_names)
predictions_df

In [None]:
# Write the predictions as a CSV file on Drive, without the DataFrame index.
submission_path = '/content/drive/MyDrive/dataset/playground-series-s4e8_2.csv'
predictions_df.to_csv(submission_path, index=False)

print("Predictions with IDs saved to 'playground-series-s4e8_2.csv'")