# COMP7103C Data Mining
PyTorch

In [1]:
import numpy as np
import pandas as pd

data_train = pd.read_csv('Train.csv')
data_val = pd.read_csv('Validate.csv')
data_test = pd.read_csv('Test.csv')

data = pd.concat([data_train, data_val, data_test])

# The overview of data
data.head()

Unnamed: 0,ID,Married,Gender,Age,Graduate,Profession,Years_of_Working,Spending_Score,Family_Members,Category,Class(Target)
0,0,1,1.0,36,,Artist,9.0,Low,2.0,Cat_6,C
1,1,0,1.0,49,1.0,Artist,1.0,Low,1.0,Cat_6,A
2,2,0,,25,0.0,Homemaker,8.0,Low,1.0,Cat_3,B
3,3,1,0.0,77,1.0,Lawyer,0.0,High,2.0,Cat_4,A
4,4,1,0.0,39,1.0,Entertainment,1.0,Average,4.0,Cat_3,A


In [2]:
print('Missing Values:')
data.isnull().sum()

Missing Values:


ID                      0
Married                 0
Gender                190
Age                     0
Graduate              102
Profession            162
Years_of_Working     1098
Spending_Score          0
Family_Members        448
Category              108
Class(Target)        2139
dtype: int64

In [3]:
# Handling Missing Values

numerical_features = ['Years_of_Working ', 'Family_Members']
for feature in numerical_features:
    median_value = data[feature].median()
    data[feature] = data[feature].fillna(median_value)
    print(f"Imputed missing values in '{feature}' with median: {median_value}")

# Gender
data['Gender'] = data['Gender'].fillna(0.5)

# Categorical Features Imputation
data['Graduate'] = data['Graduate'].fillna(0.5)
print(f"Imputed missing values in 'Graduate' with mode: {data['Graduate'].mode()[0]}")

# Impute 'Profession' and 'Category' with 'Unknown'
data['Profession'] = data['Profession'].fillna('Unknown')
data['Category'] = data['Category'].fillna('Unknown')
print("Imputed missing values in 'Profession' and 'Category' with 'Unknown'.")

# Verify no missing values remain
print("\nMissing Values After Imputation:\n", data.isnull().sum())

Imputed missing values in 'Years_of_Working ' with median: 1.0
Imputed missing values in 'Family_Members' with median: 3.0
Imputed missing values in 'Graduate' with mode: 1.0
Imputed missing values in 'Profession' and 'Category' with 'Unknown'.

Missing Values After Imputation:
 ID                      0
Married                 0
Gender                  0
Age                     0
Graduate                0
Profession              0
Years_of_Working        0
Spending_Score          0
Family_Members          0
Category                0
Class(Target)        2139
dtype: int64


In [4]:
from sklearn.preprocessing import LabelEncoder

x = data.drop(['ID', 'Class(Target)'], axis=1)
y = data['Class(Target)']

# Identify binary and non-binary categorical columns
binary_cols = ['Married']
non_binary_cols = ['Profession', 'Spending_Score', 'Category', 'Gender', 'Graduate']

# Convert binary categorical features to integers if necessary
for col in binary_cols:
    # Ensure binary columns are of integer type
    x[col] = x[col].astype(int)

# One-hot encode non-binary categorical features
x = pd.get_dummies(x, columns=non_binary_cols, drop_first=True)  # drop_first to avoid multicollinearity

print("\nFeatures after encoding:")
print(x.head())

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # Converts categories like A, B, C to integers

print("\nEncoded target labels:", label_encoder.classes_)
print(y[:5])


Features after encoding:
   Married  Age  Years_of_Working   Family_Members  Profession_Doctor  \
0        1   36                9.0             2.0              False   
1        0   49                1.0             1.0              False   
2        0   25                8.0             1.0              False   
3        1   77                0.0             2.0              False   
4        1   39                1.0             4.0              False   

   Profession_Engineer  Profession_Entertainment  Profession_Executive  \
0                False                     False                 False   
1                False                     False                 False   
2                False                     False                 False   
3                False                     False                 False   
4                False                      True                 False   

   Profession_Healthcare  Profession_Homemaker  ...  Category_Cat_3  \
0                  

In [5]:
from sklearn.preprocessing import StandardScaler

numeric_features = ['Age', 'Years_of_Working ', 'Family_Members']
scaler = StandardScaler()
x[numeric_features] = scaler.fit_transform(x[numeric_features])

print("\nFeatures after scaling:")
x.head()


Features after scaling:


Unnamed: 0,Married,Age,Years_of_Working,Family_Members,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,...,Category_Cat_3,Category_Cat_4,Category_Cat_5,Category_Cat_6,Category_Cat_7,Category_Unknown,Gender_0.5,Gender_1.0,Graduate_0.5,Graduate_1.0
0,1,-0.447842,2.014771,-0.56549,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,True,False
1,0,0.327195,-0.447327,-1.230315,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,False,True
2,0,-1.103644,1.707008,-1.230315,False,False,False,False,False,True,...,True,False,False,False,False,False,True,False,False,False
3,1,1.996508,-0.755089,-0.56549,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,True
4,1,-0.268988,-0.447327,0.764161,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,True


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import copy

class UserDataset(Dataset):
    def __init__(self, features, labels):
        self.X = torch.tensor(features.to_numpy().astype(np.float32), dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Split the data into training and testing sets
x_train: pd.DataFrame = copy.deepcopy(x[:7487])
y_train: np.array = copy.deepcopy(y[:7487])

x_val_test = x[7487:]
y_val_test = y[7487:]
x_val: pd.DataFrame = x_val_test[:1069]
y_val: np.array = y_val_test[:1069]

x_test: pd.DataFrame = x_val_test[-2139:]
y_test: np.array = y_val_test[-2139:]

# Create datasets
train_dataset = UserDataset(x_train, y_train)
val_dataset = UserDataset(x_val, y_val)
test_dataset = UserDataset(x_test, y_test)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # Increased batch size for efficiency
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print("\nDataset and DataLoader created successfully.")

(2139, 26) (2139,)

Dataset and DataLoader created successfully.


In [7]:
import torch.nn as nn

class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # Added dropout for regularization
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

# Determine input size and number of classes
input_size = x_train.shape[1]
hidden_size = 150  # Increased hidden size for better learning capacity
num_classes = 4

# Initialize the model
model = Classifier(input_size, hidden_size, num_classes)
print("\nModel Summary:")
print(model)


Model Summary:
Classifier(
  (fc1): Linear(in_features=26, out_features=150, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=150, out_features=4, bias=True)
)


In [9]:
from sklearn.utils import compute_class_weight
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"\nUsing device: {device}")

# Compute class weights if classes are imbalanced
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)
print("\nClass weights computed and applied to the loss function.")

# Training loop with validation
num_epochs = 100
best_val_accuracy = 0

for epoch in range(num_epochs):
    # Training Phase
    model.train()
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_loader)

    # Validation Phase
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total

    # Save best model
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_user_classifier.pth')

    # Print metrics every 10 epochs
    if (epoch+1) % 10 == 0 or epoch == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%')

print(f'\nBest Validation Accuracy: {best_val_accuracy:.2f}%')


Using device: cpu

Class weights computed and applied to the loss function.
Epoch [1/100], Loss: 1.2301, Val Loss: 1.2256, Val Accuracy: 44.43%
Epoch [10/100], Loss: 1.1826, Val Loss: 1.2037, Val Accuracy: 48.36%
Epoch [20/100], Loss: 1.1651, Val Loss: 1.1946, Val Accuracy: 47.90%
Epoch [30/100], Loss: 1.1587, Val Loss: 1.2026, Val Accuracy: 48.18%
Epoch [40/100], Loss: 1.1487, Val Loss: 1.2005, Val Accuracy: 47.80%
Epoch [50/100], Loss: 1.1449, Val Loss: 1.2027, Val Accuracy: 47.52%
Epoch [60/100], Loss: 1.1377, Val Loss: 1.2068, Val Accuracy: 47.99%
Epoch [70/100], Loss: 1.1343, Val Loss: 1.2006, Val Accuracy: 47.61%
Epoch [80/100], Loss: 1.1296, Val Loss: 1.2050, Val Accuracy: 46.87%
Epoch [90/100], Loss: 1.1305, Val Loss: 1.2033, Val Accuracy: 47.99%
Epoch [100/100], Loss: 1.1244, Val Loss: 1.2107, Val Accuracy: 48.55%

Best Validation Accuracy: 48.83%


In [20]:
# 7. Evaluating the Model
# Load the best model
model.load_state_dict(torch.load('best_user_classifier.pth', weights_only=True))
model.eval()

all_preds = []
all_labels = []
with torch.no_grad():
    for batch_X, batch_y in val_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        _, predicted = torch.max(outputs.data, 1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

# Calculate Accuracy
test_accuracy = 100 * np.sum(np.array(all_preds) == np.array(all_labels)) / len(all_labels)
print(f'\nAccuracy of the model on the test set: {test_accuracy:.2f}%')

train_preds = []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        outputs = model(batch_X)
        _, predicted = torch.max(outputs.data, 1)
        train_preds.extend(predicted.cpu().numpy())

results = []
for result in train_preds:
    if result == 0:
        results.append('A')
    elif result == 1:
        results.append('B')
    elif result == 2:
        results.append('C')
    else:
        results.append('D')

    # results.append(result + 'A')

data_test['Class(Target)'] = results
data_test.to_csv('Test_Result.csv')


Accuracy of the model on the test set: 48.83%
