In [None]:
import torch
import torchvision.transforms as transforms
import torch.nn as nn

In [None]:
# Mount Google Drive into the Colab filesystem so that files stored there
# can be opened through ordinary file paths.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Locations of the training and test CSV files on the mounted Drive
train_path, test_path = (
    '/content/drive/My Drive/train.csv',
    '/content/drive/My Drive/test.csv',
)

In [None]:
# Read the training and test CSV files into dataframes
import pandas as pd

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Keep an untouched copy of the test data: `test_df` is preprocessed later on,
# while the customerID column is still needed when the predictions are saved.
# Copying the frame that was just loaded avoids reading the same file from
# Drive a second time.
original_test_df = test_df.copy()

In [None]:
# Numpy array versions of both dataframes
train_array, test_array = train_df.values, test_df.values

In [None]:
# Sanity check: show the (rows, columns) shape of the training array
train_shape = train_array.shape
print(train_shape)

(5343, 21)


In [None]:
# Preprocess the training data: clean it, encode categorical columns, split
# off a validation set, scale the features and wrap everything in PyTorch
# data loaders.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Drop the customerID column
train_df = train_df.drop('customerID', axis=1)

# Fill in missing values with the column median.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column: that chained in-place form operates on a temporary
# object, emits a FutureWarning on pandas 2.x and leaves the dataframe
# unchanged once Copy-on-Write is active.
total_charges_median = train_df['TotalCharges'].median()
train_df['TotalCharges'] = train_df['TotalCharges'].fillna(total_charges_median)

# Dictionary to hold the fitted LabelEncoders, keyed by column name, so the
# same encoding can be re-applied to other data later
label_encoders = {}

# Encode categorical (object-dtype) columns as integer codes
categorical_features = [column for column in train_df.columns if train_df[column].dtype == 'object']
for column in categorical_features:
    le = LabelEncoder()
    train_df[column] = le.fit_transform(train_df[column])
    label_encoders[column] = le

# Separate the features and the target
X = train_df.drop('Discontinued', axis=1)
y = train_df['Discontinued']

# Split the data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: the scaler is fitted on the training split only and
# then applied unchanged to the validation split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Convert the numpy arrays to float32 tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val.values, dtype=torch.float32)

# Create datasets and dataloaders (only the training batches are shuffled)
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
val_dataset = torch.utils.data.TensorDataset(X_val, y_val)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

In [None]:
# Create a neural network
class NeuralNetwork(nn.Module):
    """Fully connected binary classifier with two hidden ReLU layers.

    Args:
        input_dim: Number of input features per sample. Defaults to 19, the
            width that used to be hard-coded in the first layer.
        hidden_dim: Width of both hidden layers. Defaults to 64.

    The forward pass ends with a sigmoid, so the outputs lie in [0, 1] and
    can be passed directly to ``nn.BCELoss``.
    """

    def __init__(self, input_dim=19, hidden_dim=64):
        super().__init__()
        # Layer names are kept as fc1/fc2/fc3 so saved state dicts stay loadable
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` float tensor to ``(batch, 1)`` sigmoid outputs."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return torch.sigmoid(self.fc3(x))

# Seed PyTorch so that weight initialisation and mini-batch shuffling are
# reproducible across "Restart & Run All" (the train/validation split is
# already fixed with random_state=42).
torch.manual_seed(42)

model = NeuralNetwork()

# Loss function and optimizer
loss = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
model.train()
for epoch in range(20):
    epoch_loss_sum = 0.0
    epoch_samples = 0
    for features, target in train_loader:
        optimizer.zero_grad()
        output = model(features)
        # The target is reshaped to (batch, 1) to match the model output
        loss_value = loss(output, target.view(-1, 1))
        loss_value.backward()
        optimizer.step()
        # BCELoss returns the mean over the batch, so weight it by the batch
        # size to get a correct mean over the whole epoch (the last batch may
        # be smaller than the others).
        batch_size = features.size(0)
        epoch_loss_sum += loss_value.item() * batch_size
        epoch_samples += batch_size
    # Report the mean loss over the epoch rather than the loss of whichever
    # mini-batch happened to come last, which is a very noisy signal.
    print(f'Epoch {epoch+1}, Loss: {epoch_loss_sum / epoch_samples}')

# Evaluate the model (accuracy at a 0.5 threshold on the validation set)
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for features, target in val_loader:
        output = model(features)
        pred = torch.round(output)
        total += target.size(0)
        correct += (pred == target.view(-1, 1)).sum().item()
print(f'Accuracy: {correct / total}')

# Save the model
torch.save(model.state_dict(), 'model.pth')
# NOTE(review): the scaler and the label encoders are scikit-learn objects,
# so torch.save pickles them. Reading them back requires
# torch.load(..., weights_only=False) on recent PyTorch releases; only do
# that with files you trust, because unpickling can execute arbitrary code.
torch.save(scaler, 'scaler.pth')
torch.save(label_encoders, 'label_encoders.pth')

# calculate ROC AUC
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    y_pred = model(X_val).numpy()
print(roc_auc_score(y_val, y_pred))

Epoch 1, Loss: 0.5426204800605774
Epoch 2, Loss: 0.5018996000289917
Epoch 3, Loss: 0.396027147769928
Epoch 4, Loss: 0.30742505192756653
Epoch 5, Loss: 0.43631598353385925
Epoch 6, Loss: 0.44387638568878174
Epoch 7, Loss: 0.40216830372810364
Epoch 8, Loss: 0.384438693523407
Epoch 9, Loss: 0.3928157687187195
Epoch 10, Loss: 0.4448930621147156
Epoch 11, Loss: 0.3709146976470947
Epoch 12, Loss: 0.26193302869796753
Epoch 13, Loss: 0.3507553040981293
Epoch 14, Loss: 0.3616562783718109
Epoch 15, Loss: 0.433519184589386
Epoch 16, Loss: 0.3560882806777954
Epoch 17, Loss: 0.5302623510360718
Epoch 18, Loss: 0.2673327922821045
Epoch 19, Loss: 0.3325665295124054
Epoch 20, Loss: 0.31982773542404175
Accuracy: 0.7932647333956969
0.815541751175545


In [None]:
# Preprocess the test data with the same steps, and the same fitted
# statistics, that were applied to the training data.
# Start from the untouched copy so this cell can be re-run safely: dropping
# customerID from an already-processed `test_df` would raise a KeyError.
test_df = original_test_df.drop('customerID', axis=1)

# Fill in missing values with the TRAINING median, so the test set is imputed
# exactly like the training set instead of with its own statistics. (Filling
# missing entries with the median leaves the column median unchanged, so this
# is the same value that was used when the training data was imputed.)
# The result is assigned back rather than using fillna(..., inplace=True) on
# the selected column, which acts on a temporary object and does not update
# the dataframe once Copy-on-Write is active.
train_total_charges_median = train_df['TotalCharges'].median()
test_df['TotalCharges'] = test_df['TotalCharges'].fillna(train_total_charges_median)

# Exclude 'Discontinued' column
categorical_features = [column for column in categorical_features if column != 'Discontinued']

# Use saved label encoders to encode the categorical features.
# LabelEncoder.transform raises a ValueError for a category that was not
# present when the encoder was fitted.
for column in categorical_features:
    le = label_encoders[column]
    test_df[column] = le.transform(test_df[column])

# Scale the features with the scaler fitted on the training split
X_test_scaled = scaler.transform(test_df)

# Convert the numpy array to a float32 tensor
X_test = torch.tensor(X_test_scaled, dtype=torch.float32)

In [None]:
# Run the trained network on the test features; no gradients are needed
with torch.no_grad():
    test_scores = model(X_test)
predictions = test_scores.detach().numpy()

print(predictions)

# Build the output table: one row per customer with its ID and model output,
# then write it to disk without the dataframe index
submission_columns = {
    'ID': original_test_df['customerID'],
    'TARGET': predictions.flatten(),
}
output_df = pd.DataFrame(submission_columns)
output_df.to_csv('predictions.csv', index=False)

[[0.5738319 ]
 [0.23072152]
 [0.91608775]
 ...
 [0.01121193]
 [0.00475039]
 [0.29760617]]
