# The Problem: By age, what are the preferred listening times of users (Morning/Afternoon/Night)?

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Load Data


In [None]:
# Creation of the Neural Network class 
class ClassificationNeuralNetwork:
    """
    2-layer Neural Network specialized for Classification tasks
    - Hidden layer with ReLU activation
    - Multi-class output with softmax activation
    - Cross-entropy loss function (labels are passed one-hot encoded)
    - Accuracy evaluation metric
    - Full-batch gradient descent with early stopping on validation loss
    """

    def __init__(self, input_size, hidden_size, output_size):
        """
        Initialize classification neural network

        Args:
            input_size: Number of input features
            hidden_size: Number of hidden layer neurons
            output_size: Number of classes to predict
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Scale the random weights by 1/sqrt(fan_in) so early activations
        # keep a reasonable variance; biases start at zero.
        self.W1 = np.random.randn(input_size, hidden_size) / np.sqrt(input_size)
        self.b1 = np.zeros((1, hidden_size))

        self.W2 = np.random.randn(hidden_size, output_size) / np.sqrt(hidden_size)
        self.b2 = np.zeros((1, output_size))

    def relu(self, x):
        """ReLU activation: f(x) = max(0, x)"""
        return np.maximum(0, x)

    def relu_derivative(self, x):
        """ReLU derivative: f'(x) = 1 if x > 0, else 0"""
        return (x > 0).astype(float)

    def softmax(self, x):
        """
        Softmax activation for multi-class classification
        Converts each row of logits to a probability distribution
        """
        # Subtracting the row-wise max leaves the result unchanged but keeps
        # np.exp from overflowing on large logits.
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward_pass(self, X):
        """
        Forward propagation for classification:
        z1 = X @ W1 + b1
        a1 = ReLU(z1)
        z2 = a1 @ W2 + b2
        a2 = softmax(z2)  # Probability distribution

        The intermediate values (z1, a1, z2, a2) are cached on the instance
        because backward_pass reads them.

        Returns:
            Class probabilities, one row per input row.
        """
        # Hidden layer
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.relu(self.z1)

        # Output layer with softmax
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.softmax(self.z2)

        return self.a2

    def cross_entropy_loss(self, y_true, y_pred):
        """
        Cross-entropy loss for multi-class classification:
        L = -Σ(y_true * log(y_pred)) / m

        Args:
            y_true: One-hot encoded labels
            y_pred: Predicted class probabilities (same shape as y_true)

        Returns:
            Mean loss over the m rows of y_true.
        """
        epsilon = 1e-15  # Prevent log(0)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

        m = y_true.shape[0]
        loss = -np.sum(y_true * np.log(y_pred)) / m
        return loss

    def backward_pass(self, X, y_true):
        """
        Backpropagation for classification:

        Output layer (softmax + cross-entropy):
        dL/dz2 = a2 - y_true

        Hidden layer:
        dL/da1 = dL/dz2 @ W2^T
        dL/dz1 = dL/da1 * ReLU'(z1)

        Uses the activations cached by the most recent forward_pass call,
        so forward_pass(X) must have been called on the same X first.

        Returns:
            Tuple (dW1, db1, dW2, db2) of gradients averaged over the batch.
        """
        m = X.shape[0]

        # Output layer gradients (softmax + cross-entropy derivative)
        dz2 = self.a2 - y_true
        dW2 = self.a1.T @ dz2 / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m

        # Hidden layer gradients
        da1 = dz2 @ self.W2.T
        dz1 = da1 * self.relu_derivative(self.z1)
        dW1 = X.T @ dz1 / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m

        return dW1, db1, dW2, db2

    def train(self, X_train, y_train, X_val, y_val, epochs=1000, learning_rate=0.01, early_stopping_patience=50):
        """
        Train classification neural network with validation monitoring

        Runs full-batch gradient descent. After every update the validation
        loss is measured; the weights with the lowest validation loss are
        remembered and restored when training ends, whether it ended through
        early stopping or by exhausting the epoch budget.

        Args:
            X_train: Training features
            y_train: One-hot encoded training labels
            X_val: Validation features
            y_val: One-hot encoded validation labels
            epochs: Maximum number of epochs
            learning_rate: Gradient descent step size
            early_stopping_patience: Stop after this many consecutive epochs
                without an improvement in validation loss

        Returns:
            Dict with per-epoch 'train_losses', 'val_losses' and
            'val_accuracies', plus 'best_epoch' (index of the epoch with the
            lowest validation loss, or None if no epoch improved on infinity).
        """
        train_losses = []
        val_losses = []
        val_accuracies = []

        best_val_loss = float('inf')
        best_epoch = None
        best_weights = None
        patience_counter = 0

        for epoch in range(epochs):
            # Forward pass and training loss
            y_pred_train = self.forward_pass(X_train)
            train_loss = self.cross_entropy_loss(y_train, y_pred_train)
            train_losses.append(train_loss)

            # Backward pass and weight updates
            dW1, db1, dW2, db2 = self.backward_pass(X_train, y_train)

            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2

            # Validation evaluation: one forward pass serves both the loss
            # and the accuracy.
            y_pred_val = self.forward_pass(X_val)
            val_loss = self.cross_entropy_loss(y_val, y_pred_val)
            val_losses.append(val_loss)

            val_accuracy = np.mean(
                np.argmax(y_pred_val, axis=1) == np.argmax(y_val, axis=1)
            )
            val_accuracies.append(val_accuracy)

            # Early stopping logic
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                patience_counter = 0
                # Copy: the in-place "-=" updates above would otherwise
                # overwrite the snapshot.
                best_weights = {
                    'W1': self.W1.copy(), 'b1': self.b1.copy(),
                    'W2': self.W2.copy(), 'b2': self.b2.copy()
                }
            else:
                patience_counter += 1

            if epoch % 100 == 0:
                print(f"Epoch {epoch} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

            if patience_counter >= early_stopping_patience:
                print(f"Early stopping at epoch {epoch}. Best validation loss: {best_val_loss:.4f}")
                break

        # Always finish on the best weights seen. best_weights is None only
        # when no epoch produced a finite, improving validation loss (or
        # epochs == 0); in that case the current weights are kept.
        if best_weights is not None:
            self.W1, self.b1 = best_weights['W1'], best_weights['b1']
            self.W2, self.b2 = best_weights['W2'], best_weights['b2']

        return {
            'train_losses': train_losses,
            'val_losses': val_losses,
            'val_accuracies': val_accuracies,
            'best_epoch': best_epoch
        }

    def predict(self, X):
        """Get class predictions (argmax of probabilities)"""
        probabilities = self.forward_pass(X)
        return np.argmax(probabilities, axis=1)

    def predict_proba(self, X):
        """Get prediction probabilities"""
        return self.forward_pass(X)

    def accuracy(self, X, y_true):
        """Calculate classification accuracy against one-hot encoded labels"""
        predictions = self.predict(X)
        y_true_classes = np.argmax(y_true, axis=1)
        return np.mean(predictions == y_true_classes)

Rule of Thumb: 2/3 Rule — size the hidden layer at roughly 2/3 of the input layer size plus the output layer size.

Country → Minutes (8 neurons)

- Simpler problem: Predicting a single number (total minutes)
- Less complexity: Regression typically needs fewer neurons than classification
- Reasoning: Total streaming time has fewer patterns to learn than genre preferences

In [None]:
# Overfitting check: flag a train/test accuracy gap above 10 percentage points.
accuracy_gap = train_accuracy - test_accuracy
generalization_message = (
    "Model may be overfitting (train-test gap > 10%)"
    if accuracy_gap > 0.1
    else "Model shows good generalization"
)
print(generalization_message)

# Training summary; "best_epoch" is optional in the history, so fall back to N/A.
# NOTE(review): assumes training_history is a plain dict — confirm against the training cell.
best_epoch_label = training_history.get("best_epoch", "N/A")
epochs_run = len(training_history["train_losses"])
print(f'\nBest epoch: {best_epoch_label}')
print(f'Training stopped after {epochs_run} epochs')

# Per-country report of the most probable genre (for classification)
banner = "=" * 50
print("\n" + banner)
print("PREDICTIONS BY COUNTRY")
print(banner)

# One one-hot row per country: the identity matrix feeds each country through
# the network exactly once.
country_inputs = np.eye(len(country_encoder.classes_))
genre_probs = nn.predict_proba(country_inputs)

for country, country_probs in zip(country_encoder.classes_, genre_probs):
    top_idx = np.argmax(country_probs)
    top_genre = genre_encoder.classes_[top_idx]
    top_prob = country_probs[top_idx]

    print(f"{country}: {top_genre} ({top_prob:.2%} confidence)")