# The Problem: What are by age what are the preferred listening times of users (Morning/Afternoon/Night)

In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

## Load Data


In [3]:
music_data = pd.read_csv('../data/clean_data/age_time_of_day.csv')

## Preprocessing 

In [4]:
age_feature = 'age'
target_feature = 'listening_time'

x_raw = music_data[age_feature].values.reshape(-1,1)
y_raw = music_data[target_feature].values

print(f"\nDataset Overview")
print(f"Total users: {len(music_data)}")
print(f"Age range: {x_raw.min():.0f} - {x_raw.max():.0f} years")
print(f"Average age: {x_raw.mean():.1f} years")


Dataset Overview
Total users: 5000
Age range: 13 - 60 years
Average age: 36.7 years


In [8]:
# Analyze target distribution 
unique_times, counts = np.unique(y_raw,return_counts=True)
print(f"\nListening Time Distributions:")
for time, count in zip(unique_times, counts):
    percentage = count / len(y_raw) * 100
    print(f"{time}: {counts} users ({percentage:.1f}%)")

# Check for class imbalance
max_count, min_count = counts.max(), counts.min()
imbalance_ratio = max_count / min_count
print(f"\nClass balance ratio: {imbalance_ratio:.2f}:1")
if imbalance_ratio > 2:
    print("Moderate class imbalance detected")
else: 
    print('classes are well balanced')
    # Encode target labels to integers
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y_raw)

    print(f"\nTarget encoding")
    for i, class_name in enumerate(label_encoder.classes_):
        print(f"{class_name} -> {i}")



Listening Time Distributions:
Afternoon: [1634 1621 1745] users (32.7%)
Morning: [1634 1621 1745] users (32.4%)
Night: [1634 1621 1745] users (34.9%)

Class balance ratio: 1.08:1
classes are well balanced

Target encoding
Afternoon -> 0
Morning -> 1
Night -> 2


In [29]:
# one-hot encode targets for neural network
def one_hot_encode(data,num_classes):
    """Convert integer labels to one-hot vectors"""
    encoded = np.zeros((len(data),num_classes))
    encoded[np.arange(len(data)),data] = 1
    return encoded

y_onehot = one_hot_encode(y_encoded, len(label_encoder.classes_))

## Normalize age feature 
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_raw)

print(f"\nFeature preprocessing:")
print(f"Original age range: {x_raw.min():.0f} - {x_raw.max():.0f}")
print(f"Scaled age range: {x_scaled.min():.2f} - {x_scaled.max():.2f}")
print(f"Age mean after scaling: {x_scaled.mean():.3f}")
print(f"Age std after scaling: {x_scaled.std():.3f}")

print(f"\nData shapes:")
print(f"Input features (X): {x_scaled.shape}")
print(f"Output labels (y): {y_onehot.shape}")



Feature preprocessing:
Original age range: 13 - 60
Scaled age range: -1.72 - 1.70
Age mean after scaling: -0.000
Age std after scaling: 1.000

Data shapes:
Input features (X): (5000, 1)
Output labels (y): (5000, 3)


In [21]:
# Split data into train/val/test (60%/20%/20%)
x_train, x_temp, y_train, y_temp = train_test_split(x_scaled,y_onehot, test_size=0.4, random_state=42)
x_val,x_test,y_val,y_test = train_test_split(x_temp,y_temp, test_size=0.5, random_state=42)

print(f"\nStratified data splits:")
print(f"Training set: {x_train.shape[0]} users")
print(f"Validation set: {x_val.shape[0]} users")
print(f"Test set: {x_test.shape[0]} users")


Stratified data splits:
Training set: 3000 users
Validation set: 1000 users
Test set: 1000 users


In [22]:
# Creation of the Neural Network class 
class ClassificationNeuralNetwork:
    """
    2-layer Neural Network specialized for Classification tasks
    - Multi-class output with softmax activation
    - Cross-entropy loss function
    - Accuracy evaluation metric
    """
    
    def __init__(self, input_size, hidden_size, output_size):
        """
        Initialize classification neural network
        
        Args:
            input_size: Number of input features
            hidden_size: Number of hidden layer neurons
            output_size: Number of classes to predict
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # Xavier initialization for stable training
        self.W1 = np.random.randn(input_size, hidden_size) / np.sqrt(input_size)
        self.b1 = np.zeros((1, hidden_size))
        
        self.W2 = np.random.randn(hidden_size, output_size) / np.sqrt(hidden_size)
        self.b2 = np.zeros((1, output_size))
        
    def relu(self, x):
        """ReLU activation: f(x) = max(0, x)"""
        return np.maximum(0, x)
    
    def relu_derivative(self, x):
        """ReLU derivative: f'(x) = 1 if x > 0, else 0"""
        return (x > 0).astype(float)
    
    def softmax(self, x):
        """
        Softmax activation for multi-class classification
        Converts logits to probability distribution
        """
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)
    
    def forward_pass(self, X):
        """
        Forward propagation for classification:
        z1 = X @ W1 + b1
        a1 = ReLU(z1)
        z2 = a1 @ W2 + b2
        a2 = softmax(z2)  # Probability distribution
        """
        # Hidden layer
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.relu(self.z1)
        
        # Output layer with softmax
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.softmax(self.z2)
        
        return self.a2
    
    def cross_entropy_loss(self, y_true, y_pred):
        """
        Cross-entropy loss for multi-class classification:
        L = -Σ(y_true * log(y_pred)) / m
        """
        epsilon = 1e-15  # Prevent log(0)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        
        m = y_true.shape[0]
        loss = -np.sum(y_true * np.log(y_pred)) / m
        return loss
    
    def backward_pass(self, X, y_true):
        """
        Backpropagation for classification:
        
        Output layer (softmax + cross-entropy):
        dL/dz2 = a2 - y_true
        
        Hidden layer:
        dL/da1 = dL/dz2 @ W2^T
        dL/dz1 = dL/da1 * ReLU'(z1)
        """
        m = X.shape[0]
        
        # Output layer gradients (softmax + cross-entropy derivative)
        dz2 = self.a2 - y_true
        dW2 = self.a1.T @ dz2 / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m
        
        # Hidden layer gradients
        da1 = dz2 @ self.W2.T
        dz1 = da1 * self.relu_derivative(self.z1)
        dW1 = X.T @ dz1 / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m
        
        return dW1, db1, dW2, db2
    
    def train(self, X_train, y_train, X_val, y_val, epochs=1000, learning_rate=0.01, early_stopping_patience=50):
        """Train classification neural network with validation monitoring"""
        train_losses = []
        val_losses = []
        val_accuracies = []
        
        best_val_loss = float('inf')
        patience_counter = 0
        best_weights = None
        
        for epoch in range(epochs):
            # Forward pass and training loss
            y_pred_train = self.forward_pass(X_train)
            train_loss = self.cross_entropy_loss(y_train, y_pred_train)
            train_losses.append(train_loss)
            
            # Backward pass and weight updates
            dW1, db1, dW2, db2 = self.backward_pass(X_train, y_train)
            
            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2
            
            # Validation evaluation
            y_pred_val = self.forward_pass(X_val)
            val_loss = self.cross_entropy_loss(y_val, y_pred_val)
            val_losses.append(val_loss)
            
            val_accuracy = self.accuracy(X_val, y_val)
            val_accuracies.append(val_accuracy)
            
            # Early stopping logic
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                best_weights = {
                    'W1': self.W1.copy(), 'b1': self.b1.copy(),
                    'W2': self.W2.copy(), 'b2': self.b2.copy()
                }
            else:
                patience_counter += 1
            
            if epoch % 100 == 0:
                print(f"Epoch {epoch} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")
            
            if patience_counter >= early_stopping_patience:
                print(f"Early stopping at epoch {epoch}. Best validation loss: {best_val_loss:.4f}")
                self.W1, self.b1 = best_weights['W1'], best_weights['b1']
                self.W2, self.b2 = best_weights['W2'], best_weights['b2']
                break
        
        return {
            'train_losses': train_losses,
            'val_losses': val_losses,
            'val_accuracies': val_accuracies
        }
    
    def predict(self, X):
        """Get class predictions (argmax of probabilities)"""
        probabilities = self.forward_pass(X)
        return np.argmax(probabilities, axis=1)
    
    def predict_proba(self, X):
        """Get prediction probabilities"""
        return self.forward_pass(X)
    
    def accuracy(self, X, y_true):
        """Calculate classification accuracy"""
        predictions = self.predict(X)
        y_true_classes = np.argmax(y_true, axis=1)
        return np.mean(predictions == y_true_classes)

In [23]:
# Network architecture
input_size = x_scaled.shape[1]    # 1 (age)
hidden_size = 10                  # Hidden layer neurons
output_size = len(label_encoder.classes_)  # 3 (Morning/Afternoon/Night)

print(f"Architecture: {input_size} -> {hidden_size} -> {output_size}")
print(f"Input: Age (normalized)")
print(f"Output: {len(label_encoder.classes_)} listening time categories")

# Calculate parameters
total_params = input_size * hidden_size + hidden_size * output_size + hidden_size + output_size
print(f"Total parameters: {total_params}")
print(f"Data points per parameter: {len(x_scaled) / total_params:.1f}")

Architecture: 1 -> 10 -> 3
Input: Age (normalized)
Output: 3 listening time categories
Total parameters: 53
Data points per parameter: 94.3


In [27]:
# Initialize network
nn = ClassificationNeuralNetwork(input_size, hidden_size, output_size)

# Train the network
training_history = nn.train(x_train, y_train, x_val, y_val, epochs=2000, learning_rate=0.1, early_stopping_patience=200)


# Evaluate performance on all three sets
train_accuracy = nn.accuracy(x_train, y_train)
val_accuracy = nn.accuracy(x_val, y_val)
test_accuracy = nn.accuracy(x_test, y_test)

print(f"\nFINAL RESULTS:")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Calculate baseline accuracy (random guessing)
random_baseline = 1.0 / len(label_encoder.classes_)
print(f"Random baseline: {random_baseline:.4f} ({random_baseline*100:.1f}%)")

Epoch 0 | Train Loss: 1.1412 | Val Loss: 1.1421 | Val Acc: 0.3350
Epoch 100 | Train Loss: 1.0983 | Val Loss: 1.0992 | Val Acc: 0.3300
Epoch 200 | Train Loss: 1.0982 | Val Loss: 1.0989 | Val Acc: 0.3460
Epoch 300 | Train Loss: 1.0981 | Val Loss: 1.0988 | Val Acc: 0.3460
Epoch 400 | Train Loss: 1.0981 | Val Loss: 1.0988 | Val Acc: 0.3470
Epoch 500 | Train Loss: 1.0981 | Val Loss: 1.0988 | Val Acc: 0.3470
Epoch 600 | Train Loss: 1.0981 | Val Loss: 1.0989 | Val Acc: 0.3450
Early stopping at epoch 664. Best validation loss: 1.0988

FINAL RESULTS:
Training Accuracy: 0.3447
Validation Accuracy: 0.3470
Test Accuracy: 0.3660
Random baseline: 0.3333 (33.3%)


# This model actual performed the best out of all the other problems.
The model shows that it is actually learning:
Current classification: 36.6% vs 33.3% random = 10% relative improvement!

There is good generalization meaning there is no overfitting with the test performing the best:
Training:   34.5%
Validation: 34.7%  
Test:       36.6%   <- Best performance!

The learning is also stable showing consistent, and steady improvement: 
Epoch 0:   33.5% (random level)
Epoch 664: 34.7% (learned patterns)

However it is not perfect:
- Only 3.3 percentage points above random
    * Could be that age alone does not strongly predict listening preferences
- Learning plateaus
    * Epoch 200-600: Accuracy stuck around 34.7%
    * It was able to find weak patterns quickly, then it plateaued
- Minimal Loss Reduction:
    * Loss: 1.14 -> 1.10 
    * The small decrease suggests limited learnable signal in the data

I am satisfied with these results because it shows:
- Age has some influence on listening time preferences 
- But age alone is not a strong predictor
- The relationship exists but is weak

What I could do is go back and add in some more features to see if there are other correlations. 

In [None]:
# Check for overfitting
if train_accuracy - test_accuracy > 0.1:
    print("Model may be overfitting (train-test gap > 10%)")
else:
    print("Model shows good generalization")

# Show listening time predictions across age spectrum
print("\n" + "="*60)
print("LISTENING TIME PREDICTIONS BY AGE")
print("="*60)

age_range = np.arange(18, 61, 5)  # Ages 18, 23, 28, ..., 58
print(f"Analyzing listening preferences across age spectrum:")

print(f"\n{'Age':<5} {'Morning':<10} {'Afternoon':<10} {'Night':<10} {'Predicted':<12}")
print("-" * 55)

for age in age_range:
    # Normalize age using same scaler
    age_normalized = scaler.transform([[age]])
    
    # Get predictions
    probabilities = nn.predict_proba(age_normalized)[0]
    predicted_class = np.argmax(probabilities)
    predicted_time = label_encoder.classes_[predicted_class]
    
    # Display probabilities and prediction
    morning_prob = probabilities[0] if label_encoder.classes_[0] == 'Morning' else probabilities[np.where(label_encoder.classes_ == 'Morning')[0][0]]
    afternoon_prob = probabilities[1] if label_encoder.classes_[1] == 'Afternoon' else probabilities[np.where(label_encoder.classes_ == 'Afternoon')[0][0]]
    night_prob = probabilities[2] if label_encoder.classes_[2] == 'Night' else probabilities[np.where(label_encoder.classes_ == 'Night')[0][0]]
    
    print(f"{age:<5} {morning_prob:<10.3f} {afternoon_prob:<10.3f} {night_prob:<10.3f} {predicted_time:<12}")

Model shows good generalization

LISTENING TIME PREDICTIONS BY AGE
Analyzing listening preferences across age spectrum:

Age   Morning    Afternoon  Night      Predicted   
-------------------------------------------------------
18    0.318      0.338      0.344      Night       
23    0.325      0.334      0.341      Night       
28    0.332      0.329      0.339      Night       
33    0.340      0.324      0.336      Morning     
38    0.330      0.319      0.351      Night       
43    0.330      0.320      0.350      Night       
48    0.329      0.322      0.349      Night       
53    0.329      0.324      0.348      Night       
58    0.328      0.325      0.347      Night       
