# The Problem: What genre do the people in their country prefer

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [4]:
# load data 
music_data = pd.read_csv('../data/clean_data/country_genre.csv')
music_data.head(1)

Unnamed: 0,country,top_genre
0,Japan,Reggae


## Preprocessing

In [5]:
# find all the unique countries 
countries = music_data['country'].values
np.unique(countries)

array(['Australia', 'Brazil', 'Canada', 'France', 'Germany', 'India',
       'Japan', 'South Korea', 'UK', 'USA'], dtype=object)

In [6]:
# find all unique genres 
genres = music_data['top_genre'].values
np.unique(genres)

array(['Classical', 'Country', 'EDM', 'Hip-Hop', 'Jazz', 'Metal', 'Pop',
       'R&B', 'Reggae', 'Rock'], dtype=object)

In [8]:
# one-hot encode function
def one_hot_encode(data, num_classes):
    encoded = np.zeros((len(data), num_classes))
    encoded[np.arange(len(data)), data] = 1
    return encoded

# encode the categorical data 
country_encoder = LabelEncoder()
genre_encoder = LabelEncoder()

country_encoded = country_encoder.fit_transform(countries)
genre_encoded = genre_encoder.fit_transform(genres)


# Create one-hot encoded features 
x = one_hot_encode(country_encoded, len(country_encoder.classes_))  # input: Countries
y = one_hot_encode(genre_encoded, len(genre_encoder.classes_))      # output: Genres

print(f"Input shape: {x.shape}") # Should be (5000,10)
print(f"Output shape: {y.shape}") # Should be (5000,10)


Input shape: (5000, 10)
Output shape: (5000, 10)


In [10]:
# split the data (train 60% , validate 20%, test 20%)
x_train, x_temp, y_train, y_temp = train_test_split(x,y, test_size=0.4, random_state=42)
x_val,x_test,y_val,y_test = train_test_split(x_temp,y_temp, test_size=0.5, random_state=42)

print(f'Training set: {x_train.shape[0]} samples')
print(f'Validation set: {x_val.shape[0]} samples')
print(f'Test set: {x_test.shape[0]} samples')

Training set: 3000 samples
Validation set: 1000 samples
Test set: 1000 samples


## Creation of Neural Network

In [27]:
# Creation of the Neural Network class 
class ClassificationNeuralNetwork:
    """
    2-layer Neural Network specialized for Classification tasks
    - Multi-class output with softmax activation
    - Cross-entropy loss function
    - Accuracy evaluation metric
    """
    
    def __init__(self, input_size, hidden_size, output_size):
        """
        Initialize classification neural network
        
        Args:
            input_size: Number of input features
            hidden_size: Number of hidden layer neurons
            output_size: Number of classes to predict
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        # Xavier initialization for stable training
        self.W1 = np.random.randn(input_size, hidden_size) / np.sqrt(input_size)
        self.b1 = np.zeros((1, hidden_size))
        
        self.W2 = np.random.randn(hidden_size, output_size) / np.sqrt(hidden_size)
        self.b2 = np.zeros((1, output_size))
        
    def relu(self, x):
        """ReLU activation: f(x) = max(0, x)"""
        return np.maximum(0, x)
    
    def relu_derivative(self, x):
        """ReLU derivative: f'(x) = 1 if x > 0, else 0"""
        return (x > 0).astype(float)
    
    def softmax(self, x):
        """
        Softmax activation for multi-class classification
        Converts logits to probability distribution
        """
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)
    
    def forward_pass(self, X):
        """
        Forward propagation for classification:
        z1 = X @ W1 + b1
        a1 = ReLU(z1)
        z2 = a1 @ W2 + b2
        a2 = softmax(z2)  # Probability distribution
        """
        # Hidden layer
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.relu(self.z1)
        
        # Output layer with softmax
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.softmax(self.z2)
        
        return self.a2
    
    def cross_entropy_loss(self, y_true, y_pred):
        """
        Cross-entropy loss for multi-class classification:
        L = -Σ(y_true * log(y_pred)) / m
        """
        epsilon = 1e-15  # Prevent log(0)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        
        m = y_true.shape[0]
        loss = -np.sum(y_true * np.log(y_pred)) / m
        return loss
    
    def backward_pass(self, X, y_true):
        """
        Backpropagation for classification:
        
        Output layer (softmax + cross-entropy):
        dL/dz2 = a2 - y_true
        
        Hidden layer:
        dL/da1 = dL/dz2 @ W2^T
        dL/dz1 = dL/da1 * ReLU'(z1)
        """
        m = X.shape[0]
        
        # Output layer gradients (softmax + cross-entropy derivative)
        dz2 = self.a2 - y_true
        dW2 = self.a1.T @ dz2 / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m
        
        # Hidden layer gradients
        da1 = dz2 @ self.W2.T
        dz1 = da1 * self.relu_derivative(self.z1)
        dW1 = X.T @ dz1 / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m
        
        return dW1, db1, dW2, db2
    
    def train(self, X_train, y_train, X_val, y_val, epochs=1000, learning_rate=0.01, early_stopping_patience=50):
        """Train classification neural network with validation monitoring"""
        train_losses = []
        val_losses = []
        val_accuracies = []
        
        best_val_loss = float('inf')
        patience_counter = 0
        best_weights = None
        
        for epoch in range(epochs):
            # Forward pass and training loss
            y_pred_train = self.forward_pass(X_train)
            train_loss = self.cross_entropy_loss(y_train, y_pred_train)
            train_losses.append(train_loss)
            
            # Backward pass and weight updates
            dW1, db1, dW2, db2 = self.backward_pass(X_train, y_train)
            
            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2
            
            # Validation evaluation
            y_pred_val = self.forward_pass(X_val)
            val_loss = self.cross_entropy_loss(y_val, y_pred_val)
            val_losses.append(val_loss)
            
            val_accuracy = self.accuracy(X_val, y_val)
            val_accuracies.append(val_accuracy)
            
            # Early stopping logic
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                best_weights = {
                    'W1': self.W1.copy(), 'b1': self.b1.copy(),
                    'W2': self.W2.copy(), 'b2': self.b2.copy()
                }
            else:
                patience_counter += 1
            
            if epoch % 100 == 0:
                print(f"Epoch {epoch} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")
            
            if patience_counter >= early_stopping_patience:
                print(f"Early stopping at epoch {epoch}. Best validation loss: {best_val_loss:.4f}")
                self.W1, self.b1 = best_weights['W1'], best_weights['b1']
                self.W2, self.b2 = best_weights['W2'], best_weights['b2']
                break
        
        return {
            'train_losses': train_losses,
            'val_losses': val_losses,
            'val_accuracies': val_accuracies
        }
    
    def predict(self, X):
        """Get class predictions (argmax of probabilities)"""
        probabilities = self.forward_pass(X)
        return np.argmax(probabilities, axis=1)
    
    def predict_proba(self, X):
        """Get prediction probabilities"""
        return self.forward_pass(X)
    
    def accuracy(self, X, y_true):
        """Calculate classification accuracy"""
        predictions = self.predict(X)
        y_true_classes = np.argmax(y_true, axis=1)
        return np.mean(predictions == y_true_classes)

## Using the Neural Network

Used Rule of Thumb: 2/3 Rule


Country → Genre (16 neurons)

- Input complexity: 10 countries with different music cultures
- Output complexity: 10 genres with complex relationships
- Data size: 5000 samples (enough to support 16 neurons)
- Reasoning: Countries have nuanced cultural preferences that need more neurons to capture

In [28]:
# Network architecture 
input_size = x.shape[1]  # should be 10 (countries)
hidden_size = 16         # Hidden layer neurons
output_size = y.shape[1] # 10 should genres

print(f"Architecture: {input_size} -> {hidden_size} -> {output_size}")

Architecture: 10 -> 16 -> 10


In [29]:
# Initialize network
nn = ClassificationNeuralNetwork(input_size, hidden_size, output_size)

# Train the network with validation monitoring
training_history = nn.train(x_train, y_train, x_val, y_val, epochs=1000, learning_rate=0.01, early_stopping_patience=50)

# Evaluate performance on all three sets 
train_accuracy = nn.accuracy(x_train, y_train)
val_accuracy = nn.accuracy(x_val, y_val)
test_accuracy = nn.accuracy(x_test,y_test)

print('\nFINAL RESULTS')
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Epoch 0 | Train Loss: 2.3196 | Val Loss: 2.3165 | Val Acc: 0.0890
Epoch 100 | Train Loss: 2.3151 | Val Loss: 2.3140 | Val Acc: 0.0890
Epoch 200 | Train Loss: 2.3119 | Val Loss: 2.3124 | Val Acc: 0.1100
Epoch 300 | Train Loss: 2.3096 | Val Loss: 2.3114 | Val Acc: 0.1040
Epoch 400 | Train Loss: 2.3078 | Val Loss: 2.3108 | Val Acc: 0.0980
Epoch 500 | Train Loss: 2.3065 | Val Loss: 2.3104 | Val Acc: 0.0980
Epoch 600 | Train Loss: 2.3054 | Val Loss: 2.3101 | Val Acc: 0.0980
Epoch 700 | Train Loss: 2.3045 | Val Loss: 2.3099 | Val Acc: 0.1040
Epoch 800 | Train Loss: 2.3037 | Val Loss: 2.3098 | Val Acc: 0.1040
Epoch 900 | Train Loss: 2.3031 | Val Loss: 2.3097 | Val Acc: 0.1030

FINAL RESULTS
Training Accuracy: 0.1063
Validation Accuracy: 0.1030
Test Accuracy: 0.0880


## These results are negative results.
With a random baseline of 10%, the training accuracy resulted in 10.6%, validation was 10.3%, and the test accuracy was only 8.8%. Essentially the network did not learn anything.
The loss also saw a minimal decrease from 2.3196 → 2.3031 (only 0.02 improvement).

## Some possible solutions:
1. Going to a bigger network
2. Higher Learning Rate
3. More training
4. Better patience

In [34]:
# Network architecture 
input_size = x.shape[1]  # should be 10 (countries)
hidden_size = 16        # Hidden layer neurons
output_size = y.shape[1] # 10 should genres

print(f"Architecture: {input_size} -> {hidden_size} -> {output_size}")

Architecture: 10 -> 16 -> 10


In [40]:
# Initialize network
nn = ClassificationNeuralNetwork(input_size, hidden_size, output_size)

# Train the network with validation monitoring
training_history = nn.train(x_train, y_train, x_val, y_val, epochs=2000, learning_rate=0.05, early_stopping_patience=200)

# Evaluate performance on all three sets 
train_accuracy = nn.accuracy(x_train, y_train)
val_accuracy = nn.accuracy(x_val, y_val)
test_accuracy = nn.accuracy(x_test,y_test)

print('\nFINAL RESULTS')
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Epoch 0 | Train Loss: 2.3287 | Val Loss: 2.3268 | Val Acc: 0.1080
Epoch 100 | Train Loss: 2.3120 | Val Loss: 2.3170 | Val Acc: 0.1070
Epoch 200 | Train Loss: 2.3060 | Val Loss: 2.3141 | Val Acc: 0.1070
Epoch 300 | Train Loss: 2.3030 | Val Loss: 2.3126 | Val Acc: 0.1170
Epoch 400 | Train Loss: 2.3010 | Val Loss: 2.3115 | Val Acc: 0.1160
Epoch 500 | Train Loss: 2.2995 | Val Loss: 2.3106 | Val Acc: 0.1160
Epoch 600 | Train Loss: 2.2984 | Val Loss: 2.3098 | Val Acc: 0.1190
Epoch 700 | Train Loss: 2.2975 | Val Loss: 2.3093 | Val Acc: 0.1210
Epoch 800 | Train Loss: 2.2967 | Val Loss: 2.3088 | Val Acc: 0.1210
Epoch 900 | Train Loss: 2.2961 | Val Loss: 2.3085 | Val Acc: 0.1200
Epoch 1000 | Train Loss: 2.2956 | Val Loss: 2.3082 | Val Acc: 0.1240
Epoch 1100 | Train Loss: 2.2951 | Val Loss: 2.3080 | Val Acc: 0.1190
Epoch 1200 | Train Loss: 2.2947 | Val Loss: 2.3079 | Val Acc: 0.1190
Epoch 1300 | Train Loss: 2.2943 | Val Loss: 2.3078 | Val Acc: 0.1190
Epoch 1400 | Train Loss: 2.2939 | Val Loss: 2.

## What I choose to do 
1. Kept the neurons the same. I noticed when I increased the neurons to 32 it resulted in more overfitting and less training
1. Raised Learning Rate: 0.1 -> 0.5
2. Raised training: 1000 -> 2000 epochs
3. Better patience: 50 -> 200 epochs

## Better but still not great results 

The training accuracy moved up to 12.4%, the validation accuracy moved up to 12%, but the test accuracy is 11.8%. These tiny gaps mean that there is healthy learning without memorization.
Yet these results are still not nearly good enough to consider the model ready for production to make any good predictions.

## Why is this occurring

I decided to take another look at the data and found that the countries almost have no preference when it comes to music taste.
The best country saw that Germany loves EDM, with 15.1 percent of people stating this was their top choice. But otherwise the average preference across each country was around 12.3%. This gives a data variance of 2.03 which is extremely low.
Essentially countries like all genres almost equally, and there are barely any signals to learn from.

## Was this a failure or a success?
This was actually a success. The Neural Network was successfully built and is working properly.

- There are no strong patterns in the data.
- When the model was predicting randomly it gave baseline accuracy, signifying that with improvement we might see better results.
- When the network received tiny improvements it reported 12% accuracy which represents weak signals in the data.

So overall the issue is not the network but the data I am training on.

## What are the possible solutions?
1. Ask a different question
2. Use multiple features to reveal any hidden patterns
3. Accept the results
4. Get more data


In [44]:
# Check for overfitting
if train_accuracy - test_accuracy > 0.1:
    print("Model may be overfitting (train-test gap > 10%)")
else:
    print("Model shows good generalization")

print(f'\nBest epoch: {training_history["best_epoch"] if "best_epoch" in training_history else "N/A"}')
print(f'Training stopped after {len(training_history["train_losses"])} epochs')

# Show genre preferences by country (for classification)
print("\n" + "="*50)
print("PREDICTIONS BY COUNTRY")
print("="*50)

# Create input for each country 
country_inputs = np.eye(len(country_encoder.classes_))
genre_probs = nn.predict_proba(country_inputs)  # Use predict_proba for cleaner interface

for i, country in enumerate(country_encoder.classes_):
    predicted_genre_idx = np.argmax(genre_probs[i])
    predicted_genre = genre_encoder.classes_[predicted_genre_idx]
    confidence = genre_probs[i][predicted_genre_idx]

    print(f"{country}: {predicted_genre} ({confidence:.2%} confidence)")



Model shows good generalization

Best epoch: N/A
Training stopped after 1725 epochs

PREDICTIONS BY COUNTRY
Australia: Reggae (11.99% confidence)
Brazil: Rock (12.88% confidence)
Canada: EDM (11.99% confidence)
France: Jazz (12.66% confidence)
Germany: EDM (11.99% confidence)
India: Reggae (12.21% confidence)
Japan: Jazz (10.75% confidence)
South Korea: Rock (11.29% confidence)
UK: Reggae (11.88% confidence)
USA: Reggae (12.26% confidence)
