# The Problem: What genre do the people in their country prefer

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [4]:
# load data 
music_data = pd.read_csv('../data/clean_data/country_genre.csv')
music_data.head(1)

Unnamed: 0,country,top_genre
0,Japan,Reggae


## Preprocessing

In [5]:
# find all the unique countries 
countries = music_data['country'].values
np.unique(countries)

array(['Australia', 'Brazil', 'Canada', 'France', 'Germany', 'India',
       'Japan', 'South Korea', 'UK', 'USA'], dtype=object)

In [6]:
# find all unique genres 
genres = music_data['top_genre'].values
np.unique(genres)

array(['Classical', 'Country', 'EDM', 'Hip-Hop', 'Jazz', 'Metal', 'Pop',
       'R&B', 'Reggae', 'Rock'], dtype=object)

In [8]:
# one-hot encode function
def one_hot_encode(data, num_classes):
    encoded = np.zeros((len(data), num_classes))
    encoded[np.arange(len(data)), data] = 1
    return encoded

# encode the categorical data 
country_encoder = LabelEncoder()
genre_encoder = LabelEncoder()

country_encoded = country_encoder.fit_transform(countries)
genre_encoded = genre_encoder.fit_transform(genres)


# Create one-hot encoded features 
x = one_hot_encode(country_encoded, len(country_encoder.classes_))  # input: Countries
y = one_hot_encode(genre_encoded, len(genre_encoder.classes_))      # output: Genres

print(f"Input shape: {x.shape}") # Should be (5000,10)
print(f"Output shape: {y.shape}") # Should be (5000,10)


Input shape: (5000, 10)
Output shape: (5000, 10)


### Creating training data (train, validate, test)

In [10]:
# split the data (train 60% , validate 20%, test 20%)
x_train, x_temp, y_train, y_temp = train_test_split(x,y, test_size=0.4, random_state=42)
x_val,x_test,y_val,y_test = train_test_split(x_temp,y_temp, test_size=0.5, random_state=42)

print(f'Training set: {x_train.shape[0]} samples')
print(f'Validation set: {x_val.shape[0]} samples')
print(f'Test set: {x_test.shape[0]} samples')

Training set: 3000 samples
Validation set: 1000 samples
Test set: 1000 samples


## Creation of Neural Network

In [19]:
# Creation of the Neural Network class 
class NeuralNetwork:
    """
    2-layer Neural Network from scratch
    Architecture: Input -> Hidden -> Output
    """
    
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights with small random values
        # xavier initialization: weights ~ N(0, 1/sqrt(fan_in))
        self.W1 = np.random.randn(input_size, hidden_size) / np.sqrt(input_size)
        self.b1 = np.zeros((1, hidden_size))
        
        self.W2 = np.random.randn(hidden_size, output_size) / np.sqrt(hidden_size)
        self.b2 = np.zeros((1, output_size))
        
    def relu(self, x):
        """ReLU activation: f(x) = max(0, x)"""
        return np.maximum(0, x)
    
    def relu_derivative(self, x):
        """ReLU derivative: f'(x) = 1 if x > 0, else 0"""
        return (x > 0).astype(float)
    
    def softmax(self, x):
        """Softmax activation for multi-class classification"""
        # Subtract max for numerical stability
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)
    
    def forward_pass(self, x):
        """
        Forward propagation:
        z1 = x @ W1 + b1
        a1 = ReLU(z1)
        z2 = a1 @ W2 + b2
        a2 = softmax(z2)
        """
        # Hidden layer
        self.z1 = x @ self.W1 + self.b1
        self.a1 = self.relu(self.z1)
        
        # Output layer
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.softmax(self.z2)
        
        return self.a2
    
    def compute_loss(self, y_true, y_pred):
        """Cross-entropy loss: L = -Σ(y_true * log(y_pred))"""
        # Add small epsilon to prevent log(0)
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        
        m = y_true.shape[0]
        loss = -np.sum(y_true * np.log(y_pred)) / m
        return loss
    
    def backward_pass(self, x, y_true):
        """
        Backpropagation using chain rule:
        
        1. Output layer gradients:
            dL/dz2 = a2 - y_true (softmax + cross-entropy derivative)
            dL/dW2 = a1^T @ (a2 - y_true)
            dL/db2 = sum(a2 - y_true)
        
        2. Hidden layer gradients:
            dL/da1 = (a2 - y_true) @ W2^T
            dL/dz1 = dL/da1 * ReLU'(z1)
            dL/dW1 = x^T @ dL/dz1
            dL/db1 = sum(dL/dz1)
        """
        m = x.shape[0]
        
        # Output layer gradients
        dz2 = self.a2 - y_true
        dW2 = self.a1.T @ dz2 / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m
        
        # Hidden layer gradients
        da1 = dz2 @ self.W2.T
        dz1 = da1 * self.relu_derivative(self.z1)
        dW1 = x.T @ dz1 / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m
        
        return dW1, db1, dW2, db2
    
    def train(self, x_train, y_train, x_val, y_val, epochs=1000, learning_rate=0.01, early_stopping_patience=50):
        """Train the neural network with validation monitoring and early stopping"""
        train_losses = []
        val_losses = []
        val_accuracies = []
        
        best_val_loss = float('inf')
        patience_counter = 0
        best_weights = None
        
        for epoch in range(epochs):
            # Forward pass on training data
            y_pred_train = self.forward_pass(x_train)
            
            # Compute training loss
            train_loss = self.compute_loss(y_train, y_pred_train)
            train_losses.append(train_loss)
            
            # Backward pass and weight updates
            dW1, db1, dW2, db2 = self.backward_pass(x_train, y_train)
            
            # Update weights using gradient descent:
            # W = W - learning_rate * dL/dW
            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2
            
            # Validation evaluation
            y_pred_val = self.forward_pass(x_val)
            val_loss = self.compute_loss(y_val, y_pred_val)
            val_losses.append(val_loss)
            
            val_accuracy = self.accuracy(x_val, y_val)
            val_accuracies.append(val_accuracy)
            
            # Early stopping logic
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                # Save best weights
                best_weights = {
                    'W1': self.W1.copy(),
                    'b1': self.b1.copy(),
                    'W2': self.W2.copy(),
                    'b2': self.b2.copy()
                }
            else:
                patience_counter += 1
            
            if epoch % 100 == 0:
                print(f"Epoch {epoch} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")
            
            # Early stopping
            if patience_counter >= early_stopping_patience:
                print(f"Early stopping at epoch {epoch}. Best validation loss: {best_val_loss:.4f}")
                # Restore best weights
                self.W1 = best_weights['W1']
                self.b1 = best_weights['b1']
                self.W2 = best_weights['W2']
                self.b2 = best_weights['b2']
                break
        
        return {
            'train_losses': train_losses,
            'val_losses': val_losses,
            'val_accuracies': val_accuracies,
            'best_epoch': len(train_losses) - patience_counter - 1
        }
    
    def predict(self, x):
        """Make predictions on new data"""
        predictions = self.forward_pass(x)
        return np.argmax(predictions, axis=1)
    
    def accuracy(self, x, y):
        """Calculate accuracy"""
        predictions = self.predict(x)
        y_true = np.argmax(y, axis=1)
        return np.mean(predictions == y_true)

## Using the Neural Network

Used Rule of Thumb: 2/3 Rule


Country → Genre (16 neurons)

- Input complexity: 10 countries with different music cultures
- Output complexity: 10 genres with complex relationships
- Data size: 5000 samples (enough to support 16 neurons)
- Reasoning: Countries have nuanced cultural preferences that need more neurons to capture

In [None]:
# Network architecture 
input_size = x.shape[1]  # should be 10 (countries)
hidden_size = 16         # Hidden layer neurons
output_size = y.shape[1] # 10 should genres

print(f"Architecture: {input_size} -> {hidden_size} -> {output_size}")

Architecture: 10 -> 16 -> 10


In [21]:
# Initialize network
nn = NeuralNetwork(input_size, hidden_size, output_size)

# Train the network with validation monitoring
training_history = nn.train(x_train, y_train, x_val, y_val, epochs=1000, learning_rate=0.01, early_stopping_patience=50)

# Evaluate performance on all three sets 
train_accuracy = nn.accuracy(x_train, y_train)
val_accuracy = nn.accuracy(x_val, y_val)
test_accuracy = nn.accuracy(x_test,y_test)

print('\nFINAL RESULTS')
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Epoch 0 | Train Loss: 2.3280 | Val Loss: 2.3243 | Val Acc: 0.1160
Epoch 100 | Train Loss: 2.3210 | Val Loss: 2.3200 | Val Acc: 0.1120
Epoch 200 | Train Loss: 2.3165 | Val Loss: 2.3176 | Val Acc: 0.1100
Epoch 300 | Train Loss: 2.3135 | Val Loss: 2.3162 | Val Acc: 0.1070
Epoch 400 | Train Loss: 2.3114 | Val Loss: 2.3153 | Val Acc: 0.1070
Epoch 500 | Train Loss: 2.3098 | Val Loss: 2.3147 | Val Acc: 0.1070
Epoch 600 | Train Loss: 2.3086 | Val Loss: 2.3143 | Val Acc: 0.1070
Epoch 700 | Train Loss: 2.3076 | Val Loss: 2.3140 | Val Acc: 0.1090
Epoch 800 | Train Loss: 2.3068 | Val Loss: 2.3138 | Val Acc: 0.1090
Epoch 900 | Train Loss: 2.3061 | Val Loss: 2.3136 | Val Acc: 0.1020

FINAL RESULTS
Training Accuracy: 0.1010
Validation Accuracy: 0.1030
Test Accuracy: 0.0970


## These results are negative results.

I expected the performance to be 10% accurate  (1 out of 10). But in reality I got -10% accuracy, so basically the model learned nothing.

## Some big red flags
- The loss barely decreases from 2.328 -> 2.306
- random-level accuracy: All -10% (train/val/test)
- Flat learning curve: No real improvement after 900 epochs

## Some possible solutions:
1. Going to a bigger network:  16 -> 32 neurons 
2. Higher Learning Rate: 0.1 - 0.5
3. More training: 1000 -> 2000 epochs
4. Better patience: 50 -> 200 epochs

In [22]:
# Initialize network
nn = NeuralNetwork(input_size, hidden_size, output_size)

# Train the network with validation monitoring
training_history = nn.train(x_train, y_train, x_val, y_val, epochs=2000, learning_rate=0.05, early_stopping_patience=200)

# Evaluate performance on all three sets 
train_accuracy = nn.accuracy(x_train, y_train)
val_accuracy = nn.accuracy(x_val, y_val)
test_accuracy = nn.accuracy(x_test,y_test)

print('\nFINAL RESULTS')
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Epoch 0 | Train Loss: 2.3191 | Val Loss: 2.3342 | Val Acc: 0.0990
Epoch 100 | Train Loss: 2.3089 | Val Loss: 2.3184 | Val Acc: 0.1060
Epoch 200 | Train Loss: 2.3053 | Val Loss: 2.3136 | Val Acc: 0.1060
Epoch 300 | Train Loss: 2.3031 | Val Loss: 2.3115 | Val Acc: 0.0940
Epoch 400 | Train Loss: 2.3014 | Val Loss: 2.3105 | Val Acc: 0.0950
Epoch 500 | Train Loss: 2.3002 | Val Loss: 2.3098 | Val Acc: 0.0950
Epoch 600 | Train Loss: 2.2992 | Val Loss: 2.3094 | Val Acc: 0.0950
Epoch 700 | Train Loss: 2.2984 | Val Loss: 2.3091 | Val Acc: 0.0950
Epoch 800 | Train Loss: 2.2977 | Val Loss: 2.3089 | Val Acc: 0.0950
Epoch 900 | Train Loss: 2.2972 | Val Loss: 2.3088 | Val Acc: 0.0940
Epoch 1000 | Train Loss: 2.2967 | Val Loss: 2.3087 | Val Acc: 0.0980
Epoch 1100 | Train Loss: 2.2962 | Val Loss: 2.3087 | Val Acc: 0.0990
Epoch 1200 | Train Loss: 2.2958 | Val Loss: 2.3086 | Val Acc: 0.0930
Epoch 1300 | Train Loss: 2.2955 | Val Loss: 2.3086 | Val Acc: 0.0930
Epoch 1400 | Train Loss: 2.2951 | Val Loss: 2.

## Better but still not great results 

The accuracy moved up to 12.1%, random baseline increased to 10.4% and the loss reduced to 2.29 which is still minimal learning. 

Yet these results are still not nearly as good enough to consider the model to be sent into production to make any good predictions. 

## Why is this occurring

I decided to take another look a the data and found that the countries almost have no preference when it comes to music taste. 

The best country saw that Germany loves EDM, with 15.1 percent of people stating this was their top choice. But otherwise the average preference across each country was around 12.3%. This gives a data variance of 2.03 which is extremely low. 

Essentially countries like all genres almost equally, and there are barely any signals to learn from. 

## Was this a failure or a success?
This was a actually success. The Neural Network was successfully built and is working properly.

- There are no strong patterns in teh data 
- When the model was prediction randomly it gave -10 accuracy signifying that with improvement we might see more better results.
- When teh network received tiny improvements it reported 12% accuracy which represents weak signals in the data. 

So overall the issue is not the network but the data I am training on. 

## What are the possible solutions?
1. Ask a different question
2. Use multiple features to reveal any hidden patterns
3. Accept the results
4. Get more data


In [23]:
# Check for overfitting
if train_accuracy - test_accuracy > 0.1:
    print("model may be overfitting (train-test gap> 10%)")
else:
    print("Model shows good generalization")

print(f'\nBest epoch: {training_history['best_epoch']}')
print(f'Training stopped after {len(training_history['train_losses'])} epochs')

# show genre preferences by country
# create input for each country 
country_inputs = np.eye(len(country_encoder.classes_))
genre_probs = nn.forward_pass(country_inputs)

for i, country in enumerate(country_encoder.classes_):
    predicted_genre_idx = np.argmax(genre_probs[i])
    predicted_genre = genre_encoder.classes_[predicted_genre_idx]
    confidence = genre_probs[i][predicted_genre_idx]

    print(f"{country}: {predicted_genre} ({confidence:.2f} confidence)")



Model shows good generalization

Best epoch: 1460
Training stopped after 1661 epochs
Australia: Classical (0.12 confidence)
Brazil: R&B (0.11 confidence)
Canada: R&B (0.11 confidence)
France: Reggae (0.12 confidence)
Germany: Metal (0.11 confidence)
India: Reggae (0.12 confidence)
Japan: Jazz (0.11 confidence)
South Korea: EDM (0.12 confidence)
UK: Jazz (0.12 confidence)
USA: Reggae (0.13 confidence)
