In [5]:
import pandas as pd
import numpy as np
# Shape of the synthetic dataset: 50 respondents x 4 attributes
rows, cols = 50, 4

# Seeded generator so that Restart & Run All rebuilds exactly the same rows
# (and therefore the same split and accuracy figures in later cells).
rng = np.random.default_rng(42)

# Every column is drawn independently and uniformly at random, so `genre`
# carries no real relationship to the other columns.
data = {
    "age": rng.integers(18, 60, size=rows),  # upper bound is exclusive: ages 18-59
    "gender": rng.choice(["Male", "Female"], size=rows),
    "genre": rng.choice(["HipHop", "Acoustic", "Reggae", "Classic", "Amapiano", "Jazz", "Dancehall"], size=rows),
    "obsession": rng.choice(["High", "Moderate", "Low"], size=rows),
}

# Create a new DataFrame
kmp = pd.DataFrame(data)

# Guard against the declared shape drifting away from the columns defined above
assert kmp.shape == (rows, cols), "kmp does not match the declared rows/cols"

kmp.describe()  # Summary statistics (by default only the numeric column, age)

Unnamed: 0,age
count,50.0
mean,40.18
std,10.368103
min,20.0
25%,33.0
50%,43.0
75%,49.0
max,57.0


In [6]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Data source: the `kmp` frame built in the previous cell is reused here.
# (Alternative source, currently disabled: pd.read_excel('MusicKeen.xlsx'))

# Target column first, everything else becomes the feature matrix
y = kmp['genre']
X = kmp.drop(columns='genre')

# Column roles used by the encoding steps later in this cell
numerical_features = ['age']
categorical_features = ['gender', 'obsession']
ordinal_features = dict(obsession=['Low', 'Moderate', 'High'])

# Hold out 20% for testing BEFORE any encoder is fitted; stratify on the target
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# --- PREPROCESSING PIPELINE ---

# Encode on copies so X_train / X_test keep their raw values
train_encoded, test_encoded = X_train.copy(), X_test.copy()

# Ordinal columns: fit each encoder on the training split only, then reuse it
# on the test split. Fitted encoders are kept for encoding new samples later.
ordinal_encoders = {}
for feature in ordinal_features:
    encoder = OrdinalEncoder(categories=[ordinal_features[feature]])
    ordinal_encoders[feature] = encoder
    train_encoded[feature] = encoder.fit_transform(X_train[[feature]])
    test_encoded[feature] = encoder.transform(X_test[[feature]])

# Nominal categoricals = categorical columns that are not declared ordinal
remaining_categoricals = [
    name for name in categorical_features if name not in ordinal_features
]

# Route each group of columns to its own treatment
column_steps = [
    ('num', 'passthrough', numerical_features),                                # left unchanged
    ('cat', OneHotEncoder(handle_unknown='ignore'), remaining_categoricals),  # one-hot
    ('ord', 'passthrough', list(ordinal_features.keys())),                     # encoded above
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Target labels: mapping learned from the training labels only
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Features: fit on the training split, then apply the fitted transformer to test
X_train_transformed = preprocessor.fit_transform(train_encoded)
X_test_transformed = preprocessor.transform(test_encoded)

# --- MODEL TRAINING & EVALUATION ---
# fit() returns the estimator itself, so construction and training can be chained
model = DecisionTreeClassifier(random_state=42).fit(X_train_transformed, y_train_encoded)

# Predict on both splits up front
test_pred = model.predict(X_test_transformed)
train_pred = model.predict(X_train_transformed)

# Held-out accuracy
test_accuracy = accuracy_score(y_test_encoded, test_pred)
print(f"\nTest Accuracy: {test_accuracy:.4f}")

# Training accuracy, shown next to the test figure as an overfitting check
train_accuracy = accuracy_score(y_train_encoded, train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

# --- SAMPLE PREDICTION ---
# A single new respondent, wrapped in a one-row DataFrame
sample_data = dict(age=24, gender='Female', obsession='High')
sample = pd.DataFrame([sample_data])

# Reuse the already-fitted ordinal encoders (transform only, no refitting)
sample_encoded = sample.copy()
for feature in ordinal_encoders:
    encoder = ordinal_encoders[feature]
    sample_encoded[feature] = encoder.transform(sample[[feature]])

# Same fitted preprocessor -> model -> decode the label back to a genre name
sample_transformed = preprocessor.transform(sample_encoded)
prediction_encoded = model.predict(sample_transformed)
prediction = le.inverse_transform(prediction_encoded)

print(f"\nPredicted genre for sample: {prediction[0]}")


Test Accuracy: 0.3000
Training Accuracy: 0.8750

Predicted genre for sample: HipHop


We are going to explain the code step by step in detail.

1. Importing necessary libraries:

- pandas: for data manipulation and analysis.

- numpy: for numerical operations.

- DecisionTreeClassifier: a machine learning model for classification.

- OrdinalEncoder, OneHotEncoder, LabelEncoder: for encoding categorical variables.

- ColumnTransformer: to apply different transformations to different columns.

- train_test_split: to split the dataset into training and testing sets.

- accuracy_score: to evaluate the model's performance.

2. Loading the dataset:

- The `pd.read_excel('MusicKeen.xlsx')` call is commented out in this cell, so no file is read; the cell reuses the synthetic `kmp` DataFrame created in the previous cell.

3. Defining features and target:

- `X` is the feature matrix, which includes all columns except 'genre'.

- `y` is the target vector, which is the 'genre' column.

4. Defining feature types:

- `numerical_features`: list of numerical features (here, only 'age').

- `categorical_features`: list of categorical features (here, 'gender' and 'obsession').

- `ordinal_features`: a dictionary specifying which categorical features are ordinal and their categories (here, 'obsession' has ordered categories: 'Low', 'Moderate', 'High').

5. Splitting the data:

- The data is split into training and testing sets (80% training, 20% testing) using `train_test_split`.

- `random_state=42` ensures reproducibility.

- `stratify=y` keeps the class proportions in the training and test sets as close as possible to those in the original dataset.

6. Preprocessing pipeline:

- We create copies of the training and test sets to avoid modifying the original data during encoding.

7. Ordinal encoding for the 'obsession' feature:

- For each ordinal feature (here only 'obsession'), we:

a. Create an `OrdinalEncoder` with the specified categories.

b. Fit the encoder on the training data and transform the training feature.

c. Transform the test feature using the same encoder (without fitting again to avoid data leakage).

- The encoders are stored in `ordinal_encoders` for later use (e.g., on new samples).

8. Identifying remaining categorical features:

- The remaining categorical features (those not ordinal) are identified. Here, 'gender' is the remaining one.

9. Setting up the ColumnTransformer:

- The `ColumnTransformer` applies different preprocessing to different columns:

a. Numerical features ('age') are passed through without any change (using 'passthrough').

b. The remaining categorical features (here, 'gender') are one-hot encoded. `handle_unknown='ignore'` means that a category not seen during training does not raise an error at transform time; instead, all of that feature's one-hot columns are set to zero for that row.

c. The ordinal features (here, 'obsession') are passed through (because we already encoded them).

10. Encoding the target variable:

- The target variable (genre) is encoded using `LabelEncoder` (if it's of string type).

a. The encoder is fitted only on the training target (`y_train`).

b. The training target is transformed to `y_train_encoded`.

c. The test target is transformed using the same encoder (without fitting) to `y_test_encoded`.

11. Applying the preprocessing to the features:

- The `preprocessor` (ColumnTransformer) is fitted on the training data (`train_encoded`) and then used to transform both the training and test data.

a. `fit_transform` is used for the training data to learn the parameters (like one-hot encoding categories) and transform.

b. `transform` is used for the test data (using the parameters learned from the training data) to avoid data leakage.

12. Model training:

- A `DecisionTreeClassifier` is instantiated with `random_state=42` for reproducibility.

- The model is trained on the preprocessed training data (`X_train_transformed`) and the encoded training labels (`y_train_encoded`).

13. Model evaluation:

- Predictions are made on the preprocessed test set (`X_test_transformed`), and the accuracy is computed by comparing to `y_test_encoded`.

- Similarly, predictions are made on the training set to compute training accuracy (to check for overfitting: high training accuracy and low test accuracy indicates overfitting).

14. Sample prediction:

- A sample data point (dictionary) is created and converted to a DataFrame.

- The sample is preprocessed using the same steps as the training data:

a. The ordinal feature ('obsession') is transformed using the stored ordinal encoder.

b. The entire sample is transformed using the pre-trained `preprocessor` (ColumnTransformer) which will handle the numerical feature and the one-hot encoding for 'gender'.

- The model predicts the encoded genre for the preprocessed sample.

- The encoded prediction is then inverse-transformed using the label encoder (`le`) to get the original genre label.

15. Printing results:

- The test accuracy and training accuracy are printed.

- The predicted genre for the sample is printed.

Important considerations:

- The code avoids data leakage by splitting the data before any preprocessing and by fitting the preprocessing steps (encoders, ColumnTransformer) only on the training data.

- The same preprocessing steps are applied to the test data and new samples without re-fitting.

- The use of `stratify=y` in the split ensures that the class distribution is maintained in both training and test sets.

1. Import Libraries & Load Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the spreadsheet, then separate the label column from the predictors
kmp = pd.read_excel('MusicKeen.xlsx')
y = kmp['genre']                # Target
X = kmp.drop(columns='genre')   # Features

Loads music preference data from Excel

Separates features (X) from target (y = genre)

2. Feature Specification

In [None]:
# Column roles, grouped by how each column will be encoded
numerical_features = ['age']
categorical_features = [
    'gender',
    'obsession',
]
# Ordinal columns map to their categories, listed from lowest to highest
ordinal_features = {
    'obsession': ['Low', 'Moderate', 'High'],
}

Classifies features by type:

Numerical: Continuous values (age)

Categorical: Discrete values (gender, obsession)

Ordinal: Categorical with inherent order (obsession levels)

3. Data Splitting

In [None]:
# 80/20 split, stratified on the target, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

Splits data BEFORE preprocessing to prevent leakage

80% training, 20% testing

stratify=y: Maintains genre proportion in splits

random_state=42: Ensures reproducibility

4. Preprocessing Pipeline

(a) Ordinal Encoding

# Encode on copies so the raw splits (X_train / X_test) stay untouched.
# These frames must exist before the loop assigns encoded columns into them.
train_encoded = X_train.copy()
test_encoded = X_test.copy()

ordinal_encoders = {}
for feature, categories in ordinal_features.items():
    # Explicit category order defines the integer codes (first listed -> 0)
    encoder = OrdinalEncoder(categories=[categories])
    train_encoded[feature] = encoder.fit_transform(X_train[[feature]])  # fit on train only
    test_encoded[feature] = encoder.transform(X_test[[feature]])        # reuse fitted mapping
    # Keep the fitted encoder so new samples can be encoded the same way later
    ordinal_encoders[feature] = encoder

Converts obsession levels to numerical values (Low=0, Moderate=1, High=2)

Fit ONLY on training data to prevent test data leakage

(b) Column Transformation Setup

In [None]:
# Non-ordinal categoricals: every categorical column not declared ordinal
# (with the feature lists defined earlier this is just ['gender'])
remaining_categoricals = [f for f in categorical_features if f not in ordinal_features]

# Each entry is (name, transformer, columns); every tuple needs all three parts
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),                               # Age: no transformation
        ('cat', OneHotEncoder(handle_unknown='ignore'), remaining_categoricals),  # Gender: one-hot
        ('ord', 'passthrough', list(ordinal_features.keys())),                    # Obsession: already encoded
    ]
)

Handles different feature types separately:

Numerical features: Pass through unchanged

Categorical: One-hot encode gender (e.g., Female=[1,0], Male=[0,1]; `OneHotEncoder` orders its output columns by sorted category)

Ordinal: Use pre-encoded values

(c) Target Encoding

In [None]:
# Learn the genre -> integer mapping from the training labels only
le = LabelEncoder().fit(y_train)

# Apply that single fitted mapping to both splits
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

Converts genre strings to numerical labels

Preserves mapping for inverse transformation later

(d) Apply Transformations

In [None]:
# Fit the preprocessor on the (ordinal-encoded) training features and transform them in one step
X_train_transformed = preprocessor.fit_transform(train_encoded)
# Reuse the fitted preprocessor on the test features; transform() does not refit
X_test_transformed = preprocessor.transform(test_encoded)

fit_transform(): Learns parameters from training data

transform(): Applies same transformation to test data

5. Model Training & Evaluation

In [None]:
# fit() returns the estimator itself, so construction and training can be chained
model = DecisionTreeClassifier(random_state=42).fit(X_train_transformed, y_train_encoded)

# Predictions for both splits
test_pred = model.predict(X_test_transformed)
train_pred = model.predict(X_train_transformed)

# Accuracy on held-out data, and on the training data as an overfitting check
test_accuracy = accuracy_score(y_test_encoded, test_pred)
train_accuracy = accuracy_score(y_train_encoded, train_pred)

Trains decision tree classifier

Computes both test accuracy (the estimate of true performance) and training accuracy (an overfitting indicator)

6. Sample Prediction

In [None]:
# The encoders and the preprocessor select columns by name, so the sample
# must be a one-row DataFrame rather than a plain dict.
sample_data = {'age': 40, 'gender': 'Female', 'obsession': 'Moderate'}
sample = pd.DataFrame([sample_data])

# Apply same preprocessing (on a copy, leaving the raw sample untouched)
sample_encoded = sample.copy()
for feature, encoder in ordinal_encoders.items():
    # transform() only: the encoders were already fitted on the training data
    sample_encoded[feature] = encoder.transform(sample[[feature]])
sample_transformed = preprocessor.transform(sample_encoded)

# Predict and decode
prediction_encoded = model.predict(sample_transformed)
prediction = le.inverse_transform(prediction_encoded)

Preprocesses sample identically to training data

Uses stored encoders (no refitting)

Makes prediction

Converts numerical label back to genre name

Key Concepts Illustrated:
Data Leakage Prevention:

Splitting before preprocessing

Fitting encoders ONLY on training data

Using transform() (not fit_transform()) for test/sample data

Feature Engineering:

Different processing for different feature types

Ordinal vs nominal categorical handling

Target encoding with inverse capability

Model Evaluation:

Test accuracy: True performance metric

Training accuracy: Overfitting detector

Stratified sampling: Maintains class balance

Reproducibility:

random_state in both splitting and model

Storing encoders for consistent transformations

The pipeline ensures that new data (test data or real-world samples) undergoes the same preprocessing as the training data, which keeps the model's inputs consistent and its evaluation reliable.