# Random Forest Classification for Mood Classification of Spotify Songs

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Reading Data

In [2]:
train_df=pd.read_csv("final_train.csv")
train_df.head()

Unnamed: 0,mood,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,timeSignature
0,calm,0.949,0.735,0.456,0.878,0.118,-12.937,0.0524,147.97,0.708,4
1,calm,0.0859,0.773,0.291,0.898,0.117,-15.935,0.0781,80.006,0.601,4
2,calm,0.374,0.636,0.438,0.837,0.873,-12.56,0.0707,149.981,0.915,4
3,calm,0.842,0.768,0.574,0.926,0.115,-7.573,0.0578,74.997,0.299,4
4,calm,0.488,0.766,0.238,0.935,0.124,-11.831,0.0445,105.0,0.394,4


## Cleaning Data

In [3]:
train_df.columns

Index(['mood', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
       'timeSignature'],
      dtype='object')

In [9]:
train_df['mood'] = train_df['mood'].map({'calm': 0, 'happy': 1, 'sad': 2, 'energetic': 3})
y=train_df['mood']
X=train_df.drop(['mood'], axis=1)
X.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,timeSignature
0,0.949,0.735,0.456,0.878,0.118,-12.937,0.0524,147.97,0.708,4
1,0.0859,0.773,0.291,0.898,0.117,-15.935,0.0781,80.006,0.601,4
2,0.374,0.636,0.438,0.837,0.873,-12.56,0.0707,149.981,0.915,4
3,0.842,0.768,0.574,0.926,0.115,-7.573,0.0578,74.997,0.299,4
4,0.488,0.766,0.238,0.935,0.124,-11.831,0.0445,105.0,0.394,4


In [10]:
X['timeSignature'].value_counts()

timeSignature
4    1380
3      79
5      12
1      12
0       1
Name: count, dtype: int64

In [11]:
X=X.drop(['timeSignature'], axis=1)

### Standardization

In [12]:
X=(X-X.mean())/X.std()
X.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,1.64424,0.850438,-0.361958,1.937121,-0.385616,-1.06439,-0.336563,1.081026,1.048253
1,-0.905543,1.128964,-1.030623,1.994756,-0.393167,-1.79577,-0.027024,-1.23224,0.61519
2,-0.054433,0.124806,-0.434904,1.818968,5.31528,-0.972418,-0.116152,1.149473,1.886047
3,1.328139,1.092316,0.116239,2.075446,-0.408268,0.244191,-0.271524,-1.402729,-0.6071
4,0.282347,1.077656,-1.245407,2.101382,-0.340311,-0.794574,-0.431713,-0.381528,-0.222605


### Cross validation function for evaluating the models

In [14]:
def cross_validation(X, y, model, n_folds=5):
    """
    Perform cross-validation.

    Parameters:
    - X: Features array
    - y: Target array
    - model: Model object or function to train and evaluate
    - n_folds: Number of folds for cross-validation (default=5)

    Returns:
    - cv_scores: Array of cross-validation scores
    """
    n_samples = len(y)
    fold_size = n_samples // n_folds
    cv_scores = []

    # Shuffle indices
    indices = np.random.permutation(n_samples)

    for i in range(n_folds):
        # Determine the indices for the validation fold
        val_indices = indices[i * fold_size: (i + 1) * fold_size]

        # Use the remaining indices for training
        train_indices = np.concatenate((indices[:i * fold_size], indices[(i + 1) * fold_size:]))

        # Split the data into training and validation sets
        X_train, y_train = X[train_indices], y[train_indices]
        X_val, y_val = X[val_indices], y[val_indices]

        # Train the model
        model.fit(X_train, y_train)

        # Evaluate the model on the validation set
        score = evaluate_model(model, X_val, y_val)

        cv_scores.append(score)

    return np.array(cv_scores)

# Example of a simple evaluation function
def evaluate_model(model, X, y):
    """
    Evaluate the model on the validation set.

    Parameters:
    - model: Trained model object or function
    - X: Features array of the validation set
    - y: Target array of the validation set

    Returns:
    - score: Evaluation score (e.g., accuracy, MSE, etc.)
    """
    # Make predictions
    y_pred = model.predict(X)

    # Calculate accuracy (you can replace it with any other evaluation metric)
    accuracy = np.mean(y_pred == y)
    
    return accuracy

# Implementing the model from scratch

## Decision Tree and Random Forest with Gini Impurity

In [13]:
import numpy as np
import pandas as pd

class DecisionTree:
    def __init__(self, max_depth=None, max_features=None):
        self.max_depth = max_depth
        self.max_features = max_features
        self.tree = None

    def fit(self, X, y):
        self.tree = self._grow_tree(X, y, depth=0)

    def _grow_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stopping criteria
        if (depth == self.max_depth) or (n_classes == 1) or (n_samples < 2):
            return {'class': np.argmax(np.bincount(y)), 'split_feature': None, 'split_threshold': None,
                    'left': None, 'right': None}

        # Select random subset of features
        if self.max_features is not None and self.max_features < n_features:
            feature_idxs = np.random.choice(n_features, self.max_features, replace=False)
            X = X[:, feature_idxs]
        else:
            feature_idxs = np.arange(n_features)

        # Find best split
        best_gini = float('inf')
        best_split_feature, best_split_threshold = None, None
        for feature in feature_idxs:
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_idxs = X[:, feature] < threshold
                y_left = y[left_idxs]
                y_right = y[~left_idxs]
                gini = (len(y_left) / n_samples) * self._gini_impurity(y_left) + \
                       (len(y_right) / n_samples) * self._gini_impurity(y_right)
                if gini < best_gini:
                    best_gini = gini
                    best_split_feature = feature
                    best_split_threshold = threshold

        # Split the dataset
        left_idxs = X[:, best_split_feature] < best_split_threshold
        right_idxs = ~left_idxs
        if np.sum(left_idxs) == 0:
            return {'class': np.argmax(np.bincount(y[right_idxs])), 'split_feature': None, 'split_threshold': None,
                    'left': None, 'right': None}

        # Check if right subtree is empty
        if np.sum(right_idxs) == 0:
            return {'class': np.argmax(np.bincount(y[left_idxs])), 'split_feature': None, 'split_threshold': None,
                    'left': None, 'right': None}
        
        # Grow left and right subtrees
        left_subtree = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_subtree = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)

        return {'class': None, 'split_feature': best_split_feature, 'split_threshold': best_split_threshold,
                'left': left_subtree, 'right': right_subtree}

    def _gini_impurity(self, y):
        _, counts = np.unique(y, return_counts=True)
        probas = counts / len(y)
        return 1 - np.sum(probas ** 2)

    def predict(self, X):
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _predict_tree(self, x, tree):
        if tree['class'] is not None:
            return tree['class']
        if x[tree['split_feature']] < tree['split_threshold']:
            return self._predict_tree(x, tree['left'])
        else:
            return self._predict_tree(x, tree['right'])

class RandomForest:
    def __init__(self, n_trees=100, max_depth=None, max_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        for _ in range(self.n_trees):
            indices = np.random.choice(len(X), size=len(X), replace=True)
            tree = DecisionTree(max_depth=self.max_depth, max_features=self.max_features)
            tree.fit(X[indices], y.iloc[indices])
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        predictions=predictions.T
        predictions = np.array([np.bincount(pred).argmax() for pred in predictions])
        return predictions

### Decision tree Prediction and Analysis

We have tried to make predictions for different depth of trees, each of which captures different information. Hence it leads to varying accuracy of the model.

In [15]:
dt=DecisionTree(max_depth=5)
cv_scores = cross_validation(X.values, y, dt, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.6722973  0.65202703 0.64527027 0.71959459 0.64864865]
Mean CV score: 0.6675675675675675


In [21]:
dt=DecisionTree(max_depth=6)
cv_scores = cross_validation(X.values, y, dt, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.63175676 0.66216216 0.68918919 0.66216216 0.69594595]
Mean CV score: 0.6682432432432432


In [17]:
dt=DecisionTree(max_depth=7)
cv_scores = cross_validation(X.values, y, dt, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.63851351 0.68243243 0.68243243 0.6722973  0.62837838]
Mean CV score: 0.6608108108108108


### Random Forest Prediction and Analysis

In [18]:
rf = RandomForest(n_trees=10, max_depth=5)
cv_scores = cross_validation(X.values, y, rf, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.68581081 0.75       0.75337838 0.70945946 0.73648649]
Mean CV score: 0.7270270270270272


In [35]:
rf = RandomForest(n_trees=15, max_depth=5)
cv_scores = cross_validation(X.values, y, rf, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.66554054 0.70608108 0.74324324 0.80405405 0.76351351]
Mean CV score: 0.7364864864864865


In [19]:
rf = RandomForest(n_trees=10, max_depth=6)
cv_scores = cross_validation(X.values, y, rf, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.68918919 0.78378378 0.76689189 0.77702703 0.81418919]
Mean CV score: 0.7662162162162163


In [20]:
rf = RandomForest(n_trees=10, max_depth=7)
cv_scores = cross_validation(X.values, y, rf, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.68243243 0.77364865 0.83108108 0.85472973 0.82432432]
Mean CV score: 0.7932432432432434


### Decision Tree and Random Forest using Entropy as a measure of impurity

In [28]:
class DecisionTree:
    def __init__(self, max_depth=None, max_features=None):
        self.max_depth = max_depth
        self.max_features = max_features
        self.tree = None

    def fit(self, X, y):
        self.tree = self._grow_tree(X, y, depth=0)

    def _grow_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stopping criteria
        if (depth == self.max_depth) or (n_classes == 1) or (n_samples < 2):
            return {'class': np.argmax(np.bincount(y)), 'split_feature': None, 'split_threshold': None,
                    'left': None, 'right': None}

        # Select random subset of features
        if self.max_features is not None and self.max_features < n_features:
            feature_idxs = np.random.choice(n_features, self.max_features, replace=False)
            X = X[:, feature_idxs]
        else:
            feature_idxs = np.arange(n_features)

        # Find best split
        best_entropy = float('inf')
        best_split_feature, best_split_threshold = None, None
        for feature in feature_idxs:
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_idxs = X[:, feature] < threshold
                y_left = y[left_idxs]
                y_right = y[~left_idxs]
                entropy = (len(y_left) / n_samples) * self._entropy(y_left) + \
                          (len(y_right) / n_samples) * self._entropy(y_right)
                if entropy < best_entropy:
                    best_entropy = entropy
                    best_split_feature = feature
                    best_split_threshold = threshold

        # Split the dataset
        left_idxs = X[:, best_split_feature] < best_split_threshold
        right_idxs = ~left_idxs
        if np.sum(left_idxs) == 0:
            return {'class': np.argmax(np.bincount(y[right_idxs])), 'split_feature': None, 'split_threshold': None,
                    'left': None, 'right': None}

        # Check if right subtree is empty
        if np.sum(right_idxs) == 0:
            return {'class': np.argmax(np.bincount(y[left_idxs])), 'split_feature': None, 'split_threshold': None,
                    'left': None, 'right': None}
        
        # Grow left and right subtrees
        left_subtree = self._grow_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_subtree = self._grow_tree(X[right_idxs], y[right_idxs], depth + 1)

        return {'class': None, 'split_feature': best_split_feature, 'split_threshold': best_split_threshold,
                'left': left_subtree, 'right': right_subtree}

    def _entropy(self, y):
        _, counts = np.unique(y, return_counts=True)
        probas = counts / len(y)
        return -np.sum(probas * np.log2(probas + 1e-10))

    def predict(self, X):
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _predict_tree(self, x, tree):
        if tree['class'] is not None:
            return tree['class']
        if x[tree['split_feature']] < tree['split_threshold']:
            return self._predict_tree(x, tree['left'])
        else:
            return self._predict_tree(x, tree['right'])

class RandomForest:
    def __init__(self, n_trees=100, max_depth=None, max_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        for _ in range(self.n_trees):
            indices = np.random.choice(len(X), size=len(X), replace=True)
            tree = DecisionTree(max_depth=self.max_depth, max_features=self.max_features)
            tree.fit(X[indices], y.iloc[indices])
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        predictions = predictions.T
        predictions = np.array([np.bincount(pred).argmax() for pred in predictions])
        return predictions


## Decision tree Prediction and Analysis

In [29]:
dt=DecisionTree(max_depth=5)
cv_scores = cross_validation(X.values, y, dt, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.68581081 0.65540541 0.66891892 0.7027027  0.63175676]
Mean CV score: 0.668918918918919


In [30]:
dt=DecisionTree(max_depth=6)
cv_scores = cross_validation(X.values, y, dt, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.66891892 0.67905405 0.61486486 0.64527027 0.61824324]
Mean CV score: 0.6452702702702704


In [31]:
dt=DecisionTree(max_depth=7)
cv_scores = cross_validation(X.values, y, dt, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.65540541 0.63175676 0.6722973  0.64864865 0.6722973 ]
Mean CV score: 0.6560810810810811


## Random Forest Prediction and Analysis

In [32]:
rf = RandomForest(n_trees=10, max_depth=5)
cv_scores = cross_validation(X.values, y, rf, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.7027027  0.72635135 0.76013514 0.78716216 0.72972973]
Mean CV score: 0.7412162162162163


In [33]:
rf = RandomForest(n_trees=10, max_depth=6)
cv_scores = cross_validation(X.values, y, rf, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.69256757 0.72972973 0.77027027 0.76689189 0.78378378]
Mean CV score: 0.7486486486486487


In [34]:
rf = RandomForest(n_trees=10, max_depth=7)
cv_scores = cross_validation(X.values, y, rf, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.6722973  0.81756757 0.82094595 0.7972973  0.83445946]
Mean CV score: 0.7885135135135135


# Scikit-Learn Library Implementation

## Decision Tree

In [22]:
from sklearn import tree
dt_sk = tree.DecisionTreeClassifier(max_depth=5)
cv_scores = cross_validation(X.values, y, dt_sk, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.68581081 0.66216216 0.67905405 0.63175676 0.68581081]
Mean CV score: 0.6689189189189191


In [23]:
from sklearn import tree
dt_sk = tree.DecisionTreeClassifier(max_depth=6)
cv_scores = cross_validation(X.values, y, dt_sk, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.66216216 0.68581081 0.65540541 0.65540541 0.66891892]
Mean CV score: 0.6655405405405406


In [24]:
from sklearn import tree
dt_sk = tree.DecisionTreeClassifier(max_depth=7)
cv_scores = cross_validation(X.values, y, dt_sk, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.63851351 0.60135135 0.64189189 0.63851351 0.61148649]
Mean CV score: 0.6263513513513513


## Random forest

In [25]:
from sklearn.ensemble import RandomForestClassifier
rf_sk = RandomForestClassifier(n_estimators=10,max_depth=5)
cv_scores = cross_validation(X.values, y, rf_sk, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.7027027  0.67905405 0.68581081 0.68918919 0.72635135]
Mean CV score: 0.6966216216216216


In [26]:
from sklearn.ensemble import RandomForestClassifier
rf_sk = RandomForestClassifier(n_estimators=10, max_depth=6)
cv_scores = cross_validation(X.values, y, rf_sk, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.72972973 0.70945946 0.67905405 0.66216216 0.72972973]
Mean CV score: 0.702027027027027


In [27]:
from sklearn.ensemble import RandomForestClassifier
rf_sk = RandomForestClassifier(n_estimators=10, max_depth=7)
cv_scores = cross_validation(X.values, y, rf_sk, n_folds=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

Cross-validation scores: [0.69594595 0.64864865 0.66891892 0.76013514 0.67905405]
Mean CV score: 0.6905405405405405
