In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read CSV. Missing values in the raw file are marked with '?'; declaring that
# marker here lets pandas parse the affected columns as numbers. Replacing '?'
# after loading would leave those columns as strings (object dtype), and the
# standardization step below would then silently skip them.
df = pd.read_csv("../music_genre.csv", na_values='?')

# Drop features with low correlation or high correlation with other features,
# then discard every row that still has a missing value
df = df.drop(columns=[
    "instance_id",
    "track_name",
    "obtained_date",
    "energy",
    "key",
    "duration_ms",
    "mode"
    ]).dropna(how='any')

# Encode categorical values
le = LabelEncoder()
df['music_genre'] = le.fit_transform(df['music_genre'])
genre_mapping = dict(zip(le.classes_, range(len(le.classes_)))) # store category mapping needed for displaying data later
print('Genre mapping: ', genre_mapping)

# Make 'artist_genre' column: for every artist, the string form of the sorted
# set of encoded genres that artist appears with (e.g. '[0 9]'). Built once per
# artist with a single groupby instead of re-filtering the frame for every row.
# NOTE(review): this feature is derived from the labels of *all* rows, including
# rows that later land in the test split, so it leaks target information.
# Consider computing it from the training rows only.
genres_by_artist = {
    artist: str(np.unique(genres.to_numpy()))
    for artist, genres in df.groupby('artist_name')['music_genre']
}
df['artist_genre'] = df['artist_name'].map(genres_by_artist)

# 'artist_name' was only needed to build 'artist_genre'; drop it and encode the
# new feature. `le` is re-fitted here, so `genre_mapping` above is the only
# remaining record of the genre encoding.
df = df.drop(columns=['artist_name'])
df['artist_genre'] = le.fit_transform(df['artist_genre'])

# Standardize every numeric feature column (everything except the target) for
# SVM training. StandardScaler works column-wise, so one fit over all feature
# columns gives the same values as fitting each column separately.
# NOTE(review): the scaler is fitted before the train/test split, so test-set
# statistics influence the scaling.
scaler = StandardScaler()
feature_columns = list(df.select_dtypes(include='number').columns.drop('music_genre'))
if feature_columns:
    df[feature_columns] = scaler.fit_transform(df[feature_columns])

# create csv file
df.to_csv('../preprocessed_data.csv', index=False)

Genre mapping:  {'Alternative': 0, 'Anime': 1, 'Blues': 2, 'Classical': 3, 'Country': 4, 'Electronic': 5, 'Hip-Hop': 6, 'Jazz': 7, 'Rap': 8, 'Rock': 9}


In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
features = df.drop(columns='music_genre')
target = df['music_genre']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Write each part of the split to its own CSV file (row index not written)
split_outputs = {
    '../train_test_split_data/X_train.csv': X_train,
    '../train_test_split_data/X_test.csv': X_test,
    '../train_test_split_data/y_train.csv': y_train,
    '../train_test_split_data/y_test.csv': y_test,
}
for path, part in split_outputs.items():
    part.to_csv(path, index=False)

In [31]:
import pandas as pd

# Reload the saved test split so it can be cut into small files for the web app
X_test = pd.read_csv('../train_test_split_data/X_test.csv')
y_test = pd.read_csv('../train_test_split_data/y_test.csv')

# Rows per output file
chunk_size = 100

# Ceiling division: a trailing partial chunk still gets its own file
num_chunks = -(-len(X_test) // chunk_size)

# Walk the rows in steps of chunk_size; files are numbered from 1.
# X and y are sliced with the same bounds so each pair of files stays aligned.
for i, start in enumerate(range(0, len(X_test), chunk_size), start=1):
    rows = slice(start, start + chunk_size)
    X_test.iloc[rows].to_csv(f'../test_datasets/X_test_{i}.csv', index=False)
    y_test.iloc[rows].to_csv(f'../test_datasets/y_test_{i}.csv', index=False)
