In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocess track metadata: load tracks.csv, keep a fixed set of columns,
# drop rows whose track_id is not numeric, mean-impute and standardize the
# numeric features, one-hot encode 'genre_top', and write the result to
# filtered_metadata.csv.

# Read the tracks.csv file
tracks_df = pd.read_csv('tracks.csv')

# Filter columns (including 'name' and 'genre_top')
selected_columns = ["track_id", "listens", "title", "tracks", "type", "duration", "genre_top", "interest", "name"]

# Remove rows with non-numeric track IDs.
# Values that cannot be parsed become NaN under errors='coerce', so notna()
# marks exactly the rows to keep.
has_numeric_track_id = pd.to_numeric(tracks_df['track_id'], errors='coerce').notna()

# Take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of tracks_df (avoids SettingWithCopyWarning).
filtered_tracks_df = tracks_df.loc[has_numeric_track_id, selected_columns].copy()

# Convert track_id to numeric.
# Converted again on the already-filtered values (rather than reusing the
# coerced series above) so the dtype is not forced to float by the NaN
# placeholders of the dropped rows.
filtered_tracks_df['track_id'] = pd.to_numeric(filtered_tracks_df['track_id'])

# Handle missing values for numeric columns (replace NaN with the column mean)
numeric_columns = ['listens', 'tracks', 'duration', 'interest']
imputer_numeric = SimpleImputer(strategy='mean')
filtered_tracks_df[numeric_columns] = imputer_numeric.fit_transform(filtered_tracks_df[numeric_columns])

# One-hot encode categorical feature 'genre_top'.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was removed in scikit-learn 1.4 and its
# replacement `sparse_output=` does not exist before 1.2.
genre_encoder = OneHotEncoder()
genre_encoded = genre_encoder.fit_transform(filtered_tracks_df[['genre_top']]).toarray()

# Reuse the filtered frame's index: rows were dropped above, so a default
# RangeIndex would not line up and pd.concat(axis=1), which aligns on index,
# would pair genres with the wrong tracks and add all-NaN rows.
genre_encoded_df = pd.DataFrame(
    genre_encoded,
    columns=genre_encoder.get_feature_names_out(['genre_top']),
    index=filtered_tracks_df.index,
)
filtered_tracks_df = pd.concat([filtered_tracks_df, genre_encoded_df], axis=1)
filtered_tracks_df = filtered_tracks_df.drop(columns=['genre_top'])

# Standardize data (zero mean, unit variance per numeric column)
scaler = StandardScaler()
filtered_tracks_df[numeric_columns] = scaler.fit_transform(filtered_tracks_df[numeric_columns])

# Save filtered metadata to a new csv file
filtered_tracks_df.to_csv('filtered_metadata.csv', index=False)


  tracks_df = pd.read_csv('tracks.csv')
