In [None]:
# Spotify Music Recommendation System
### Data Exploration and Preprocessing

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load dataset
# Forward slashes are accepted on Windows as well as POSIX systems, and they
# avoid the invalid "\d" / "\c" escape sequences of a backslash-separated
# literal. This also matches the style of the output path used when saving.
data_path = "../data/clean_data.json"
data = pd.read_json(data_path)

# Initial inspection
print("Data Sample:")
print(data.head())
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Report the number of nulls per column. This is informational only: no rows
# are dropped or filled because of missing values.
missing_values = data.isna().sum()
print("\nMissing Values:\n", missing_values)

# Parse the 'ts' column into pandas datetimes so the .dt accessor works below
data = data.assign(ts=pd.to_datetime(data['ts']))

# Discard exact duplicate rows, then report the resulting shape
# (printed before the derived 'date' / 'hour' columns are added)
data = data.drop_duplicates()
print("\nData Shape after Cleanup:", data.shape)

# Derive the calendar date and the hour of day from the timestamp
data = data.assign(
    date=data['ts'].dt.date,
    hour=data['ts'].dt.hour,
)

# Keep only rows whose 'ms_played' is strictly greater than 30000 ms (30 s)
data = data.loc[data['ms_played'].gt(30000)]

# Calculate track play counts: one row per distinct track name, holding the
# number of (already filtered) plays recorded under that name.
track_play_counts = (
    data.groupby('master_metadata_track_name')['ms_played']
    .count()
    .reset_index()
)
track_play_counts.columns = ['track_name', 'play_count']

# Merge play counts back into the dataset.
# The counts frame calls its key 'track_name', while `data` calls it
# 'master_metadata_track_name'; merging with on='master_metadata_track_name'
# against the renamed frame raises KeyError. Align the key name for the merge
# only, so `track_play_counts` keeps its short column names and the result
# does not carry a redundant duplicate of the track-name column.
processed_data = data.merge(
    track_play_counts.rename(columns={'track_name': 'master_metadata_track_name'}),
    on='master_metadata_track_name',
    how='left',
)

# Persist the processed frame as CSV; index=False leaves the row index out
# of the written file.
processed_data_path = "../data/cleaned_data.csv"
processed_data.to_csv(
    processed_data_path,
    index=False,
)
print(f"\nProcessed data saved to {processed_data_path}")

# Histogram of how many times each track name was played (one observation
# per track name, taken from track_play_counts), drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(track_play_counts['play_count'], bins=50, color='skyblue', ax=ax)
ax.set_title("Track Play Count Distribution")
ax.set_xlabel("Play Count")
ax.set_ylabel("Frequency")
plt.show()


: 