# Music Data Analysis

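This notebook loads and explores four related music datasets: the main track table (`data.csv`) plus per-genre (`data_by_genres.csv`), per-year (`data_by_year.csv`), and per-artist (`data_by_artist.csv`) tables. It previews each one, derives a `decade` column for the tracks, and charts how track counts and average audio features change from decade to decade.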

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas display settings. Both options are set to None so that
# wide frames are shown without a column cap (max_columns) and without a fixed
# character width (width).
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

print("Libraries imported successfully!")

## 2. Load Datasets

### Step 1: Read the main dataset

In [None]:
# Step 1: load the main track table from data.csv and bind it to `data`,
# then report how many rows were read.
data = pd.read_csv(filepath_or_buffer='data.csv')
print(f"✓ Main dataset loaded: {len(data):,} rows")

### Step 2: Read the genre dataset

In [None]:
# Step 2: load the genre table from data_by_genres.csv and bind it to
# `genre_data`, then report how many rows were read.
genre_data = pd.read_csv(filepath_or_buffer='data_by_genres.csv')
print(f"✓ Genre dataset loaded: {len(genre_data):,} rows")

### Step 3: Read the year dataset

In [None]:
# Step 3: load the year table from data_by_year.csv and bind it to
# `year_data`, then report how many rows were read.
year_data = pd.read_csv(filepath_or_buffer='data_by_year.csv')
print(f"✓ Year dataset loaded: {len(year_data):,} rows")

### Step 4: Read the artist dataset

In [None]:
# Step 4: load the artist table from data_by_artist.csv and bind it to
# `artist_data`, then report how many rows were read.
artist_data = pd.read_csv(filepath_or_buffer='data_by_artist.csv')
print(f"✓ Artist dataset loaded: {len(artist_data):,} rows")

## 3. Explore the Data

### Step 5: Display the first two rows of each dataset

In [None]:
# Step 5: print a three-line banner, then leave a two-row preview of `data`
# as the cell's last expression so the notebook renders it as a table.
print("=" * 80, "MAIN DATASET (data) - First 2 rows", "=" * 80, sep="\n")
data.head(n=2)

In [None]:
# Banner plus a two-row preview of `genre_data`; the frame is the cell's last
# expression so the notebook renders it as a table.
print("=" * 80, "GENRE DATASET (genre_data) - First 2 rows", "=" * 80, sep="\n")
genre_data.head(n=2)

In [None]:
# Banner plus a two-row preview of `year_data`; the frame is the cell's last
# expression so the notebook renders it as a table.
print("=" * 80, "YEAR DATASET (year_data) - First 2 rows", "=" * 80, sep="\n")
year_data.head(n=2)

In [None]:
# Banner plus a two-row preview of `artist_data`; the frame is the cell's last
# expression so the notebook renders it as a table.
print("=" * 80, "ARTIST DATASET (artist_data) - First 2 rows", "=" * 80, sep="\n")
artist_data.head(n=2)

### Step 6: Retrieve information about datasets

In [None]:
# Step 6: print a banner, then the info() summary of `data`.
# info() writes its report itself, so no print() wrapper is needed.
print("=" * 80, "MAIN DATASET (data) - Information", "=" * 80, sep="\n")
data.info()

In [None]:
# Print a banner, then the info() summary of `genre_data`.
# info() writes its report itself, so no print() wrapper is needed.
print("=" * 80, "GENRE DATASET (genre_data) - Information", "=" * 80, sep="\n")
genre_data.info()

## 4. Feature Engineering

### Step 7: Create a decade column

In [None]:
# Step 7: add a `decade` column to `data`, computed from `year` with apply()
# and a lambda. Floor-dividing by 10 drops the last digit and multiplying by 10
# restores the scale, e.g. 1987 -> 1980.
data['decade'] = data['year'].apply(lambda year_value: year_value // 10 * 10)

# Confirmation line, a blank line, then the caption for the preview below.
print("✓ Decade column created successfully!", "\nFirst 10 rows with year and decade:", sep="\n")
data.loc[:, ['name', 'year', 'decade']].head(10)

In [None]:
# Count rows per decade and order the result by decade (ascending index)
# rather than by frequency.
print("Decade Distribution:")
decade_counts = data['decade'].value_counts().sort_index()
print(decade_counts)

# Bar chart of the same counts, drawn on an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
decade_counts.plot.bar(ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Tracks by Decade', fontsize=16, fontweight='bold')
ax.set_xlabel('Decade', fontsize=12)
ax.set_ylabel('Number of Tracks', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
# Horizontal guide lines only; vertical ones add nothing to a bar chart.
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 5. Summary Statistics

In [None]:
# Headline figures: row counts of the four loaded tables, the span of years
# and decades present in `data`, and distinct-value counts for three columns.
print("=" * 80, "DATASET SUMMARY", "=" * 80, sep="\n")

# Row counts, one line per table.
print(
    f"Total tracks in main dataset: {len(data):,}",
    f"Total genres: {len(genre_data):,}",
    f"Total years covered: {len(year_data):,}",
    f"Total artists: {len(artist_data):,}",
    sep="\n",
)

# Smallest and largest values of `year` and of the derived `decade` column.
print(
    f"Year range: {data['year'].min()} - {data['year'].max()}",
    f"Decade range: {data['decade'].min()} - {data['decade'].max()}",
    sep="\n",
)

# nunique() counts distinct cell values, so tracks that share a name are
# counted once under "Unique tracks".
# NOTE(review): if `artists` stores multi-artist credit strings, the first
# figure counts distinct credit combinations, not individual artists — confirm.
print(
    "\nUnique values:",
    f"  - Unique artists: {data['artists'].nunique():,}",
    f"  - Unique tracks: {data['name'].nunique():,}",
    f"  - Unique IDs: {data['id'].nunique():,}",
    sep="\n",
)

## 6. Additional Exploration (Bonus)

In [None]:
# Descriptive statistics for `data` via describe(). The quartiles are spelled
# out explicitly; they are the same values describe() uses by default.
print("Numerical Statistics for Main Dataset:")
data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Per-column count of missing entries in `data`
# (isna() is the same check as isnull()).
print("Missing Values in Main Dataset:")
print(data.isna().sum())

In [None]:
# Top 10 rows of genre_data ranked by the `popularity` column, keeping only
# the genre label and its popularity value.
print("Top 10 Most Popular Genres (by average popularity):")
top_genres = genre_data.nlargest(10, 'popularity')[['genres', 'popularity']]
# Leave the frame as the cell's last expression instead of print()-ing it, so
# it renders as a rich table like the other DataFrame outputs in this notebook
# (e.g. the head(2) previews and describe()).
top_genres

In [None]:
# Average of four audio-feature columns per decade, plotted as one line per
# feature to show how each changes over time.
audio_features = ['acousticness', 'danceability', 'energy', 'valence']
decade_features = data.groupby('decade')[audio_features].mean()

# Explicit Figure/Axes pair; the columns of decade_features follow the order
# of audio_features, so iterating over them draws the lines in the same order.
fig, ax = plt.subplots(figsize=(14, 8))
for feature, decade_means in decade_features.items():
    ax.plot(decade_means.index, decade_means, marker='o', label=feature, linewidth=2)

ax.set_title('Evolution of Audio Features by Decade', fontsize=16, fontweight='bold')
ax.set_xlabel('Decade', fontsize=12)
ax.set_ylabel('Average Value', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()