In [None]:
import pandas as pd
from collections import Counter

# Step 1: Load the datasets
# Both files are read with pandas' default CSV settings, relative to the
# current working directory.
CHARTS_CSV = "charts.csv"
MUSICINFO_CSV = "MusicInfo.csv"

df_charts = pd.read_csv(CHARTS_CSV)
df_musicinfo = pd.read_csv(MUSICINFO_CSV)


In [None]:
# Step 2: Data Cleaning
# Column labels are lowercased so every later step can use a single spelling,
# then the columns treated as unnecessary are removed. Names listed below that
# are absent from a frame are silently skipped (errors="ignore").
CHARTS_UNUSED_COLUMNS = ["date"]
MUSICINFO_UNUSED_COLUMNS = [
    "duration_ms",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumental",
    "liveness",
    "valence",
    "tempo",
    "time_signature",
    "tags",
    "instrumentalness",
    "spotify_id",
]

df_charts = (
    df_charts
    .rename(columns=str.lower)
    .drop(columns=CHARTS_UNUSED_COLUMNS, errors="ignore")
)
df_musicinfo = (
    df_musicinfo
    .rename(columns=str.lower)
    .drop(columns=MUSICINFO_UNUSED_COLUMNS, errors="ignore")
)

In [None]:
# Convert 'streams' to a numeric column first; values that cannot be parsed become NaN.
# This has to happen before de-duplication so that the step below can tell
# usable stream counts apart from missing ones.
df_charts["streams"] = pd.to_numeric(df_charts["streams"], errors="coerce")

# Remove duplicates based on (title, artist) for Charts.
# Rows without a stream count are moved behind the rows that have one (stable
# sort, so the original relative order is otherwise kept). keep="first" then
# retains the earliest row with a usable stream count whenever one exists,
# instead of a NaN row that a later dropna on 'streams' would discard together
# with the whole song. sort_index() puts the surviving rows back in index order.
df_charts = (
    df_charts
    .sort_values(by="streams", key=lambda col: col.isna(), kind="mergesort")
    .drop_duplicates(subset=["title", "artist"], keep="first")
    .sort_index()
)

# Remove duplicates based on (name, artist) for MusicInfo, keeping the first occurrence.
df_musicinfo = df_musicinfo.drop_duplicates(subset=["name", "artist"], keep="first")


In [None]:
def _first_genre(value):
    """Return the text before the first comma of a string; return any non-string value unchanged."""
    if not isinstance(value, str):
        return value
    return value.split(",")[0]


# Fix missing values: discard rows where any key identifier is missing.
df_charts = df_charts.dropna(subset=["title", "artist", "streams"])

# MusicInfo: same treatment for its key identifiers, then reduce 'genre' to its
# first comma-separated entry (avoiding a complex junction table for now).
df_musicinfo = (
    df_musicinfo
    .dropna(subset=["name", "artist"])
    .assign(genre=lambda frame: frame["genre"].apply(_first_genre))
)

In [None]:
# Step 3: Join datasets (Keep only common songs)
# Inner join: a chart row is kept only when MusicInfo has a row whose
# (name, artist) equals the chart's (title, artist).
df_merged = df_charts.merge(
    df_musicinfo,
    how="inner",
    left_on=["title", "artist"],
    right_on=["name", "artist"],
)

In [None]:
# Step 4: Data Transformation - Extract relevant columns
ANALYSIS_COLUMNS = ["title", "artist", "streams", "region", "genre", "year", "chart"]

# Take an explicit copy so df_analysis is an independent frame: assigning to it
# later neither raises SettingWithCopyWarning nor depends on df_merged.
df_analysis = df_merged[ANALYSIS_COLUMNS].copy()

In [None]:


# Write the two cleaned inputs and the joined result to CSV, omitting the index column.
outputs = {
    "charts_cleaned.csv": df_charts,
    "MusicInfo_cleaned.csv": df_musicinfo,
    "merged_data.csv": df_merged,
}
for csv_name, frame in outputs.items():
    frame.to_csv(csv_name, index=False)

print("\n**Data Cleaning, Processing & Analysis Completed Successfully!**")
