In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

class DataExplorer:
    """Loads a CSV dataset and offers quick summaries and plots for exploring it.

    ``load_data()`` must be called first; every other public method raises
    ``ValueError`` while ``self.data`` is still ``None``.
    """

    def __init__(self, file_path):
        """Stores the CSV path; nothing is read until ``load_data()`` is called."""
        self.file_path = file_path
        self.data = None

    def load_data(self):
        """Loads the dataset from a CSV file, stores it on ``self.data`` and returns it."""
        self.data = pd.read_csv(self.file_path)
        return self.data

    def _check_data_loaded(self):
        """Helper method to ensure data is loaded before processing.

        Raises:
            ValueError: if ``load_data()`` has not populated ``self.data`` yet.
        """
        if self.data is None:
            raise ValueError("Data not loaded. Please call load_data() first.")

    def _check_column(self, column):
        """Helper shared by the single-column plots: data must be loaded and contain ``column``.

        Raises:
            ValueError: if no data has been loaded.
            KeyError: if ``column`` is not a column of the dataset.
        """
        self._check_data_loaded()
        if column not in self.data.columns:
            raise KeyError(f"Column '{column}' not found in dataset.")

    def check_missing_values(self):
        """Returns the number of missing values per column as a Series."""
        self._check_data_loaded()
        return self.data.isnull().sum()

    def describe_data(self):
        """Returns the ``DataFrame.describe()`` summary of the dataset."""
        self._check_data_loaded()
        return self.data.describe()

    def visualize_distribution(self, column):
        """Plots a 20-bin histogram of the non-missing values of ``column``.

        Raises:
            ValueError: if no data has been loaded.
            KeyError: if ``column`` is not a column of the dataset.
        """
        self._check_column(column)

        # Open a dedicated figure, as every other plotting method here does;
        # otherwise the histogram is drawn onto whichever figure is current.
        plt.figure()
        plt.hist(self.data[column].dropna(), bins=20, edgecolor='black')
        plt.title(f'Distribution of {column}')
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.show()

    def bar_plot(self, column):
        """Creates a count (bar) plot for a categorical column.

        Raises:
            ValueError: if no data has been loaded.
            KeyError: if ``column`` is not a column of the dataset.
        """
        self._check_column(column)

        plt.figure(figsize=(10, 5))
        # NOTE(review): recent seaborn releases are believed to warn when
        # `palette` is passed without `hue` - confirm against the installed version.
        sns.countplot(x=self.data[column], palette='viridis')
        plt.title(f'Bar Plot of {column}')
        plt.xticks(rotation=45)
        plt.show()

    def box_plot(self, column):
        """Creates a box plot for a numerical column.

        Raises:
            ValueError: if no data has been loaded.
            KeyError: if ``column`` is not a column of the dataset.
        """
        self._check_column(column)

        plt.figure(figsize=(8, 5))
        # NOTE(review): same `palette`-without-`hue` caveat as in bar_plot - confirm.
        sns.boxplot(y=self.data[column], palette='coolwarm')
        plt.title(f'Box Plot of {column}')
        plt.show()

    def scatter_plot(self, x_col, y_col):
        """Creates a scatter plot of ``y_col`` against ``x_col``.

        Raises:
            ValueError: if no data has been loaded.
            KeyError: if either column is not a column of the dataset.
        """
        self._check_data_loaded()
        if x_col not in self.data.columns or y_col not in self.data.columns:
            raise KeyError(f"Columns '{x_col}' or '{y_col}' not found in dataset.")

        plt.figure(figsize=(8, 5))
        sns.scatterplot(x=self.data[x_col], y=self.data[y_col], alpha=0.7)
        plt.title(f'Scatter Plot of {x_col} vs {y_col}')
        plt.show()

class AdditionalStatistics:
    """Per-column summary statistics computed on a non-empty DataFrame."""

    def __init__(self, df):
        """Keeps a reference to ``df``; raises ``ValueError`` if it is ``None`` or empty."""
        unusable = df is None or df.empty
        if unusable:
            raise ValueError("DataFrame is empty or not loaded.")
        self.df = df

    def _check_column_exists(self, column_name):
        """Raises ``KeyError`` unless ``column_name`` is a column of the DataFrame."""
        if column_name in self.df.columns:
            return
        raise KeyError(f"Column '{column_name}' not found in dataset.")

    def _validated(self, column_name):
        """Returns the requested column after confirming that it exists."""
        self._check_column_exists(column_name)
        return self.df[column_name]

    def mean(self, column_name):
        """Mean of the column."""
        return self._validated(column_name).mean()

    def median(self, column_name):
        """Median of the column."""
        return self._validated(column_name).median()

    def std(self, column_name):
        """Standard deviation of the column, as computed by pandas ``Series.std``."""
        return self._validated(column_name).std()

    def variance(self, column_name):
        """Variance of the column, as computed by pandas ``Series.var``."""
        return self._validated(column_name).var()

    def minimum(self, column_name):
        """Smallest value in the column."""
        return self._validated(column_name).min()

    def maximum(self, column_name):
        """Largest value in the column."""
        return self._validated(column_name).max()

    def skewness(self, column_name):
        """Skewness of the column, as computed by pandas ``Series.skew``."""
        return self._validated(column_name).skew()

    def kurtosis(self, column_name):
        """Kurtosis of the column, as computed by pandas ``Series.kurtosis``."""
        return self._validated(column_name).kurtosis()


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

class DataPreprocessor:
    """Cleans, type-converts, merges and splits a song DataFrame.

    The constructor copies its inputs, so the caller's DataFrames are never
    modified; each method updates and returns the internal copy ``self.data``.
    """

    def __init__(self, data, genres_data=None):
        """Copies ``data`` (and ``genres_data`` when given).

        Raises:
            ValueError: if ``data`` is not a pandas DataFrame.
        """
        if not isinstance(data, pd.DataFrame):
            raise ValueError("Input data must be a pandas DataFrame.")
        self.data = data.copy()
        self.genres_data = genres_data.copy() if genres_data is not None else None

    def handle_missing_values(self, columns=None):
        """Fills missing values with the column mean.

        :param columns: A column name, or any iterable of column names (list,
            tuple, pandas Index, NumPy array). ``None`` or an empty collection
            means "every numeric column".
        :return: The updated internal DataFrame.
        """
        # Normalise to a plain list first: truth-testing a pandas Index or a
        # NumPy array directly raises "truth value is ambiguous".
        if columns is None:
            target_cols = []
        elif isinstance(columns, str):
            target_cols = [columns]
        else:
            target_cols = list(columns)

        if not target_cols:
            target_cols = list(self.data.select_dtypes(include=[np.number]).columns)

        if target_cols:
            subset = self.data[target_cols]
            self.data[target_cols] = subset.fillna(subset.mean())
        return self.data

    def convert_dtypes(self):
        """Casts the 'year' column to int and the 'explicit' column to bool.

        Both columns must be present; a missing one raises ``KeyError``.
        """
        self.data['year'] = self.data['year'].astype(int)
        self.data['explicit'] = self.data['explicit'].astype(bool)
        return self.data

    def merge_genres(self):
        """Left-merges the genres dataset onto the main dataset on 'id'.

        The merge is skipped (data returned unchanged) when no genres dataset
        was supplied or when either frame lacks an 'id' column.
        """
        if self.genres_data is not None and 'id' in self.data.columns and 'id' in self.genres_data.columns:
            self.data = self.data.merge(self.genres_data, on='id', how='left')
        return self.data

    def split_data(self, target_column, test_size=0.2):
        """Splits the dataset into training and testing sets.

        :param target_column: Column used as ``y``; all other columns form ``X``.
        :param test_size: Passed through to ``train_test_split`` (default: 0.2).
        :return: The ``train_test_split(X, y, ...)`` result, with a fixed
            ``random_state`` of 42.
        :raises KeyError: if ``target_column`` is not in the dataset.
        """
        if target_column not in self.data.columns:
            raise KeyError(f"Target column '{target_column}' not found in dataset.")

        X = self.data.drop(columns=[target_column])
        y = self.data[target_column]
        return train_test_split(X, y, test_size=test_size, random_state=42)


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class RecommendationSystem:
    """Content-based recommender ranking rows of ``data`` by cosine similarity.

    Similarity is computed on the columns named in ``feature_columns``.
    """

    def __init__(self, data, feature_columns):
        """Stores the dataset (not copied) and the feature column names."""
        self.data = data
        self.feature_columns = feature_columns

    def compute_similarity(self, target_row, chunkSize=None):
        """Computes cosine similarity between one song and other songs.

        :param target_row: Positional index of the query song. Negative values
            count from the end of the dataset.
        :param chunkSize: ``None`` compares against the whole dataset. A
            positive integer compares only against the block of ``chunkSize``
            consecutive rows that contains ``target_row`` (the last block may
            be shorter).
        :return: 1-D float32 array of scores. Without ``chunkSize`` entry ``i``
            belongs to dataset row ``i``; with ``chunkSize`` entry ``i``
            belongs to row ``(target_row // chunkSize) * chunkSize + i``.
        :raises ValueError: if ``chunkSize`` is not positive.
        :raises IndexError: if ``target_row`` is outside the dataset.
        """
        n_rows = len(self.data)
        if chunkSize is not None and chunkSize <= 0:
            raise ValueError("chunkSize must be a positive integer or None.")
        if not -n_rows <= target_row < n_rows:
            raise IndexError(f"target_row {target_row} is out of range for {n_rows} rows.")
        position = target_row % n_rows

        if chunkSize is None:
            start, stop = 0, n_rows
        else:
            start = (position // chunkSize) * chunkSize
            stop = min(start + chunkSize, n_rows)

        # Slice the block *containing* the target. Truncating to the first
        # `chunkSize` rows instead would make any later target row unreachable.
        features = self.data[self.feature_columns].iloc[start:stop]

        # Normalisation is per row, so normalising only the block gives the
        # same vectors as normalising the whole dataset and slicing afterwards.
        normalized_data = normalize(features)

        # float32 sparse matrix, as before, to keep memory use down.
        matrix = csr_matrix(normalized_data).astype(np.float32)

        target_vector = matrix[position - start]
        return cosine_similarity(matrix, target_vector, dense_output=False).toarray().flatten()

    def recommend(self, song_index, top_n=5):
        """Recommends the ``top_n`` songs most similar to ``song_index``.

        The query song itself is never part of the result. Fewer than ``top_n``
        rows are returned when the dataset is too small.

        :param song_index: Positional index of the query song.
        :param top_n: Maximum number of recommendations (default: 5).
        :return: Rows of ``self.data``, most similar first.
        """
        scores = self.compute_similarity(song_index)

        # Drop the query row: its similarity to itself is always the maximum,
        # so it would otherwise occupy the first recommendation slot.
        candidates = np.delete(np.arange(scores.shape[0]), song_index)
        k = min(top_n, candidates.size)
        if k <= 0:
            return self.data.iloc[[]]

        candidate_scores = scores[candidates]
        # Partial selection of the k best, then order just those k descending.
        best = np.argpartition(candidate_scores, -k)[-k:]
        best = best[np.argsort(candidate_scores[best])[::-1]]
        return self.data.iloc[candidates[best]]

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import random

class SongClustering:
    """KMeans clustering of songs plus cluster-based recommendations."""

    def __init__(self, df, feature_columns, name_column="artists", n_clusters=3):
        """
        Initialize the SongClustering class.

        :param df: Pandas DataFrame containing song data.
        :param feature_columns: List of feature columns to use for clustering.
        :param name_column: Column name for artist names (default: 'artists').
        :param n_clusters: Number of clusters for KMeans (default: 3).
        """
        self.df = df.copy()  # Keep full dataset for recommendations
        self.data = df[feature_columns].copy()  # Only feature columns for clustering
        self.name_column = name_column
        self.n_clusters = n_clusters

    def split_features(self, test_size=0.2, random_state=42):
        """
        Splits feature data into training and testing sets.

        :return: The ``train_test_split`` result for ``self.data``.
        """
        return train_test_split(self.data, test_size=test_size, random_state=random_state)

    def create_clusters(self):
        """
        Applies KMeans clustering using selected features.
        Writes the labels to a 'cluster' column on both the full dataset
        (`self.df`) and the feature DataFrame (`self.data`).

        Safe to call repeatedly: labels from a previous run are excluded from
        the features the model is fitted on.
        """
        kmeans = KMeans(n_clusters=self.n_clusters, random_state=100, n_init=10)
        # self.data gains a 'cluster' column below; drop it here so a second
        # call does not cluster on the previous labels.
        features = self.data.drop(columns=["cluster"], errors="ignore")
        labels = kmeans.fit_predict(features)
        self.df["cluster"] = labels
        self.data["cluster"] = labels

    def visualize_clusters(self, based_on=["popularity", "tempo"]):
        """
        Visualizes clusters using a scatter plot.

        Requires ``create_clusters()`` to have been called, since points are
        coloured by the 'cluster' column.

        :param based_on: List containing two feature names for visualization.
            The default list is only read, never mutated.
        :raises ValueError: if ``based_on`` does not contain exactly two names.
        """
        if len(based_on) != 2:
            raise ValueError("based_on should contain exactly two feature names.")

        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=self.df[based_on[0]], y=self.df[based_on[1]],
                        hue=self.df['cluster'], palette='viridis', alpha=0.7)
        plt.title("K-Means Clustering Visualization")
        plt.show()

    def get_recommendations_by_cluster(self, artist_name, n=5):
        """
        Recommends songs from the same cluster as the given artist.

        A random song by the artist is used as the seed. Songs from the seed's
        cluster whose popularity is within 10 points of the seed's come first;
        if there are fewer than ``n`` of them, other songs from the same
        cluster fill the remaining slots.

        :param artist_name: Artist name for whom to find recommendations.
        :param n: Number of recommendations to return (default: 5).
        :return: DataFrame with at most ``n`` rows, or ``None`` when no song
            matches ``artist_name``.
        """
        target_index = self.get_random_artist_index_by_name(artist_name)
        if target_index is None:
            return None

        # target_index is positional, hence iloc.
        target_song = self.df.iloc[target_index]
        target_cluster = target_song["cluster"]
        target_popularity = target_song["popularity"]

        recommended_songs = self.df[self.df["cluster"] == target_cluster]

        # Filter based on popularity range
        filtered = recommended_songs[
            (recommended_songs["popularity"] >= target_popularity - 10) &
            (recommended_songs["popularity"] <= target_popularity + 10)
        ]

        # If not enough recommendations, add extra
        if len(filtered) < n:
            additional = recommended_songs[~recommended_songs.index.isin(filtered.index)]
            filtered = pd.concat([filtered, additional.head(n - len(filtered))])

        return filtered.head(n)

    def get_random_artist_index_by_name(self, artist_name):
        """
        Finds the positional index of a random song by a given artist.

        Matching is a case-insensitive substring test against the string form
        of each value in the name column.

        :return: A position usable with ``self.df.iloc``, or ``None`` when
            nothing matches.
        """
        needle = artist_name.lower()
        # Use self.df since self.data does not contain the 'artists' column
        matches = [
            position
            for position, artist in enumerate(self.df[self.name_column].tolist())
            if needle in str(artist).lower()
        ]
        if not matches:
            return None
        # random.choice picks a valid element; randint's inclusive upper bound
        # could select one position past the end of the list.
        return random.choice(matches)



In [1]:
import DataLoadingAndExploration as etl

In [2]:
# Explorer bound to the CSV in the working directory; nothing is read yet.
wrangler = etl.DataExplorer("./data.csv")

In [3]:
# Read the CSV; the DataFrame is also kept on wrangler.data for later calls.
data =  wrangler.load_data()

In [4]:
# Number of rows loaded.
len(data)

170653

In [5]:
# Preview the first rows to see the available columns.
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [6]:
# Per-column count of missing values.
wrangler.check_missing_values()

valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
dtype: int64

In [7]:
# Summary statistics (DataFrame.describe) of the loaded dataset.
wrangler.describe_data()

Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
count,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0,170653.0
mean,0.528587,1976.787241,0.502115,0.537396,230948.3,0.482389,0.084575,0.16701,5.199844,0.205839,-11.46799,0.706902,31.431794,0.098393,116.86159
std,0.263171,25.917853,0.376032,0.176138,126118.4,0.267646,0.278249,0.313475,3.515094,0.174805,5.697943,0.455184,21.826615,0.16274,30.708533
min,0.0,1921.0,0.0,0.0,5108.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0
25%,0.317,1956.0,0.102,0.415,169827.0,0.255,0.0,0.0,2.0,0.0988,-14.615,0.0,11.0,0.0349,93.421
50%,0.54,1977.0,0.516,0.548,207467.0,0.471,0.0,0.000216,5.0,0.136,-10.58,1.0,33.0,0.045,114.729
75%,0.747,1999.0,0.893,0.668,262400.0,0.703,0.0,0.102,8.0,0.261,-7.183,1.0,48.0,0.0756,135.537
max,1.0,2020.0,0.996,0.988,5403500.0,1.0,1.0,1.0,11.0,1.0,3.855,1.0,100.0,0.97,243.507


In [8]:
# NOTE(review): the DataExplorer class defined in this notebook has no
# additional_statistics method; presumably the imported
# DataLoadingAndExploration module provides one - confirm. The recorded run of
# this cell ended in KeyboardInterrupt.
wrangler.additional_statistics()

KeyboardInterrupt: 

In [None]:
wra