In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing Libraries**

In [3]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical tests
from scipy import stats

# Machine learning models and evaluation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    mean_squared_error, r2_score, roc_auc_score, roc_curve
)

# Ignore warnings (optional)
import warnings
warnings.filterwarnings('ignore')

# Display settings
%matplotlib inline
plt.style.use('seaborn')

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

**Loading the Dataset**

In [None]:
# Load the datasets
df_low = pd.read_csv('/kaggle/input/spotify-music-dataset/low_popularity_spotify_data.csv')
df_high = pd.read_csv('/kaggle/input/spotify-music-dataset/high_popularity_spotify_data.csv')

# Add a label to identify low and high popularity songs
df_low['Popularity_Label'] = 'Low'
df_high['Popularity_Label'] = 'High'

# Combine the datasets
df = pd.concat([df_low, df_high], ignore_index=True)

# Display the first few rows
df.head()

**3 Data Exploration and Preprocessing**

In [None]:
# Check the shape of the combined dataset
print(f'Total records: {df.shape[0]}')
print(f'Total features: {df.shape[1]}')

# View summary information
df.info()

** Handling Missing Values**

In [None]:
# Check for missing values in each column
missing_values = df.isnull().sum()
print(missing_values)

In [None]:
# Calculate the percentage of missing values
missing_percent = (df.isnull().sum() / len(df)) * 100
print(missing_percent)

In [None]:
# Remove rows with missing values
df_cleaned = df.dropna()

# Verify that missing values are gone
print(df_cleaned.isnull().sum())


In [None]:
# Visualize missing values using a heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(df_cleaned.isnull(), cbar=False)
plt.title('Missing Values Heatmap')
plt.show()


In [None]:
# Drop columns with more than 50% missing values (if any)
threshold = 0.5
df_cleaned = df_cleaned.loc[:, df_cleaned.isnull().mean() < threshold]

# Impute missing numerical values with median
numerical_features = df_cleaned.select_dtypes(include=[np.number]).columns.tolist()
for feature in numerical_features:
    df_cleaned[feature].fillna(df_cleaned[feature].median(), inplace=True)

# Impute missing categorical values with mode
categorical_features = df.select_dtypes(include=[object]).columns.tolist()
for feature in categorical_features:
    df[feature].fillna(df[feature].mode()[0], inplace=True)

**Data Type Conversion**

In [None]:
# Convert 'track_album_release_date' to datetime
df_cleaned['track_album_release_date'] = pd.to_datetime(df_cleaned['track_album_release_date'], errors='coerce')

# Extract year, month, and day from 'track_album_release_date'
df_cleaned['Release Year'] = df_cleaned['track_album_release_date'].dt.year
df_cleaned['Release Month'] = df_cleaned['track_album_release_date'].dt.month
df_cleaned['Release Day'] = df_cleaned['track_album_release_date'].dt.day
# Handle missing values in the new date features if necessary
df_cleaned['Release Year'].fillna(df_cleaned['Release Year'].median(), inplace=True)
df_cleaned['Release Month'].fillna(df_cleaned['Release Month'].median(), inplace=True)
df_cleaned['Release Day'].fillna(df_cleaned['Release Day'].median(), inplace=True)

# Convert 'key' and 'mode' to categorical
df_cleaned['key'] = df_cleaned['key'].astype('category')
df_cleaned['mode'] = df_cleaned['mode'].astype('category')

# Update lists of numerical and categorical features
numerical_features = df_cleaned.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df_cleaned.select_dtypes(include=['object', 'category']).columns.tolist()

# Exclude non-feature columns
non_feature_columns = [
    'track_name', 'track_artist', 'track_album_name', 'track_id', 'track_album_id',
    'playlist_name', 'playlist_id', 'track_popularity', 'Popularity_Label', 'id', 
    'track_href', 'analysis_url', 'uri', 'type', 'track_album_release_date'
]

# Define features
features = df_cleaned.columns.drop(non_feature_columns, errors='ignore').tolist()

** Feature Engineering**

In [None]:
# Encoding categorical variables using One-Hot Encoding
categorical_features_to_encode = ['key', 'mode', 'playlist_genre', 'playlist_subgenre']

df_encoded = pd.get_dummies(df_cleaned, columns=categorical_features_to_encode, drop_first=True)

# Update feature list after encoding
features = [feature for feature in df_encoded.columns if feature not in non_feature_columns + ['Popularity_Label', 'Popular']]

# Scaling numerical features
scaler = StandardScaler()
df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])


**Exploratory Data Analysis (EDA)**

In [None]:
# List of numerical features (excluding the target variable and IDs)
numerical_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
    'time_signature', 'Release Year', 'Release Month', 'Release Day'
]

# Display summary statistics
df_cleaned[numerical_features].describe().transpose()

**Summary Statistics for Categorical Features**

In [None]:
# List of categorical features
categorical_features = ['key', 'mode', 'playlist_genre', 'playlist_subgenre']

# Display count of unique values and top frequencies
df_cleaned[categorical_features].describe().transpose()

** **Distribution Analysis****

Let's visualize the distribution of numerical features and analyze differences between low and high-popularity songs.


**Histograms for Numerical Features**

In [None]:
# Set up the matplotlib figure
fig, axes = plt.subplots(len(numerical_features), 1, figsize=(10, len(numerical_features)*3))

# Plot histograms
for idx, feature in enumerate(numerical_features):
    sns.histplot(data=df_cleaned, x=feature, hue='Popularity_Label', kde=True, ax=axes[idx])
    axes[idx].set_title(f'Distribution of {feature} by Popularity Label')

plt.tight_layout()
plt.show()

**** Boxplots for Numerical Features********

In [None]:
# Set up the matplotlib figure
fig, axes = plt.subplots(len(numerical_features), 1, figsize=(10, len(numerical_features)*3))

# Plot boxplots
for idx, feature in enumerate(numerical_features):
    sns.boxplot(data=df_cleaned, x='Popularity_Label', y=feature, ax=axes[idx])
    axes[idx].set_title(f'Boxplot of {feature} by Popularity Label')

plt.tight_layout()
plt.show()

**Correlation Matrix**

In [None]:
# Compute the correlation matrix
corr_matrix = df_cleaned[numerical_features + ['track_popularity']].corr()

# Plot the heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='RdBu_r', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

**Analysis of Categorical Features**



 Count Plots for Categorical Features

In [None]:
# Plot count plots for each categorical feature
for feature in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df_cleaned, x=feature, hue='Popularity_Label')
    plt.title(f'Count Plot of {feature} by Popularity Label')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

**Feature Relationships with Popularity**

 
 Scatter Plots

In [None]:
# Define a list of features to plot against popularity
features_to_plot = ['danceability', 'energy', 'loudness', 'valence', 'tempo']

# Plot scatter plots
for feature in features_to_plot:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df_cleaned, x=feature, y='track_popularity', hue='Popularity_Label')
    plt.title(f'{feature.capitalize()} vs. Track Popularity')
    plt.tight_layout()
    plt.show()

** Violin Plots**

In [None]:
# Plot violin plots to show the distribution of features
for feature in numerical_features:
    plt.figure(figsize=(10, 6))
    sns.violinplot(data=df_cleaned, x='Popularity_Label', y=feature)
    plt.title(f'Violin Plot of {feature} by Popularity Label')
    plt.tight_layout()
    plt.show()

Pair Plot

In [None]:
# Select a subset of features to include in the pair plot to avoid clutter
selected_features = ['danceability', 'energy', 'loudness', 'valence', 'tempo', 'track_popularity']

# Create a pair plot
sns.pairplot(df_cleaned[selected_features + ['Popularity_Label']], hue='Popularity_Label')
plt.show()

**Statistical Tests**

** T-Tests Between Low and High-Popularity Songs**

We'll perform t-tests to determine if the difference in means of features between low and high-popularity songs is statistically significant.

In [None]:
from scipy.stats import ttest_ind

# Separate the dataframe into low and high popularity songs
df_low_pop = df_cleaned[df_cleaned['Popularity_Label'] == 'Low']
df_high_pop = df_cleaned[df_cleaned['Popularity_Label'] == 'High']

# Perform t-tests for numerical features
significant_features = []
alpha = 0.05  # Significance level

for feature in numerical_features:
    stat, p = ttest_ind(df_low_pop[feature], df_high_pop[feature], nan_policy='omit')
    print(f'{feature}: p-value = {p:.4f}')
    if p < alpha:
        significant_features.append(feature)
        print(f'  -> Significant difference in means (p < {alpha})\n')
    else:
        print(f'  -> No significant difference in means (p >= {alpha})\n')

**Correlation with Target Variable**

 Calculating Correlation with Track Popularity

In [None]:
# Calculate Pearson correlation coefficients between numerical features and track popularity
feature_correlations = df_cleaned[numerical_features + ['track_popularity']].corr()['track_popularity'].sort_values(ascending=False)

# Display the correlations
print('Correlation with Track Popularity:')
print(feature_correlations)

**Visualizing Feature Correlations**

In [None]:
# Plot the correlations
plt.figure(figsize=(8, 6))
feature_correlations.drop('track_popularity').plot(kind='bar')
plt.title('Correlation of Features with Track Popularity')
plt.xlabel('Features')
plt.ylabel('Correlation Coefficient')
plt.tight_layout()
plt.show()

**Analysis of Release Date Features**¶

 
 Popularity Over Release Years

In [None]:
# Average popularity by release year
popularity_by_year = df_cleaned.groupby('Release Year')['track_popularity'].mean()

# Plot the trend
plt.figure(figsize=(12, 6))
popularity_by_year.plot()
plt.title('Average Track Popularity Over Years')
plt.xlabel('Release Year')
plt.ylabel('Average Popularity')
plt.tight_layout()
plt.show()

** Number of Tracks Released Over Years**

In [None]:
# Count of tracks by release year
tracks_per_year = df_cleaned.groupby('Release Year').size()

# Plot the counts
plt.figure(figsize=(12, 6))
tracks_per_year.plot(kind='bar')
plt.title('Number of Tracks Released Over Years')
plt.xlabel('Release Year')
plt.ylabel('Number of Tracks')
plt.tight_layout()
plt.show()

**Categorical Feature Analysis**

 
 Popularity by Playlist Genre

In [None]:
# Average popularity by playlist genre
popularity_by_genre = df_cleaned.groupby('playlist_genre')['track_popularity'].mean().sort_values(ascending=False)

# Plot the top genres
plt.figure(figsize=(12, 6))
popularity_by_genre.plot(kind='bar')
plt.title('Average Track Popularity by Playlist Genre')
plt.xlabel('Playlist Genre')
plt.ylabel('Average Popularity')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

**Popularity by Playlist Subgenre**

In [None]:
# Average popularity by playlist subgenre
popularity_by_subgenre = df_cleaned.groupby('playlist_subgenre')['track_popularity'].mean().sort_values(ascending=False)

# Plot the top subgenres
plt.figure(figsize=(12, 6))
popularity_by_subgenre.head(20).plot(kind='bar')
plt.title('Average Track Popularity by Playlist Subgenre (Top 20)')
plt.xlabel('Playlist Subgenre')
plt.ylabel('Average Popularity')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

** Mode and Key Distribution**

In [None]:
# Mode distribution
plt.figure(figsize=(8, 6))
sns.countplot(data=df_cleaned, x='mode', hue='Popularity_Label')
plt.title('Distribution of Mode by Popularity Label')
plt.xlabel('Mode')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

# Key distribution
plt.figure(figsize=(12, 6))
sns.countplot(data=df_cleaned, x='key', hue='Popularity_Label')
plt.title('Distribution of Key by Popularity Label')
plt.xlabel('Key')
plt.ylabel('Count')
plt.tight_layout()
plt.show()


** Feature Engineering**

Handling Skewed Features


Some features might be skewed. We can apply transformations if necessary.

In [None]:
# Check skewness
skewed_features = df_cleaned[numerical_features].skew().sort_values(ascending=False)
print('Skewness of Numerical Features:')
print(skewed_features)

**Log Transformation of Highly Skewed Features**

In [None]:
# Identify features with high skewness
high_skewness = skewed_features[abs(skewed_features) > 0.5].index.tolist()

# Apply log transformation to positively skewed features
for feature in high_skewness:
    if (df_cleaned[feature] > 0).all():
        df_cleaned[feature] = np.log1p(df_cleaned[feature])

**Updating Feature Lists**

In [None]:
# Recalculate numerical features if any transformations changed data types
numerical_features = df_cleaned.select_dtypes(include=[np.number]).columns.tolist()

# Remove target variables and IDs from features
non_feature_columns = [
    'track_name', 'track_artist', 'track_album_name', 'track_id', 'track_album_id',
    'playlist_name', 'playlist_id', 'track_popularity', 'Popularity_Label', 'Popular',
    'id', 'track_href', 'analysis_url', 'uri', 'type', 'track_album_release_date'
]

features = [feature for feature in df_cleaned.columns if feature not in non_feature_columns]

**Encoding Categorical Variables**

* Identifying Categorical Variables
  
First, let's identify the categorical variables that need to be encoded.

In [None]:
# List of categorical features to encode
categorical_features = ['key', 'mode', 'playlist_genre', 'playlist_subgenre']

**Encoding Using One-Hot Encoding**

We'll use One-Hot Encoding to convert categorical variables into a format that can be provided to ML algorithms.

In [None]:
# Perform One-Hot Encoding on categorical features
df_encoded = pd.get_dummies(df_cleaned, columns=categorical_features, drop_first=True)

# Display the first few rows of the encoded DataFrame
df_encoded.head()

**Scaling Numerical Features**

Identifying Numerical Features¶

In [None]:
# List of numerical features
numerical_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
    'time_signature', 'Release Year', 'Release Month', 'Release Day'
]

**Scaling Using StandardScaler**

In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler to the numerical features
df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])

# Display the first few rows of the scaled DataFrame
df_encoded.head()

 **Final Feature Set for Modeling**
 
 Defining Features and Targets

In [None]:
# Create the 'Popular' column first (using track_popularity as a basis)
df_encoded['Popular'] = (df_encoded['track_popularity'] > 50).astype(int)

# Target variables
y_classification = df_encoded['Popular']  # Binary classification target
y_regression = df_encoded['track_popularity']  # Regression target

# Drop non-feature columns for modeling
non_feature_columns = [
    'track_name', 'track_artist', 'track_name', 'track_href',
    'playlist_name', 'track_popularity', 'Popular',
    # Add any other non-feature columns you want to exclude
]

# Define feature matrix
X = df_encoded.drop(columns=non_feature_columns, errors='ignore')

# Verify shapes
print(f'Feature matrix shape: {X.shape}')
print(f'Classification target shape: {y_classification.shape}')
print(f'Regression target shape: {y_regression.shape}')

# You might also want to see the distribution of your new target variable
print("\nDistribution of 'Popular' classes:")
print(y_classification.value_counts(normalize=True))