## Linear Regression Data Model Optimization: PCA

In [1]:
# Import all dependencies
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, text

# Preprocessing 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures

# Exploratory Data Analysis (EDA)
import seaborn as sns
import matplotlib.pyplot as plt

# Model Selection
from sklearn.linear_model import LinearRegression

# Stats
import statsmodels.api as sm

# Model Evaluation Metrics 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score


## Load and Visualize the Spotify Data

In [2]:
# Open a SQLAlchemy engine on the SQLite file that holds the Spotify 2023 data.
# The path is relative, so the notebook must be run from the project root.
DATABASE_URL = "sqlite:///Resources/spotify_2023.sqlite"
engine = create_engine(DATABASE_URL, echo=False)

In [3]:
# Reflect Database into ORM Classes
Base = automap_base()
# Passing `autoload_with` is enough to trigger reflection; the separate
# `reflect=True` flag is deprecated and only produced a deprecation warning.
Base.prepare(autoload_with=engine)
# Names of the classes automap generated from the reflected tables
MyTable = Base.classes.keys()

# NOTE(review): this session is neither used nor closed in the visible cells;
# the data is loaded through `engine` directly below.
session = Session(engine)

  Base.prepare(autoload_with=engine, reflect=True)


In [4]:
# Load the cleaned Spotify table into a DataFrame through the SQLAlchemy engine
clean_spotify_df = pd.read_sql_table('cleaned_spotify_2023', engine)

# Coerce every column label to a plain str
clean_spotify_df.columns = [str(label) for label in clean_spotify_df.columns]

# Friendlier names: strip the '_%' suffix from the audio-feature columns and
# remove the parentheses from the artist column
column_renames = {
    'artist(s)_name': 'artists_name',
    'danceability_%': 'danceability',
    'valence_%': 'valence',
    'energy_%': 'energy',
    'acousticness_%': 'acousticness',
    'instrumentalness_%': 'instrumentalness',
    'liveness_%': 'liveness',
    'speechiness_%': 'speechiness',
}
cleaned_spotify_df = clean_spotify_df.rename(columns=column_renames)

# Confirm the new column labels
print(cleaned_spotify_df.columns)

Index(['ID', 'track_name', 'artists_name', 'contributing_artist_count',
       'released_year', 'released_month', 'in_spotify_playlists',
       'spotify_chart_rank', 'streams', 'in_apple_playlists',
       'apple_chart_rank', 'in_deezer_playlists', 'deezer_chart_rank',
       'shazam_chart_rank', 'bpm', 'key', 'mode', 'danceability', 'valence',
       'energy', 'acousticness', 'instrumentalness', 'liveness',
       'speechiness'],
      dtype='object')


In [5]:
# Preview the first rows of the renamed DataFrame
cleaned_spotify_df.head()

Unnamed: 0,ID,track_name,artists_name,contributing_artist_count,released_year,released_month,in_spotify_playlists,spotify_chart_rank,streams,in_apple_playlists,...,bpm,key,mode,danceability,valence,energy,acousticness,instrumentalness,liveness,speechiness
0,1,seven feat latto explicit ver,latto jung kook,2,2023,7,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,2,lala,myke towers,1,2023,3,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,3,vampire,olivia rodrigo,1,2023,6,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,4,cruel summer,taylor swift,1,2019,8,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,5,where she goes,bad bunny,1,2023,5,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [6]:
# Drop irrelevant columns 
# new_df = cleaned_spotify_df.drop(columns=['ID', 'track_name', 'artists_name', 'key', 'mode'])
# new_df.head()

In [7]:
# Keep only the target (streams) and the three playlist-count features
# ('in_spotify_playlists', 'in_apple_playlists', 'in_deezer_playlists')
# by dropping every other column.
columns_to_drop = [
    'ID', 'track_name', 'artists_name', 'contributing_artist_count',
    'released_year', 'released_month', 'bpm', 'key', 'mode',
    'danceability', 'valence', 'energy', 'acousticness',
    'instrumentalness', 'liveness',
    'spotify_chart_rank', 'apple_chart_rank', 'deezer_chart_rank',
    'shazam_chart_rank', 'speechiness',
]
new_df = cleaned_spotify_df.drop(columns=columns_to_drop)
new_df.head()

Unnamed: 0,in_spotify_playlists,streams,in_apple_playlists,in_deezer_playlists
0,553,141381703,43,45
1,1474,133716286,48,58
2,1397,140003974,94,91
3,7858,800840817,116,125
4,3133,303236322,84,87


## Remove outliers

In [8]:
# Outlier filter on the target using the 1.5 * IQR rule.
streams = new_df['streams']

# First and third quartiles of the target, and the spread between them
quartiles = np.quantile(streams, [.25, .75])
first_quartile, third_quartile = quartiles
iqr = third_quartile - first_quartile

# Fences: anything further than 1.5 * IQR outside the quartiles is an outlier
lower_bound = first_quartile - (1.5 * iqr)
upper_bound = third_quartile + (1.5 * iqr)

# Boolean masks for rows outside / inside the fences
outside_fences = (streams < lower_bound) | (streams > upper_bound)
inside_fences = (streams >= lower_bound) & (streams <= upper_bound)

# Rows flagged as outliers (kept for inspection) and the filtered frame
potential_outliers = new_df[outside_fences]
cleaned_df = new_df[inside_fences]

# Preview the filtered DataFrame
cleaned_df.head()

Unnamed: 0,in_spotify_playlists,streams,in_apple_playlists,in_deezer_playlists
0,553,141381703,43,45
1,1474,133716286,48,58
2,1397,140003974,94,91
3,7858,800840817,116,125
4,3133,303236322,84,87


In [9]:
# Summarise the outlier-filtered DataFrame: row count, dtypes and non-null counts
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750 entries, 0 to 816
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   in_spotify_playlists  750 non-null    int64
 1   streams               750 non-null    int64
 2   in_apple_playlists    750 non-null    int64
 3   in_deezer_playlists   750 non-null    int64
dtypes: int64(4)
memory usage: 29.3 KB


## Define feature set (X) and target variable (y)

In [10]:
# Define the feature set and target variable
X = cleaned_df.drop(columns=['streams'])  # every column except the target
y = cleaned_df['streams']                 # target column

# Build a separate design matrix with an explicit intercept column for OLS.
# X itself is left untouched so the later scaling / PCA cells receive only the
# real features rather than a zero-variance constant column.
X_with_const = sm.add_constant(X)

# Fit the baseline OLS model on the raw (unscaled) features and show its summary.
initial_model = sm.OLS(y, X_with_const).fit()
print(initial_model.summary())


                            OLS Regression Results                            
Dep. Variable:                streams   R-squared:                       0.553
Model:                            OLS   Adj. R-squared:                  0.551
Method:                 Least Squares   F-statistic:                     307.4
Date:                Wed, 16 Oct 2024   Prob (F-statistic):          7.10e-130
Time:                        20:41:57   Log-Likelihood:                -15399.
No. Observations:                 750   AIC:                         3.081e+04
Df Residuals:                     746   BIC:                         3.083e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 1.573e+08 

## Preprocessing of the Spotify df

In [11]:
# Standardise the feature matrix (zero mean, unit variance per column).
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed further down, so test-set statistics influence the scaling.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)



## Use PCA to reduce the scaled features to two principal components, then use PCA's `explained_variance_ratio_` attribute to calculate the percentage of the total variance captured by those two components.

In [12]:
# Instantiate PCA, keeping two principal components
pca=PCA(n_components=2)

# Fit PCA on the scaled feature matrix and project the rows onto the two components
# NOTE(review): like the scaler, PCA is fitted on all rows before the train/test split
spotify_pca = pca.fit_transform(X_scaled)

# Fraction of the total variance explained by each retained component
pca.explained_variance_ratio_


array([0.6866157 , 0.19789068])

In [13]:
# Wrap the PCA scores in a DataFrame with one column per component (PC1, PC2, ...).
# The outlier filter left gaps in the row labels, so reuse the target's index:
# each row of scores then carries the same label as its target value, and any
# label-based operation (join, concat, arithmetic) stays aligned with `y`.
pca_df = pd.DataFrame(
    data=spotify_pca,
    columns=[f'PC{i+1}' for i in range(spotify_pca.shape[1])],
    index=y.index,
)

## Split the data into training and testing sets

In [14]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    pca_df,
    y,
    test_size=0.2,
    random_state=42,
)

## Fit the linear regression model to the training data and make predictions

In [15]:
# Fit a linear regression on the training split, using the principal
# components as predictors and streams as the target
model = LinearRegression()
model.fit(X_train, y_train)

In [16]:
# Predict the target for the held-out test split
y_pred = model.predict(X_test)

## Evaluate the model's performance

In [17]:
# Compute every metric first, then report them together.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
test_score = model.score(X_test, y_test)     # R^2 on the test split
train_score = model.score(X_train, y_train)  # R^2 on the training split

# Error and goodness of fit on the held-out data
print(f'Model Mean Squared Error: {mse}')
print(f'Model R-squared: {r2}')

# Test vs. training score, to check for over- or under-fitting
print(f'Model performance on the testing data: {test_score}')
print(f'Model performance on the training data: {train_score}')

Model Mean Squared Error: 4.263967722026081e+16
Model R-squared: 0.573871930814909
Model performance on the testing data: 0.573871930814909
Model performance on the training data: 0.532471416681798
