In [1]:
# Import the necessary libraries to use for this assignment
import pandas as pd
import csv
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

In [2]:
# Load the dataset
dfSpotYou = pd.read_csv('Spotify_Youtube.csv', sep=',', on_bad_lines='skip')

In [3]:
# Pre-process necessary features:

# Drop unnecessary columns
dfSpotYou = dfSpotYou.drop(['Artist', 'Album', 'Album_type', 'Uri', 'Title', 'Channel', 'Description', 'Licensed', 'official_video', 'Unnamed: 0', 'Url_spotify', 'Url_youtube'], axis=1)

# Drop rows with missing values
dfSpotYou = dfSpotYou.dropna()

# Apply feature scaling to numerical columns
scaler = MinMaxScaler()
dfSpotYou[['Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness',
    'Liveness', 'Valence', 'Tempo']] = scaler.fit_transform(dfSpotYou[['Danceability', 'Energy', 'Loudness',
                                                                'Speechiness', 'Acousticness',
                                                                'Instrumentalness', 'Liveness',
                                                                'Valence', 'Tempo']])

# Convert categorical columns to numerical using one-hot encoding
dfSpotYou = pd.get_dummies(dfSpotYou)

In [4]:
# Design two new features
dfSpotYou['Popularity_score'] = dfSpotYou['Views'] + dfSpotYou['Likes'] + dfSpotYou['Comments']
dfSpotYou['Duration_minutes'] = dfSpotYou['Duration_ms'] / 60000

In [5]:
# Predict number of Spotify streams

# Select relevant features and target variable
features = ['Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_ms', 'Popularity_score', 'Duration_minutes']
target = 'Stream'

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dfSpotYou[features], dfSpotYou[target], test_size=0.2, random_state=42)

# Define the pipeline steps
pipeline = Pipeline([
    ('scaling', StandardScaler()),  # Data scaling step
    ('model', LinearRegression())  # Model training step
])

# Start the timer
start_time = time.time()

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = pipeline.predict(X_test)

# Stop the timer and calculate the elapsed time
elapsed_time = time.time() - start_time

# Evaluate the model using mean squared error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Print the MSE score and elapsed time
print("Mean Squared Error (MSE):", mse)
print("Elapsed Time:", elapsed_time, "seconds")


Mean Squared Error (MSE): 3.471938667761583e+16
Elapsed Time: 0.04188680648803711 seconds


In [6]:
# Find top 10 predictive features according to 3 different methods

# Calculate feature-target correlations
correlations = dfSpotYou[features + [target]].corr()

# Sort features by their correlation with the target variable
top_features_cor = correlations[target].abs().sort_values(ascending=False).index.tolist()[1:11]

print('Correlation:', top_features_cor)

# Perform univariate feature selection
selector = SelectKBest(f_regression, k=10)
selector.fit(X_train, y_train)

# Get the selected features
top_features_univariate = [features[i] for i in selector.get_support(indices=True)]

print('\nUnivariate Feature Selection:', top_features_univariate)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
top_features_lasso = [features[i] for i, coef in enumerate(lasso.coef_) if coef != 0][:10]

print('\nCoef:', top_features_lasso)

Correlation: ['Popularity_score', 'Loudness', 'Acousticness', 'Instrumentalness', 'Danceability', 'Energy', 'Liveness', 'Speechiness', 'Valence', 'Duration_minutes']

Univariate Feature Selection: ['Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Popularity_score']

Coef: ['Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo']


  model = cd_fast.enet_coordinate_descent(


In [7]:
# Report score/accuracy in at least 2 different formats
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Example 2: Reporting R-squared (coefficient of determination)
r2 = pipeline.score(X_test, y_test)
print(f"R-squared: {r2}")

Mean Squared Error: 3.471938667761583e+16
R-squared: 0.3775322948825087
