In [1]:
# Installs and imports
try:
    import xgboost as xgb
except:
    !pip install xgboost
    import xgboost as xgb
    !pip install --upgrade xgboost

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

In [2]:
# Load the data
data = pd.read_csv('Spotify_Youtube.csv')
data.head()

# Clean the data
data.dropna()
data.fillna(data.median(), inplace=True) 
data.drop('Unnamed: 0', axis=1, inplace=True)
target_col = 'Views' 

# Create new features
data['User_engagement'] = data['Comments'] / data['Views']
data['View_like_ratio'] = data['Likes'] / data['Views']

# Set features and target
features = data.drop(target_col, axis=1)  
target = data[target_col]

data.head()

  data.fillna(data.median(), inplace=True)


Unnamed: 0,Artist,Url_spotify,Track,Album,Album_type,Uri,Danceability,Energy,Key,Loudness,...,Channel,Views,Likes,Comments,Description,Licensed,official_video,Stream,User_engagement,View_like_ratio
0,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Feel Good Inc.,Demon Days,album,spotify:track:0d28khcov6AiegSCpG5TuT,0.818,0.705,6.0,-6.679,...,Gorillaz,693555221.0,6220896.0,169907.0,Official HD Video for Gorillaz' fantastic trac...,True,True,1040235000.0,0.000245,0.00897
1,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Rhinestone Eyes,Plastic Beach,album,spotify:track:1foMv2HQwfQ2vntFf9HFeG,0.676,0.703,8.0,-5.815,...,Gorillaz,72011645.0,1079128.0,31003.0,The official video for Gorillaz - Rhinestone E...,True,True,310083700.0,0.000431,0.014985
2,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,New Gold (feat. Tame Impala and Bootie Brown),New Gold (feat. Tame Impala and Bootie Brown),single,spotify:track:64dLd6rVqDLtkXFYrEUHIU,0.695,0.923,1.0,-3.93,...,Gorillaz,8435055.0,282142.0,7399.0,Gorillaz - New Gold ft. Tame Impala & Bootie B...,True,True,63063470.0,0.000877,0.033449
3,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,On Melancholy Hill,Plastic Beach,album,spotify:track:0q6LuUqGLUiCPP1cbdwFs3,0.689,0.739,2.0,-5.81,...,Gorillaz,211754952.0,1788577.0,55229.0,Follow Gorillaz online:\nhttp://gorillaz.com \...,True,True,434663600.0,0.000261,0.008446
4,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Clint Eastwood,Gorillaz,album,spotify:track:7yMiX7n9SBvadzox8T5jzT,0.663,0.694,10.0,-8.627,...,Gorillaz,618480958.0,6197318.0,155930.0,The official music video for Gorillaz - Clint ...,True,True,617259700.0,0.000252,0.01002


In [3]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Preprocessing pipeline for numeric features
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing pipeline for categorical features
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Create the preprocessing pipeline
preprocessor = ColumnTransformer([
    ('numeric', numeric_transformer, ['Danceability', 'Energy', 'Key', 'Likes', 'Comments', 'Stream', 
                                     'User_engagement', 'View_like_ratio']),
    ('categorical', categorical_transformer, ['Artist', 'Url_spotify', 'Track', 'Album', 'Album_type', 'Uri',
                        'Url_youtube', 'Title', 'Channel'])
])

# Create the Linear Regression model
linear_model = LinearRegression()

# Create the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', linear_model)
])


In [4]:
# Fit the pipeline on training data
pipeline.fit(X_train, y_train)

In [5]:
# Get top 10 predictions
y_train_pred = pipeline.predict(X_train)
top_10_indices = np.argsort(y_train_pred)[::-1][:10]
top_10_predictions = y_train.iloc[top_10_indices]
top_10_features = X_train.iloc[top_10_indices]

# Print top 10 predicting features
print(f"Top 10 predictive features")
for i, (prediction, features) in enumerate(zip(top_10_predictions, top_10_features)):
    print(f"Feature {i+1}: {features}")
print()

Top 10 predictive features
Feature 1: Artist
Feature 2: Url_spotify
Feature 3: Track
Feature 4: Album
Feature 5: Album_type
Feature 6: Uri
Feature 7: Danceability
Feature 8: Energy
Feature 9: Key
Feature 10: Loudness



In [6]:
# Make predictions on test data
y_pred = pipeline.predict(X_test)

# Calculate the accuracy (R-squared) and root mean squared error (RMSE)
accuracy = pipeline.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print the results
print("Accuracy: {:.2f}".format(accuracy))
print("RMSE: {:.2f}".format(rmse))

Accuracy: 0.90
RMSE: 85131701.74
