# Linear Regression

## Statistical Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the two pre-split CSV files; paths are relative to the notebook's
# working directory.
train = pd.read_csv('Test Train Data/train.csv')
test = pd.read_csv('Test Train Data/test.csv')

In [4]:
# Summarise the test frame: row count, column names, non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8385 entries, 0 to 8384
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        8385 non-null   int64  
 1   id                8385 non-null   object 
 2   name              8385 non-null   object 
 3   duration_ms       8385 non-null   int64  
 4   explicit          8385 non-null   int64  
 5   artists           8385 non-null   object 
 6   id_artists        8385 non-null   object 
 7   release_date      8385 non-null   object 
 8   danceability      8385 non-null   float64
 9   energy            8385 non-null   float64
 10  key               8385 non-null   int64  
 11  loudness          8385 non-null   float64
 12  mode              8385 non-null   int64  
 13  speechiness       8385 non-null   float64
 14  acousticness      8385 non-null   float64
 15  instrumentalness  8385 non-null   float64
 16  liveness          8385 non-null   float64


In [13]:
# Columns that are passed to the one-hot encoder in the preprocessing step.
categorical_columns = [
    'explicit',
    'mode',
    'key',
    'language',
    'time_signature',
]

# Columns that are passed to the standard scaler in the preprocessing step.
numerical_columns = [
    'duration_ms',
    'tempo',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'valence',
]

In [14]:
# Slice both frames with the same column list (numerical first, then
# categorical) so train and test features share an identical layout.
feature_columns = numerical_columns + categorical_columns

# The regression target is the 'popularity' column of each frame.
X_train, y_train = train[feature_columns], train['popularity']
X_test, y_test = test[feature_columns], test['popularity']

In [15]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numerical ones. handle_unknown='ignore' lets the encoder accept categories
# at transform time that it did not see during fit instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
numerical_scaler = StandardScaler()

preprocessor = make_column_transformer(
    (categorical_encoder, categorical_columns),
    (numerical_scaler, numerical_columns),
)

In [16]:
# Create the regression model: scikit-learn's LinearRegression with its
# default settings (no arguments are overridden).
model = LinearRegression()

In [17]:
# Combine the column transformer and the model into a single Pipeline, so the
# preprocessing is applied to the features before they reach the regressor
# on both fit and predict.
pipeline = make_pipeline(preprocessor, model)

In [18]:
# Train the whole pipeline (preprocessing + regression) on the training set;
# only X_train / y_train are passed to fit.
pipeline.fit(X_train, y_train)

In [19]:
# Predict popularity for the test-set features with the fitted pipeline.
y_pred = pipeline.predict(X_test)

## Model Performance

In [20]:
# Score the test-set predictions against the true popularity values:
# mean squared error and the coefficient of determination (R^2).
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics.
print("Mean Squared Error: ", mse)
print("R^2 Score: ", r2)

Mean Squared Error:  261.71944091809377
R^2 Score:  0.22851661153184266
