In [39]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.feature_selection import chi2, mutual_info_classif, f_regression, f_classif

### Loading the modified dataset

In [40]:
df = pd.read_csv('data_merged.csv')
list(df.columns)
del df['mode']
del df['key_6_yr']
del df['popularity_yr']
del df['popularity_ar']

In [41]:
df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,liveness,loudness,popularity,speechiness,...,loudness_yr,speechiness_yr,tempo_yr,valence_yr,key_0_yr,key_1_yr,key_2_yr,key_3_yr,key_4_yr,key_5_yr
0,0.0131,0.256,182347,0.895,0,0.000106,0.0821,-4.86,29,0.0707,...,-11.095111,0.064961,120.209319,0.583424,0,0,1,0,0,0
1,0.98,0.277,206972,0.145,0,0.879,0.111,-19.898,0,0.0845,...,-15.414304,0.092591,110.008113,0.432251,1,0,0,0,0,0
2,0.795,0.685,314667,0.483,0,0.878,0.113,-10.202,1,0.0337,...,-15.342991,0.103243,108.561912,0.447291,0,0,0,1,0,0
3,0.656,0.788,179747,0.808,0,0.0,0.154,-6.59,0,0.0395,...,-15.342991,0.103243,108.561912,0.447291,0,0,0,1,0,0
4,0.302,0.0753,498560,0.15,0,0.884,0.121,-16.705,0,0.0371,...,-15.724956,0.107351,109.569882,0.443625,1,0,0,0,0,0


### Linear Regression

In [64]:
X = df[df.columns.difference(['popularity'])]
y = df['popularity']
X_new = SelectKBest(f_regression, k=25).fit_transform(X, y)
X_new.shape

(172230, 25)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)
clf = LinearRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, y_pred))

Coefficient of determination: 0.52
Mean squared error: 232.10


### Polynomial Regression

In [66]:
poly = PolynomialFeatures(degree=2) ## Degree 3 kills my PC! I think the issue is not enough RAM. 
X_poly = poly.fit_transform(X_new)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_poly, y, test_size=0.2)
clf1 = LinearRegression().fit(X_train1, y_train1)
y_pred1 = clf1.predict(X_test1)
print('Coefficient of determination: %.2f'
      % r2_score(y_test1, y_pred1))
print('Mean squared error: %.2f'
      % mean_squared_error(y_test1, y_pred1))

Coefficient of determination: 0.61
Mean squared error: 184.56


With all features polynomial regression for degree = 2 gives an r2_score of 0.64

###  Logistic Regression

#### With 3 categories

In [45]:
X1 = df[df.columns.difference(['popularity'])]
y1 = pd.cut(x=df["popularity"], bins=[-1,20,50,100], labels=[0,1,2]) ## Threshold can be either 50 (median) or 26 (mean)
X_new1 = SelectKBest(f_classif, k=27).fit_transform(X1, y1)
X_new1.shape

(172230, 27)

Here, a maximum accuracy score of 0.641 is achieved upon selecting 27 features, after which the score more or less plateaus and the optimization function fails to converge within a small number of iterations. 

Increasing the max number of iterations allows it to converge but the accuracy score does not see any improvement. 

In [46]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_new1, y1, test_size=0.20)
clf2 = LogisticRegression(random_state=42).fit(X_train2, y_train2)
y_pred2 = clf2.predict(X_test2)
accuracy_score(y_test2, y_pred2)

0.6412065261568832

Why does the accuracy score give different values everytime I run it? I have fixed the random_state. 

#### With 2 categories

In [67]:
y1 = pd.cut(x=df["popularity"], bins=[-1,50,100], labels=[0,1]) ## Threshold can be either 50 (median) or 26 (mean)
X_new1 = SelectKBest(f_classif, k=27).fit_transform(X1, y1)
X_new1.shape

(172230, 27)

In [68]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_new1, y1, test_size=0.2)
clf2 = LogisticRegression(random_state=42).fit(X_train2, y_train2)
y_pred2 = clf2.predict(X_test2)
accuracy_score(y_test2, y_pred2)

0.8426812982639493

Convergence sometimes happens, sometimes not. 