In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Read both chart extracts and tag every row with its class label before
# stacking them into one frame: 0 marks rows from the "top" file, 1 marks rows
# from the "bottom" file.
top = pd.read_csv('top200.csv').assign(type=0)
bottom = pd.read_csv('bottom200.csv').assign(type=1)
df = pd.concat(
    [top, bottom],
    ignore_index=True,  # renumber rows 0..n-1 across both sources
)
df.head()

Unnamed: 0,title,artist,genre,rank,danceability,energy,key,loudness,mode,speechiness,...,tempo,duration_ms,time_signature,lyrics,winter,spring,summer,autumn,id,type
0,TOUT VA BIEN (feat. Ninho & Naps),Alonzo,"francoton, french hip hop, pop urbaine, rap fr...",1,0.66,0.72,3.0,-5.874,0.0,0.0753,...,99.937,192960.0,4.0,"[Paroles de ""TOUT VA BIEN"" ft. Naps & Ninho] [...",0,1,0,0,6OZwia8loN0aPS0vTvsBjR,0
1,FADE UP,ZEG P,,2,0.758,0.599,4.0,-5.99,0.0,0.0877,...,129.96,219188.0,4.0,"[Paroles de ""FADE UP"" ft. Hamza & SCH] [Intro ...",0,0,1,0,4ZpIuzx91EAPK3VimONbfB,0
2,DIE,Gazo,"drill francais, rap francais",3,0.695,0.63,8.0,-7.16,0.0,0.035,...,130.968,240413.0,4.0,"[Paroles de ""DIE""] [Intro] La mala est gangx E...",0,0,1,0,3D29kjUyWxsT3jUUTtARVQ,0
3,PETETE,Gambi,"french hip hop, pop urbaine, rap francais",4,0.752,0.669,10.0,-9.817,0.0,0.251,...,155.997,123846.0,4.0,"[Paroles de ""PETETE""] [Intro] Pew Grr Pew pew ...",0,0,1,0,0z3bi63SNZ5ylyHOzb81Uq,0
4,Time Time,Trei Degete,,5,0.715,0.799,0.0,-6.447,0.0,0.0433,...,128.925,156373.0,4.0,"[Paroles de ""Time Time""] [Couplet 1 : Squeezie...",0,0,0,1,5wKDPtbdggE1roeVp3UdXX,0


In [3]:
# Remove the descriptive / identifier columns (and the chart rank) so that only
# the audio features, the season flags and the `type` label remain.
df = df.drop(
    columns=['title', 'artist', 'rank', 'lyrics', 'genre', 'id'],
)
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,winter,spring,summer,autumn,type
0,0.66,0.72,3.0,-5.874,0.0,0.0753,0.301,0.0,0.226,0.628,99.937,192960.0,4.0,0,1,0,0,0
1,0.758,0.599,4.0,-5.99,0.0,0.0877,0.57,1e-06,0.129,0.557,129.96,219188.0,4.0,0,0,1,0,0
2,0.695,0.63,8.0,-7.16,0.0,0.035,0.229,0.0,0.118,0.55,130.968,240413.0,4.0,0,0,1,0,0
3,0.752,0.669,10.0,-9.817,0.0,0.251,0.167,0.0,0.0795,0.627,155.997,123846.0,4.0,0,0,1,0,0
4,0.715,0.799,0.0,-6.447,0.0,0.0433,0.0141,0.0035,0.342,0.916,128.925,156373.0,4.0,0,0,0,1,0


In [7]:
# Percentage of missing values per column, sorted from lowest to highest.
# NOTE(review): this cell's execution counter (In [7]) is later than the fill
# cells that follow it (In [5], In [6]), so the displayed zeros appear to
# describe the frame after NaNs were already replaced — confirm with a
# top-to-bottom re-run.
df.isna().sum().div(len(df)).sort_values().mul(100)

danceability        0.0
summer              0.0
spring              0.0
winter              0.0
time_signature      0.0
duration_ms         0.0
tempo               0.0
valence             0.0
liveness            0.0
instrumentalness    0.0
acousticness        0.0
speechiness         0.0
mode                0.0
loudness            0.0
key                 0.0
energy              0.0
autumn              0.0
type                0.0
dtype: float64

In [5]:
def fill_empty(df):
    """Replace missing values with 0 in the audio-feature columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``fill0``. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, so callers may either rely
        on the in-place change or re-assign the result.

    Raises
    ------
    KeyError
        If any column named in ``fill0`` is absent from ``df``.
    """
    fill0 = [
        'time_signature', 'duration_ms', 'tempo', 'valence', 'danceability',
        'acousticness', 'speechiness', 'mode', 'loudness', 'key', 'energy',
        'instrumentalness', 'liveness',
    ]

    # Write the filled columns back onto the frame itself. The former
    # ``df[col].fillna(0, inplace=True)`` was chained assignment: it acted on
    # an intermediate Series, which raises a FutureWarning on pandas 2.x and
    # leaves ``df`` untouched once Copy-on-Write is active (pandas 3.0).
    df[fill0] = df[fill0].fillna(0)

    return df

In [6]:
# Zero-fill the audio-feature columns; fill_empty returns the frame it was
# given, so `df` keeps referring to the same data.
df = fill_empty(df)

In [8]:
# Quick size check: (number of rows, number of columns) of the working frame.
df.shape

(2372, 18)

In [9]:
# Separate the `type` label from the predictor columns, then hold out 20% of
# the rows as a test set. The fixed random_state keeps the split reproducible.
y = df['type']
X = df.drop(columns='type')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [14]:
from sklearn.preprocessing import RobustScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif 
# Three-stage pipeline: RobustScaler -> SelectKBest scored with f_classif ->
# LogisticRegression with a raised iteration cap.
# NOTE(review): the frame has 18 columns including `type`, so k=17 appears to
# keep every predictor — confirm whether feature selection is intended here.
model = make_pipeline(
    RobustScaler(),
    SelectKBest(score_func=f_classif, k=17),
    LogisticRegression(max_iter=1000),
)

In [15]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

In [16]:
def evaluation(model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the confusion matrix first, then the classification
    report. Returns nothing.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Same two reports, in the same order, each comparing truth vs. prediction.
    for metric in (confusion_matrix, classification_report):
        print(metric(y_test, predictions))

In [17]:
# Fit the pipeline on the training split and print its confusion matrix and
# classification report for the held-out test split.
evaluation(model)

[[234  45]
 [129  67]]
              precision    recall  f1-score   support

           0       0.64      0.84      0.73       279
           1       0.60      0.34      0.44       196

    accuracy                           0.63       475
   macro avg       0.62      0.59      0.58       475
weighted avg       0.63      0.63      0.61       475

