## Importação

### Importando blibiotecas

In [104]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

### Importando Dataframes

In [105]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
pd.set_option("display.max_columns", None)

## Análise exploratória e pré processamento (Train)

In [106]:
df_train.head()

Unnamed: 0,track_unique_id,track_id,artists,album_name,track_name,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,popularity_target
0,41996,7hUhmkALyQ8SX9mJs5XI3D,Love and Rockets,Love and Rockets,Motorcycle,211533,False,0.305,0.849,9,-10.795,1,0.0549,5.8e-05,0.0567,0.464,0.32,141.793,4,goth,0
1,76471,5x59U89ZnjZXuNAAlc8X1u,Filippa Giordano,Filippa Giordano,"Addio del passato - From ""La traviata""",196000,False,0.287,0.19,7,-12.03,0,0.037,0.93,0.000356,0.0834,0.133,83.685,4,opera,0
2,54809,70Vng5jLzoJLmeLu3ayBQq,Susumu Yokota,Symbol,Purple Rose Minuet,216506,False,0.583,0.509,1,-9.661,1,0.0362,0.777,0.202,0.115,0.544,90.459,3,idm,1
3,16326,1cRfzLJapgtwJ61xszs37b,Franz Liszt;YUNDI,Relajación y siestas,"Liebeslied (Widmung), S. 566",218346,False,0.163,0.0368,8,-23.149,1,0.0472,0.991,0.899,0.107,0.0387,69.442,3,classical,0
4,109799,47d5lYjbiMy0EdMRV8lRou,Scooter,Scooter Forever,The Darkside,173160,False,0.647,0.921,2,-7.294,1,0.185,0.000939,0.371,0.131,0.171,137.981,4,techno,0


### Checagem de nulos

In [107]:
df_train.isnull().sum()

track_unique_id      0
track_id             0
artists              0
album_name           0
track_name           0
duration_ms          0
explicit             0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
track_genre          0
popularity_target    0
dtype: int64

### Separação de Colunas categoricas e numericas

In [108]:
df_train.dtypes

track_unique_id        int64
track_id              object
artists               object
album_name            object
track_name            object
duration_ms            int64
explicit                bool
danceability         float64
energy               float64
key                    int64
loudness             float64
mode                   int64
speechiness          float64
acousticness         float64
instrumentalness     float64
liveness             float64
valence              float64
tempo                float64
time_signature         int64
track_genre           object
popularity_target      int64
dtype: object

In [109]:
categorical_cols = df_train.select_dtypes(include=['object']).columns
numerical_cols = df_train.select_dtypes(include=[np.number]).columns

### Encoding das variaveis

In [110]:
for coluna in categorical_cols:
    unique_values_count = df_train[coluna].nunique()
    print(f"Coluna '{coluna}' - Número de valores únicos: {unique_values_count}")

Coluna 'track_id' - Número de valores únicos: 66720
Coluna 'artists' - Número de valores únicos: 25775
Coluna 'album_name' - Número de valores únicos: 37315
Coluna 'track_name' - Número de valores únicos: 55767
Coluna 'track_genre' - Número de valores únicos: 114


In [111]:
label_encoder = LabelEncoder()


for col in categorical_cols:
    df_train[col] = label_encoder.fit_transform(df_train[col])

df_train.head()

Unnamed: 0,track_unique_id,track_id,artists,album_name,track_name,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre,popularity_target
0,41996,64239,13481,18054,29483,211533,False,0.305,0.849,9,-10.795,1,0.0549,5.8e-05,0.0567,0.464,0.32,141.793,4,41,0
1,76471,50962,7754,10815,1549,196000,False,0.287,0.19,7,-12.03,0,0.037,0.93,0.000356,0.0834,0.133,83.685,4,76,0
2,54809,60021,21526,28726,35849,216506,False,0.583,0.509,1,-9.661,1,0.0362,0.777,0.202,0.115,0.544,90.459,3,54,1
3,16326,13905,8029,24834,25598,218346,False,0.163,0.0368,8,-23.149,1,0.0472,0.991,0.899,0.107,0.0387,69.442,3,16,0
4,109799,35296,20008,26251,44199,173160,False,0.647,0.921,2,-7.294,1,0.185,0.000939,0.371,0.131,0.171,137.981,4,109,0


In [112]:
df_train.drop(columns=['track_unique_id', 'track_id', 'artists','track_name'], inplace=True)

## Análise exploratória e pré processamento (Test)

In [113]:
df_test.head()

Unnamed: 0,track_unique_id,track_id,artists,album_name,track_name,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,113186,6KwkVtXm8OUp2XffN5k7lY,Hillsong Worship,No Other Name,No Other Name,440247,False,0.369,0.598,7,-6.984,1,0.0304,0.00511,0.0,0.176,0.0466,148.014,4,world-music
1,42819,2dp5I5MJ8bQQHDoFaNRFtX,Internal Rot,Grieving Birth,Failed Organum,93933,False,0.171,0.997,7,-3.586,1,0.118,0.00521,0.801,0.42,0.0294,122.223,4,grindcore
2,59311,5avw06usmFkFrPjX8NxC40,Zhoobin Askarieh;Ali Sasha,Noise A Noise 20.4-1,"Save the Trees, Pt. 1",213578,False,0.173,0.803,9,-10.071,0,0.144,0.613,0.00191,0.195,0.0887,75.564,3,iranian
3,91368,75hT0hvlESnDJstem0JgyR,Bryan Adams,All I Want For Christmas Is You,Merry Christmas,151387,False,0.683,0.511,6,-5.598,1,0.0279,0.406,0.000197,0.111,0.598,109.991,3,rock
4,61000,4bY2oZGA5Br3pTE1Jd1IfY,Nogizaka46,バレッタ TypeD,月の大きさ,236293,False,0.555,0.941,9,-3.294,0,0.0481,0.484,0.0,0.266,0.813,92.487,4,j-idol


### Checagem de nulos

In [114]:
df_test.isnull().sum()

track_unique_id     0
track_id            0
artists             1
album_name          1
track_name          1
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

### Separação de Colunas categoricas e numericas

In [115]:
df_test.dtypes

track_unique_id       int64
track_id             object
artists              object
album_name           object
track_name           object
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre          object
dtype: object

In [116]:
categorical_cols = df_test.select_dtypes(include=['object']).columns
numerical_cols = df_test.select_dtypes(include=[np.number]).columns

### Encoding das variaveis

In [117]:
for coluna in categorical_cols:
    unique_values_count = df_test[coluna].nunique()
    print(f"Coluna '{coluna}' - Número de valores únicos: {unique_values_count}")

Coluna 'track_id' - Número de valores únicos: 31331
Coluna 'artists' - Número de valores únicos: 15308
Coluna 'album_name' - Número de valores únicos: 20582
Coluna 'track_name' - Número de valores únicos: 27539
Coluna 'track_genre' - Número de valores únicos: 114


In [118]:
label_encoder = LabelEncoder()


for col in categorical_cols:
    df_test[col] = label_encoder.fit_transform(df_test[col])

df_test.head()

Unnamed: 0,track_unique_id,track_id,artists,album_name,track_name,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,113186,25507,5751,11698,15371,440247,False,0.369,0.598,7,-6.984,1,0.0304,0.00511,0.0,0.176,0.0466,148.014,4,113
1,42819,10640,6116,7004,7331,93933,False,0.171,0.997,7,-3.586,1,0.118,0.00521,0.801,0.42,0.0294,122.223,4,42
2,59311,22435,14892,11747,19101,213578,False,0.173,0.803,9,-10.071,0,0.144,0.613,0.00191,0.195,0.0887,75.564,3,59
3,91368,28552,2086,1075,14049,151387,False,0.683,0.511,6,-5.598,1,0.0279,0.406,0.000197,0.111,0.598,109.991,3,90
4,61000,18449,9792,20017,27171,236293,False,0.555,0.941,9,-3.294,0,0.0481,0.484,0.0,0.266,0.813,92.487,4,61


In [119]:
df_test.drop(columns=['track_id', 'artists','track_name'], inplace=True)

## Modelo

In [120]:
X = df_train.drop('popularity_target', axis=1)
y = df_train['popularity_target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [121]:
model = RandomForestClassifier(
    n_estimators=280, 
    max_depth=30, 
    min_samples_split=6, 
    min_samples_leaf=1, 
    bootstrap=False, 
    random_state=42
)


model.fit(X_train, y_train)


y_pred = model.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.82


In [122]:

y_test_pred = model.predict(df_test.drop('track_unique_id', axis=1))


df_results = pd.DataFrame({
    'track_unique_id': df_test['track_unique_id'],
    'popularity_target': y_test_pred
})


print(df_results.head())


df_results.to_csv('submission.csv', index=False)

   track_unique_id  popularity_target
0           113186                  1
1            42819                  0
2            59311                  0
3            91368                  0
4            61000                  0
