In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [51]:
def plot_confusion_matrix(model, X_test, y_test, labels):
    """Draw the confusion matrix of ``model`` evaluated on ``(X_test, y_test)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True targets, compared against class ids ``0 .. len(labels) - 1``.
        labels: Display names, one per class id, used as axis tick labels.
    """
    class_ids = range(len(labels))
    matrix = confusion_matrix(y_test, model.predict(X_test), labels=class_ids)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels)
    # Tick labels are rotated vertically.
    display.plot(xticks_rotation='vertical')
    plt.tight_layout()
    plt.show()

def print_classification_report(model, X_test, y_test, labels):
    """Print scikit-learn's classification report for ``model`` on ``(X_test, y_test)``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix handed to ``model.predict``.
        y_test: True targets encoded as class ids ``0 .. len(labels) - 1``.
        labels: Display names, one per class id, in class-id order.
    """
    y_pred = model.predict(X_test)
    # Pin the class ids explicitly (same convention as plot_confusion_matrix).
    # Without ``labels``, scikit-learn infers the classes from y_test/y_pred, and
    # ``target_names`` no longer lines up when a class is absent from this split.
    class_ids = list(range(len(labels)))
    report = classification_report(
        y_test, y_pred, labels=class_ids, target_names=labels
    )
    print(report)

def imputar_valores(data, feature_to_impute, features_available):
    """Fill missing values of one column, in place, using a linear regression.

    A ``LinearRegression`` is fit on the rows where both the target column and
    all predictor columns are present, then used to predict the target for rows
    where it is missing but every predictor is present. Rows with a missing
    predictor are left untouched.

    Args:
        data: DataFrame to modify in place.
        feature_to_impute: Name of the column whose missing values are filled.
        features_available: List of column names used as regression predictors.

    Returns:
        None. ``data`` is mutated in place.

    Raises:
        ValueError: If there are rows to impute but no complete rows to fit on.
    """
    predictors_ok = data[features_available].notnull().all(axis=1)
    target_missing = data[feature_to_impute].isnull()

    # Nothing to impute: return before touching the regressor, since predicting
    # on zero rows makes scikit-learn raise.
    to_fill = target_missing & predictors_ok
    if not to_fill.any():
        return

    to_fit = ~target_missing & predictors_ok
    if not to_fit.any():
        raise ValueError(
            f"Cannot impute '{feature_to_impute}': no rows have both the target "
            f"and all of {list(features_available)} present."
        )

    model = LinearRegression()
    model.fit(data.loc[to_fit, features_available], data.loc[to_fit, feature_to_impute])

    # Boolean-mask assignment keeps predictions aligned with their rows even
    # when the index contains duplicate labels.
    data.loc[to_fill, feature_to_impute] = model.predict(
        data.loc[to_fill, features_available]
    )
    
def grid_search_training(model, param_grid, X_train, y_train,
                         cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1):
    """Run a cross-validated grid search and return the best refit estimator.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter names to lists of candidate values.
        X_train: Training feature matrix.
        y_train: Training targets.
        cv: Cross-validation setting forwarded to ``GridSearchCV`` (default 5).
        scoring: Metric used to rank candidates (default ``'f1_weighted'``).
        n_jobs: Parallel jobs forwarded to ``GridSearchCV`` (default -1).
        verbose: Verbosity level forwarded to ``GridSearchCV`` (default 1).

    Returns:
        The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        verbose=verbose,
        n_jobs=n_jobs,
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

In [52]:

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test_x.csv')

# Regression-based imputation. Order matters: the second call uses 'Energia'
# as a predictor, so it benefits from the values filled in by the first call.
imputar_valores(train_data, 'Energia', ['Acustica', 'Volumen'])
imputar_valores(train_data, 'Volumen', ['Acustica', 'Energia'])

# Simple imputation: most frequent value for 'Explicitud', column mean for the rest.
train_data["Explicitud"] = train_data["Explicitud"].fillna(train_data["Explicitud"].mode()[0])
train_data["Palabrería"] = train_data["Palabrería"].fillna(train_data["Palabrería"].mean())
train_data["Tempo"] = train_data["Tempo"].fillna(train_data["Tempo"].mean())

# Encode the genre feature; this fitted encoder is reused on the test set later.
label_encoder = LabelEncoder()
train_data['Genero_del_track'] = label_encoder.fit_transform(train_data['Genero_del_track'])

X = train_data.drop(columns=['ID_track', 'Album', 'Nombre_del_track', 'Artista'])
y = train_data['Artista']

# NOTE(review): the scaler is fit on every row before the split below, so the
# validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep the fitted target encoder so the id <-> artist mapping has a single
# source of truth; ``labels[i]`` is the artist name for class id ``i``.
artist_encoder = LabelEncoder()
y_encoded = artist_encoder.fit_transform(y)
labels = artist_encoder.classes_

X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

  train_data["Explicitud"] = train_data["Explicitud"].fillna(train_data["Explicitud"].mode()[0])


In [None]:
# Hyper-parameter grids explored by the grid search, one per supported model.
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'max_depth': [4, 6],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8]
}

param_grid_rf = {
    'n_estimators': [400],
}

# Which model to tune: 'xgb' or 'rf'.
model_choice = 'rf'

if model_choice == 'xgb':
    model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    param_grid = param_grid_xgb
elif model_choice == 'rf':
    model = RandomForestClassifier(random_state=42)
    param_grid = param_grid_rf
else:
    # Fail loudly: otherwise `model` / `param_grid` would be undefined on a
    # fresh kernel, or silently reuse stale values from an earlier run.
    raise ValueError(f"Unknown model_choice {model_choice!r}; expected 'xgb' or 'rf'")

best_model = grid_search_training(model, param_grid, X_train, y_train)


Fitting 5 folds for each of 24 candidates, totalling 120 fits




In [54]:
# Impute the test set with statistics learned from the training data, in line
# with the genre encoder and scaler below, which are also fitted on train only.
# train_data was already filled with these same values, so its mode/mean here
# equal the ones used during training.
test_data["Explicitud"] = test_data["Explicitud"].fillna(train_data["Explicitud"].mode()[0])
test_data["Palabrería"] = test_data["Palabrería"].fillna(train_data["Palabrería"].mean())
test_data["Tempo"] = test_data["Tempo"].fillna(train_data["Tempo"].mean())
test_data['Genero_del_track'] = label_encoder.transform(test_data['Genero_del_track'])

# NOTE(review): train_data had 'Energia' / 'Volumen' imputed via imputar_valores,
# but test_data does not — confirm the test set has no missing values there.
X_test = test_data.drop(columns=['ID_track', 'Album', 'Nombre_del_track'])
X_test_scaled = scaler.transform(X_test)


# Refit the selected estimator on the full training set before predicting.
best_model.fit(X_scaled, y_encoded)
test_predictions = best_model.predict(X_test_scaled)
# Map predicted class ids back to artist names.
test_predictions_decoded = LabelEncoder().fit(y).inverse_transform(test_predictions)

submission = pd.DataFrame({'Id': test_data['Id'], 'Artista': test_predictions_decoded})
submission.to_csv('submission.csv', index=False)

print("Predicciones guardadas en 'submission.csv'")

  test_data["Explicitud"] = test_data["Explicitud"].fillna(test_data["Explicitud"].mode()[0])


Predicciones guardadas en 'submission.csv'
