In [None]:
!pip install tensorflow # pour ceux qui l'ont pas 

In [None]:
import numpy as np
import pandas as pd
import ast
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw tables. The three per-track files are tab-separated and their
# `track_id` column is read as text; the genre table is a plain CSV.
_tsv_kwargs = {"sep": "\t", "dtype": {"track_id": str}}

tracks = pd.read_csv("tracks.tsv", **_tsv_kwargs)
echonest = pd.read_csv("echonest_features.tsv", **_tsv_kwargs)
spectral = pd.read_csv("spectral_features.tsv", **_tsv_kwargs)
genres = pd.read_csv("genres.csv")


In [None]:
# Show the column index of every loaded table.
for _table_name, _table in (
    ("tracks", tracks),
    ("echonest", echonest),
    ("spectral", spectral),
    ("genres", genres),
):
    print(f"{_table_name}_columns :", _table.columns)

tracks_columns : Index(['track_id', 'album_title', 'album_tracks', 'artist_latitude',
       'artist_longitude', 'artist_name', 'duration', 'favorites', 'genre_top',
       'genres', 'genres_all', 'interest', 'listens', 'title'],
      dtype='object')
echonest_columns : Index(['track_id', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'speechiness', 'tempo', 'valence'],
      dtype='object')
spectral_columns : Index(['track_id', 'spectral_bandwidth_kurtosis_01',
       'spectral_bandwidth_max_01', 'spectral_bandwidth_mean_01',
       'spectral_bandwidth_median_01', 'spectral_bandwidth_min_01',
       'spectral_bandwidth_skew_01', 'spectral_bandwidth_std_01',
       'spectral_centroid_kurtosis_01', 'spectral_centroid_max_01',
       'spectral_centroid_mean_01', 'spectral_centroid_median_01',
       'spectral_centroid_min_01', 'spectral_centroid_skew_01',
       'spectral_centroid_std_01', 'spectral_rolloff_kurtosis_01',
       'spectral_rolloff_max_01',

In [None]:
def clean_cols(df):
    """Return ``df`` with normalised column names.

    Every label is converted to text, lower-cased, stripped of surrounding
    whitespace, has spaces and hyphens replaced by underscores, and finally
    loses any character that is not an ASCII letter, a digit or an underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels should be normalised.

    Returns
    -------
    pandas.DataFrame
        A new frame carrying the cleaned labels. The frame passed in keeps
        its original labels, so re-running a cell never sees a half-cleaned
        input.
    """
    cleaned = (df.columns
                 .astype(str)  # non-string labels (e.g. ints) have no .str accessor
                 .str.lower()
                 .str.strip()
                 # literal replacements: do not depend on the pandas-version
                 # specific default of the `regex` argument
                 .str.replace(" ", "_", regex=False)
                 .str.replace("-", "_", regex=False)
                 .str.replace(r"[^a-zA-Z0-9_]", "", regex=True))
    return df.set_axis(cleaned, axis=1)
        
# Normalise the column names of every loaded table.
tracks, echonest, spectral, genres = (
    clean_cols(table) for table in (tracks, echonest, spectral, genres)
)


In [21]:
# Left-join the Echonest and spectral features onto the track metadata:
# every row of `tracks` is kept, and feature columns are NaN where a track
# has no match in the joined table.
df = (
    tracks
    .merge(echonest, how="left", on="track_id")
    .merge(spectral, how="left", on="track_id")
)


In [None]:
#df.to_csv("tracks_merged.csv", index=False) # Les 3 fichiers merged 

In [None]:
# Build the genre lookup used to create the genre-name columns

genres_df = pd.read_csv("genres.csv").rename(
    columns={"genre_id": "id", "genre_title": "name"}
)

# dictionary: id -> name
id_to_name = dict(zip(genres_df["id"], genres_df["name"]))




In [14]:
# Check how the first `genres` entry is stored after loading.
type(df["genres"].loc[0])


str

In [None]:
def _parse_id_list(value):
    """Turn a stringified Python list literal into a list.

    Values that are already lists are returned unchanged, which makes this
    cell safe to re-run: calling ``ast.literal_eval`` on a list raises.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


# `genres` and `genres_all` are stored as strings holding Python list
# literals; convert them to real lists.
df["genres"] = df["genres"].apply(_parse_id_list)
df["genres_all"] = df["genres_all"].apply(_parse_id_list)


In [16]:
def ids_to_names(id_list):
    """Map each genre id in ``id_list`` to its name via ``id_to_name``.

    Ids missing from the lookup are reported as ``"UNKNOWN"``. The result is
    a new list with one name per input id, in the same order.
    """
    names = []
    for genre_id in id_list:
        names.append(id_to_name.get(genre_id, "UNKNOWN"))
    return names


In [17]:
# Add human-readable counterparts of the two genre-id list columns.
genres_names = df["genres"].apply(ids_to_names)
genres_all_names = df["genres_all"].apply(ids_to_names)
df["genres_names"] = genres_names
df["genres_all_names"] = genres_all_names


In [18]:
# Preview the top genre next to the decoded genre-name lists (first rows only).
df.head()[["genre_top", "genres_names", "genres_all_names"]]


Unnamed: 0,genre_top,genres_names,genres_all_names
0,Hip-Hop,[Hip-Hop],[Hip-Hop]
1,Hip-Hop,[Hip-Hop],[Hip-Hop]
2,Hip-Hop,[Hip-Hop],[Hip-Hop]
3,Hip-Hop,[Hip-Hop],[Hip-Hop]
4,Hip-Hop,[Hip-Hop],[Hip-Hop]


In [19]:
df.to_csv("genres_named.csv", index=False) # the merged df, now including the genre-name columns

# Task 1 - Predict the original genre
## A multi-class classification problem. Try to reach the best performance level and also explain possible issues.


In [None]:
# Only tracks with a known top-level genre can be used for supervised
# learning; a missing target must not be encoded as a class of its own.
labeled = df.dropna(subset=["genre_top"])

# Keep the fitted encoder so that integer predictions can be mapped back to
# genre names later (label_encoder.classes_ / inverse_transform).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labeled["genre_top"])

X = labeled.drop(columns=["genre_top", "track_id"])
X = X.select_dtypes(include=[np.number])  # numeric features only
X = X.dropna(axis=1, how="all")           # a column without any value cannot be imputed

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# The left merges can leave NaN in the feature columns, and several of the
# estimators used below reject NaN. Impute with medians computed on the
# training split only, so that nothing leaks from the test set.
# NOTE(review): median imputation is a simple baseline; confirm it suits the data.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Standardise using statistics of the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

"""
On peut utiliser:
Logistic Regression (multi-class)
Random Forest
Gradient Boosting
XGBoost / LightGBM
SVM
kNN
Decision trees
Neural Network
Deep Learning
"""
# Logistic Regression
# With the lbfgs solver a multi-class target is already fitted with the
# multinomial (softmax) loss by default, so the explicit `multi_class`
# argument, deprecated in recent scikit-learn releases, is not passed.
model = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    class_weight="balanced"   # useful when classes are imbalanced
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(confusion_matrix(y_test, y_pred))

# Random Forest
rf = RandomForestClassifier(
    n_estimators=300,          # number of trees
    max_depth=None,            # tree depth (None = unlimited)
    class_weight="balanced",   # useful when classes are imbalanced
    n_jobs=-1,                 # use all CPU cores
    random_state=42,
).fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Feature importances, labelled with the feature names; show the top 20.
importances = pd.Series(rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(20))

# Gradient boosting
gb = GradientBoostingClassifier(
    n_estimators=200,     # number of trees
    max_depth=3,          # depth of the base trees
    learning_rate=0.1,    # learning rate
    random_state=42,
).fit(X_train, y_train)
y_pred = gb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# XGBoost
xgb = XGBClassifier(
    # multi-class objective and the number of classes seen in training
    objective="multi:softmax",
    num_class=len(np.unique(y_train)),
    eval_metric="mlogloss",
    # boosting schedule and tree size
    n_estimators=300, learning_rate=0.1, max_depth=6,
    # row / column subsampling per tree
    subsample=0.8, colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


# LightGBM
lgb = LGBMClassifier(
    objective="multiclass",
    num_class=len(np.unique(y_train)),   # number of classes seen in training
    n_estimators=300, learning_rate=0.1,
    max_depth=-1, num_leaves=64,
    class_weight="balanced",
    random_state=42,
)
lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# SVM
svm = SVC(
    kernel="rbf",             # non-linear kernel
    gamma="scale",            # kernel coefficient
    C=1.0,                    # regularisation strength
    class_weight="balanced",
).fit(X_train, y_train)
y_pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# kNN

knn = KNeighborsClassifier(
    n_neighbors=10,     # number of neighbours
    weights="distance", # closer neighbours weigh more
    metric="minkowski"  # Euclidean distance by default
)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Compare several values of k on a validation split carved out of the
# training data. Choosing k from test-set accuracy would leak the test set
# into model selection and make the reported test score optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
scores = []
for k in [3, 5, 7, 10, 15, 20]:
    # separate name so the fitted `knn` model above is not overwritten
    knn_k = KNeighborsClassifier(n_neighbors=k, weights="distance")
    knn_k.fit(X_fit, y_fit)
    scores.append((k, accuracy_score(y_val, knn_k.predict(X_val))))

print(scores)  # (k, validation accuracy) pairs


# Decision trees

dt = DecisionTreeClassifier(
    criterion="gini",          # or "entropy"
    max_depth=None,            # or an integer to limit the depth
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight="balanced",   # useful when classes are imbalanced
    random_state=42,
).fit(X_train, y_train)
y_pred = dt.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Feature importances, labelled with the feature names; show the top 20.
importances = pd.Series(dt.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(20))

# Neural Network

mlp = MLPClassifier(
    hidden_layer_sizes=(128, 64),  # two hidden layers
    activation="relu",
    solver="adam",
    max_iter=300,
    batch_size=64,
    random_state=42,
).fit(X_train, y_train)

y_pred = mlp.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Deep Learning
# Seed Python, NumPy and TensorFlow so that weight initialisation, dropout
# and batch shuffling are repeatable, in line with the random_state=42 used
# for the other models in this notebook.
tf.keras.utils.set_random_seed(42)

n_features = X_train.shape[1]
n_classes = len(np.unique(y_train))

# Two hidden ReLU layers with dropout, softmax output over the classes.
model = models.Sequential([
    layers.Input(shape=(n_features,)),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(n_classes, activation="softmax")
])
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # targets are integer class ids
    metrics=["accuracy"]
)
# Stop once the validation loss has not improved for 10 epochs and roll back
# to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
    monitor="val_loss"
)

history = model.fit(
    X_train, y_train,
    validation_split=0.2,   # 20% of the training data is used for validation
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test accuracy (deep learning):", test_acc)

# Class probabilities -> most likely class id
y_proba = model.predict(X_test)
y_pred = y_proba.argmax(axis=1)

print(classification_report(y_test, y_pred))
