In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Dados e Aprendizagem Automática  
### Introdução

## SVM using GenAudio Dataset  

For this class, we will use a music dataset. The [GenAudio dataset](#) is composed of audio files and CSV files with features extracted from those audio files:

- **genres original** - collection of 10 genres with 100 audio files each, all having a length of 30 seconds.
- **CSV files** - extracted features of the audio files. One file contains, for each 30-second song, the mean and variance of multiple features that can be extracted from an audio file. The other file has the same structure, but each song was first split into 3-second clips (increasing tenfold the amount of data we feed into our classification models).

This dataset is frequently used for evaluation in machine listening research for Music Genre Recognition (MGR). The files were collected in 2000–2001 from a variety of sources, including personal CDs, radio, and microphone recordings, in order to represent a variety of recording conditions.

---

### Imports, installations and settings

In order to work with audio data, we will use **librosa**, a Python library used for audio and music analysis. It is a powerful package widely used for audio visualization and for building Music Information Retrieval (MIR) systems.

Install librosa:  


In [2]:
# Load the train and test radiomics tables from CSV files in the working directory.
hipp_train, hipp_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_radiomics_hipocamp.csv', 'test_radiomics_hipocamp.csv')
)

In [None]:
# Drop metadata columns and columns holding a single distinct value.
# The set of informative columns is chosen on the TRAINING data only and then
# applied to the test data, so both frames keep the same features in the same
# order. (Filtering each frame by its own nunique() can leave train and test
# with different columns, which breaks prediction later.)
colunas_remover = ['Image', 'diagnostics_Image-original_Hash', 'diagnostics_Mask-original_Hash',
                   'diagnostics_Mask-original_BoundingBox', 'diagnostics_Mask-original_CenterOfMassIndex',
                   'diagnostics_Mask-original_CenterOfMass', 'Mask']
train_reduced = hipp_train.drop(columns=colunas_remover, errors='ignore')
informative_cols = train_reduced.columns[(train_reduced.nunique() > 1).to_numpy()]

# .copy() yields independent frames, so later column assignments do not
# trigger pandas' SettingWithCopyWarning.
hipp_train_c = train_reduced.loc[:, informative_cols].copy()
# Columns present only in the training file are skipped for the test frame
# (presumably just the 'Transition' target -- confirm against the test CSV).
hipp_test_c = hipp_test.loc[:, [col for col in informative_cols if col in hipp_test.columns]].copy()

In [None]:
# Map the 'Transition' column to numeric codes
mapping = {
    'CN-CN': 0,  # Normal state
    'CN-MCI': 1,  # Intermediate state
    'MCI-MCI': 2,  # Intermediate state
    'MCI-AD': 3,  # Dementia
    'AD-AD': 4    # Dementia
}

# Encode from the raw frame (rows align on the index) instead of overwriting the
# column in place: re-running this cell then gives the same result, rather than
# mapping already-numeric codes to NaN and dropping every row.
# Rows whose label is not in `mapping` become NaN and are removed.
hipp_train_c = (
    hipp_train_c
    .assign(Transition=hipp_train['Transition'].map(mapping))
    .dropna(subset=['Transition'])
)

# Prepare the data for training
X_train = hipp_train_c.drop(columns=['Transition', 'ID'], errors='ignore')
# No NaN labels remain after dropna, so the codes can be stored as integers.
y_train = hipp_train_c['Transition'].astype(int)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# partition reproducible across runs.
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=2022,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest with a fixed seed so the results are reproducible.
pg_model = RandomForestClassifier(bootstrap=True, max_depth=20, random_state=2022)

# Fit on the training split only. Fitting on the full X_train and then scoring
# on X_test_split (a subset of it) would leak the validation rows into training.
pg_model.fit(X_train_split, y_train_split)

# Predict on the held-out validation split
predictions = pg_model.predict(X_test_split)

# Show the classification report. `labels` pins the report to every known class,
# so `target_names` always lines up even if a rare class is missing from the split.
print(classification_report(
    y_test_split,
    predictions,
    labels=list(mapping.values()),
    target_names=list(mapping.keys()),
    zero_division=0,
))

# Feature matrix for the final test set
X_test = hipp_test_c.drop(columns=['ID'], errors='ignore')

In [None]:
# Inverse dictionary to map the numeric codes back to the original labels.
# Derived from `mapping` (codes 0..4) so the two can never drift apart; a
# hand-written 1..5 table would shift every label by one and leave code 0 unmapped.
inverse_mapping = {code: label for label, code in mapping.items()}
predictions_mapped = pd.Series(pg_model.predict(X_test)).map(inverse_mapping)

# Fail loudly instead of writing a submission that contains empty labels.
if predictions_mapped.isna().any():
    raise ValueError("Some predicted codes have no label in inverse_mapping.")

# Build the DataFrame for the submission (RowId is 1-based)
submission_df = pd.DataFrame({
    'RowId': range(1, len(predictions_mapped) + 1),
    'Result': predictions_mapped
})


if len(submission_df) < 100:
    print("Aviso: O conjunto de teste contém menos de 100 entradas. Submissão terá apenas", len(submission_df), "previsões.")
else:
    print("Número total de previsões:", len(submission_df))

# Save the submission to a file
submission_df.to_csv('submission.csv', index=False)
print("Submissão salva com sucesso com exatamente", len(submission_df), "previsões.")