In [None]:
import ast

import pandas as pd
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
# NOTE: the keras imports below must stay after the tensorflow.keras ones:
# they rebind Sequential, Dense and EarlyStopping, so reordering would change
# which implementation the rest of the notebook uses.
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.callbacks import EarlyStopping

from tabgan.sampler import GANGenerator, LLMGenerator

In [None]:
# Load the feature table from disk (path is relative to the notebook's working directory).
raw_data = pd.read_csv(filepath_or_buffer='../data/new_features.csv')

In [None]:
def mean(lst):
    """Return the arithmetic mean of ``lst``.

    Args:
        lst: A sized collection of numbers (anything supporting ``sum`` and ``len``).

    Returns:
        ``sum(lst) / len(lst)``, or ``float('nan')`` when ``lst`` is falsy
        (e.g. an empty list).
    """
    if not lst:
        return float('nan')
    return sum(lst) / len(lst)

In [None]:
# These columns are stored in the CSV as the text of a Python list literal.
# Parse each cell with ast.literal_eval (accepts literals only, so text coming
# from the file can never execute code the way eval() would) and then collapse
# the resulting list to a single per-row value with mean().
LIST_FEATURE_COLS = ['tempogram', 'poly_features', 'tonnetz']

for col in LIST_FEATURE_COLS:
    raw_data[col] = raw_data[col].apply(ast.literal_eval).apply(mean)

In [None]:
# Show the distinct values present in the label column.
print(pd.unique(raw_data['label']))

In [None]:
# Preview the first five rows of the frame.
raw_data.head(n=5)

In [None]:
# Every column currently present in raw_data, in frame order.
COLS_USED = raw_data.columns.tolist()
# Same list without the target column, i.e. the feature columns only.
COLS_TRAIN = [name for name in COLS_USED if name != 'label']

In [None]:
# Replace the label values with integer class codes. The fitted encoder is
# kept in `label_encoder` so the mapping stays available afterwards.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(raw_data['label'])
raw_data['label'] = label_encoder.transform(raw_data['label'])

In [None]:
# Show the distinct values in the label column after encoding.
print(pd.unique(raw_data['label']))

In [None]:
# Force the three aggregated features to a numeric dtype; anything that cannot
# be parsed becomes NaN (errors='coerce').
#
# The previous version then assigned np.mean(column) back to the column, which
# broadcasts ONE scalar to every row and turns each feature into a constant.
# Keep the per-row values instead and use the column mean only to fill the
# NaNs produced by the coercion.
for col in ('poly_features', 'tonnetz', 'tempogram'):
    numeric_col = pd.to_numeric(raw_data[col], errors='coerce')
    raw_data[col] = numeric_col.fillna(numeric_col.mean())

In [None]:
# Restrict the frame to the columns recorded in COLS_USED.
raw_data = raw_data[COLS_USED]

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
df_x_train, df_x_test, df_y_train, df_y_test = train_test_split(
    raw_data.drop(columns="label"),
    raw_data["label"],
    test_size=0.20,
    random_state=42,
)

# Only the test split gets a fresh 0..n-1 index; the training split keeps the
# row index it had in raw_data.
df_x_test = df_x_test.reset_index(drop=True)
df_y_test = df_y_test.reset_index(drop=True)

# Targets are wrapped in single-column DataFrames.
df_y_train = df_y_train.to_frame()
df_y_test = df_y_test.to_frame()

# Plain NumPy views of the four pieces, used for model training/evaluation.
x_train, y_train = df_x_train.to_numpy(), df_y_train.to_numpy()
x_test, y_test = df_x_test.to_numpy(), df_y_test.to_numpy()

In [None]:
# Small fully connected network (50 -> 25 -> 12 -> 1) trained as a regressor
# on the encoded label with mean squared error.
model = Sequential()
model.add(Input(shape=(x_train.shape[1],)))  # Use Input layer instead of input_dim
model.add(Dense(50, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Stop once val_loss has not improved by at least 1e-3 for 5 consecutive
# epochs, then roll back to the best weights seen.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3,
        patience=5, verbose=1, mode='auto',
        restore_best_weights=True)

# Validation data is carved out of the training set (last 20% of x_train)
# rather than taken from (x_test, y_test): early stopping with
# restore_best_weights selects the model on whatever it monitors, so
# monitoring the test split would leak it into model selection and make the
# RMSE reported on that split optimistic.
model.fit(x_train, y_train, validation_split=0.2,
          callbacks=[monitor], verbose=2, epochs=1000, batch_size=50)

In [None]:
# Score the fitted model on the held-out test split.
pred = model.predict(x_test)
# mean_squared_error expects (y_true, y_pred); the arguments used to be passed
# the other way round. The value is the same because squared error is
# symmetric, but the call now matches the documented signature.
score = np.sqrt(metrics.mean_squared_error(y_test, pred))
print(f"Final score (RMSE): {score}")

In [None]:
# Settings forwarded to GANGenerator as `adversarial_model_params`.
adversarial_model_params = {
    "metrics": "rmse",
    "max_depth": 6,          # adjusted depth
    "max_bin": 400,          # adjusted number of bins
    "learning_rate": 0.001,  # adjusted learning rate
    "random_state": 42,
    "n_estimators": 1000,    # adjusted number of estimators
    "verbosity": -1,
}

gan_generator = GANGenerator(
    gen_x_times=11.0,  # increased number of times the data is generated
    cat_cols=None,
    bot_filter_quantile=0.00000001,
    top_filter_quantile=0.99999999,
    is_post_process=True,
    adversarial_model_params=adversarial_model_params,
    pregeneration_frac=2,  # increased pre-generation fraction
    only_generated_data=False,
)

# Run the generation pipeline on the training split, with the test features
# passed as the third argument; returns features and target separately.
gen_x, gen_y = gan_generator.generate_data_pipe(
    df_x_train,
    df_y_train,
    df_x_test,
    deep_copy=True,
    only_adversarial=False,
    use_adversarial=True,
)

In [None]:
# Score the previously fitted model on the data returned by the generator.
pred = model.predict(gen_x.values)
# mean_squared_error expects (y_true, y_pred); the arguments used to be passed
# the other way round. The value is the same because squared error is
# symmetric, but the call now matches the documented signature.
score = np.sqrt(metrics.mean_squared_error(gen_y.values, pred))
print(f"Final score (RMSE): {score}")

In [None]:
# (rows, columns) of the feature frame returned by the generator.
gen_x.shape

In [None]:
# Display the generated feature frame as the cell output.
# NOTE(review): consider `gen_x.head()` here to keep the saved output small.
gen_x

In [None]:
# gen_x holds features only; the matching targets come back separately in
# gen_y. Concatenating gen_x directly onto raw_data (which has a 'label'
# column) would leave every generated row with a NaN label, so re-attach the
# target first. np.asarray(...).ravel() accepts gen_y as either a Series or a
# single-column DataFrame.
gen_data = gen_x.copy()
gen_data['label'] = np.asarray(gen_y).ravel()

# ignore_index gives the combined frame a clean 0..n-1 index instead of
# repeating index values from both inputs.
# NOTE(review): the generator was built with only_generated_data=False, so
# gen_x presumably already contains the original training rows; check the
# duplicate count before relying on this file - TODO confirm.
data_augmented = pd.concat([raw_data, gen_data], ignore_index=True)

In [None]:
# (rows, columns) of the combined original + generated frame.
data_augmented.shape

In [None]:
# Number of rows that are exact duplicates of an earlier row in the combined frame.
data_augmented.duplicated().sum()

In [None]:
# Persist the combined frame as CSV, without writing the row index.
data_augmented.to_csv(
    path_or_buf='../data/new_audio_data_genre_augmented.csv',
    index=False,
)