## Import Libraries

In [1]:
import os 
import random
import pandas as pd
import numpy as np
import ast
import datetime

import matplotlib.pyplot as plt

from functools import partial

from tqdm import tqdm 

tqdm.pandas()

In [2]:
!pip install tensorboard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [4]:
import tensorflow as tf
from tensorflow.keras.callbacks import LambdaCallback, EarlyStopping,ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import LSTM, GRU
from tensorflow.keras.optimizers import RMSprop, Adam

## Set up drive

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
DRIVEWORKSPACE_PATH = "/content/gdrive/Shareddrives/ING3 (2022-23) Mlamali/10 • Deep Learning/PROJET DL/AIRapFR - ProjetDL"
MODELS_DIR_PATH = f"{DRIVEWORKSPACE_PATH}/ai/models"
LSTM_MODELS_DIR_PATH = f"{MODELS_DIR_PATH}/lstm"

os.path.exists(MODELS_DIR_PATH)

False

In [7]:
import sys
sys.path.append(DRIVEWORKSPACE_PATH)

In [8]:
from utils import create_dir

## Data

In [9]:
from data.load_corpus import CorpusDataManager

corpus_dmng = CorpusDataManager()
corpus_dmng

<data.load_corpus.CorpusDataManager at 0x7f10df1d0a30>

### load 1 artist corpus

In [10]:
df_corpus_keryjames_preprocessed = corpus_dmng.get_df_lyrics_preprocessed_by_name("keryjames")
df_corpus_keryjames_preprocessed

Loading /content/gdrive/Shareddrives/ING3 (2022-23) Mlamali/10 • Deep Learning/PROJET DL/AIRapFR - ProjetDL/data/datasets/genius-1273-keryjames/df_lyrics_preprocessed_tok_crop_.csv


Unnamed: 0,artist,primary_artist.id,lyrics,id,title,release_date_components.year,artist_names,featured_artists,language
0,Kery James,1273,"['mesdames', ',', 'messieurs', ',', 'les', 'pa...",212896,Animalement vôtre,1999.0,"Kery James (Ft. Hamed Däye, Rocca & Shurik'n)","[{'api_path': '/artists/43765', 'header_image_...",fr
1,Kery James,1273,"['mes', 'intentions', 'sont', 'bonnes', 'mais'...",319537,Les frères ne savent pas,2000.0,Kery James,[],fr
2,Kery James,1273,"[""j'"", 'observe', 'ce', 'qui', 'se', 'passe', ...",422500,Ce “A” d’avilissant,2001.0,Kery James,[],fr
3,Kery James,1273,"['""', 'il', 'faut', 'cessez', 'le', 'feu', '!'...",3232542,Cessez le feu!,2001.0,Kery James,[],fr
4,Kery James,1273,"['la', 'plupart', 'de', 'mes', 'amis', 'sont',...",2438741,C’qui nous perd,2001.0,"Kery James (Ft. AP du 113, Demon One, Dry, Jes...","[{'api_path': '/artists/2406499', 'header_imag...",fr
...,...,...,...,...,...,...,...,...,...
153,Kery James,1273,"['we', 'should', 'clean', 'you', 'up', 'by', '...",2834826,Racailles English translation,,Kery James,[],en
154,Kery James,1273,"['les', 'rappeurs', 'racontent', 'des', 'histo...",102532,Réel {Remix},,Kery James (Ft. Leck & Sadek),"[{'api_path': '/artists/15237', 'header_image_...",fr
155,Kery James,1273,"['ils', 'parlent', 'de', 'nous', '\n', ""qu'"", ...",2238180,Thug Life - Live,,Kery James,[],fr
156,Kery James,1273,"['le', 'morceau', 'qui', 'vient', ',', 'il', '...",103141,Vent d’État (version live),,Kery James,[],fr


### load rap corpus

In [None]:
df_corpus_preprocessed = corpus_dmng.get_full_df_lyrics_corpus(preprocessed=True, only_french_artist=True)
df_corpus_preprocessed.head(10)

Loading 262 csv files


 95%|█████████▍| 248/262 [04:09<00:14,  1.02s/it]

In [None]:
df_corpus_preprocessed.info()

In [None]:
df_corpus_preprocessed["release_date_components.year"] = df_corpus_preprocessed["release_date_components.year"].where(df_corpus_preprocessed["release_date_components.year"] >= 1800, np.nan)

df_corpus_preprocessed["release_date_components.year"].describe()

In [None]:
df_corpus_preprocessed["release_date_components.year"].isna().sum()

In [None]:
# Sort by artist, credited artist names, album and year so that songs belonging
# to the same artist/album are adjacent before missing years are propagated.
df_corpus_preprocessed = df_corpus_preprocessed.sort_values(by=["artist","artist_names", "album.name","release_date_components.year"]).reset_index(drop=True)

# Fill a missing year from the preceding row of the *same artist* only (at most
# one consecutive gap). A plain column-wide forward fill would copy the year of
# one artist's last song onto the next artist's first song.
year_col = "release_date_components.year"
years_filled_within_artist = df_corpus_preprocessed.groupby("artist")[year_col].ffill(limit=1)
# fillna keeps every already-known year untouched and only plugs the gaps.
df_corpus_preprocessed[year_col] = df_corpus_preprocessed[year_col].fillna(years_filled_within_artist)

In [None]:
df_corpus_preprocessed["release_date_components.year"].plot.hist()
df_corpus_preprocessed["release_date_components.year"].describe()

In [None]:
max_year_filter = 2008
df_corpus_preprocessed = df_corpus_preprocessed[df_corpus_preprocessed["release_date_components.year"] < max_year_filter].reset_index(drop=True)
df_corpus_preprocessed.shape

### Format

In [None]:
type(df_corpus_preprocessed["lyrics"][0])

In [None]:
df_corpus_preprocessed["lyrics"] = df_corpus_preprocessed["lyrics"].progress_apply(ast.literal_eval)
type(df_corpus_preprocessed["lyrics"][0])

### Tokens

In [None]:
full_corpus_tokens = df_corpus_preprocessed["lyrics"].explode().tolist()

for i in [random.randint(0,len(full_corpus_tokens)-1) for _ in range(5)]:
    print(full_corpus_tokens[i:i+10])

In [None]:
print('Corpus length in characters:', sum([len(token) for token in full_corpus_tokens]))
print('Corpus length in words:', len(full_corpus_tokens))

#### Word Frequency

In [None]:
table_tokens_value_counts = df_corpus_preprocessed["lyrics"].explode().value_counts()
table_tokens_value_counts

#### Ignore Words

Ignorer les mots les moins fréquents dans le processus de création d'un LSTM pour générer des textes de rap peut améliorer la qualité des résultats en réduisant le bruit dans les données d'entraînement. Les mots les plus rares sont souvent des erreurs de frappe, des mots spécifiques à une langue ou des mots qui n'ont aucun sens dans le contexte du texte. En les éliminant, on peut se concentrer sur les mots plus significatifs et pertinents pour la tâche de génération de textes de rap, ce qui peut conduire à des modèles plus précis et à des générations de textes plus cohérentes et de meilleure qualité.

In [None]:
MIN_WORD_FREQUENCY=3 # 450

table_tokens_value_counts[table_tokens_value_counts < MIN_WORD_FREQUENCY]

In [None]:
ignored_words = set(table_tokens_value_counts[table_tokens_value_counts < MIN_WORD_FREQUENCY].index.tolist())
len(ignored_words)

In [None]:
words = set(full_corpus_tokens)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(words - ignored_words)
print('Unique words after ignoring:', len(words))

### Indexation

In [None]:
# Two-way vocabulary lookup: token -> integer id, and integer id -> token.
word_indices = {token: position for position, token in enumerate(words)}
indices_word = dict(enumerate(words))

In [None]:
# Slide a window of SEQUENCE_LEN + 1 tokens over the corpus: the first
# SEQUENCE_LEN tokens form the model input, the last one is the word to predict.
SEQUENCE_LEN = 10
STEP = 1  # each window is shifted by a single token relative to the previous one
sentences_dataset, next_words_dataset = [], []
ignored = 0
window_starts = range(0, len(full_corpus_tokens) - SEQUENCE_LEN, STEP)
for start in tqdm(window_starts):
    window = full_corpus_tokens[start: start + SEQUENCE_LEN + 1]
    # Keep the window only when neither its input nor its target is a rare word
    if ignored_words.isdisjoint(window):
        sentences_dataset.append(window[:SEQUENCE_LEN])
        next_words_dataset.append(window[SEQUENCE_LEN])
    else:
        ignored += 1

print()
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences_dataset))

In [None]:
len(sentences_dataset), len(next_words_dataset)

In [None]:
sentences_dataset[2], next_words_dataset[2]

### Padding

### Shuffle

In [None]:
def split_training_and_test_set(sentences_original, next_original, percentage_test=0.1,seed=123):
    """Shuffle sequences and their targets together, then split them in two.

    Args:
        sentences_original: list of input sequences.
        next_original: list of targets, aligned index-by-index with
            ``sentences_original``.
        percentage_test: fraction of the samples placed in the test set
            (the count is truncated to an integer).
        seed: value used to seed NumPy's global random generator before
            drawing the permutation.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` as plain Python lists.

    Note:
        Seeds NumPy's *global* RNG as a side effect and prints both set sizes.
    """
    total = len(sentences_original)

    # One permutation drives both lists so inputs and targets stay paired.
    np.random.seed(seed)
    order = np.random.permutation(total)

    # The first `n_test` shuffled samples go to the test set, the rest to training.
    n_test = int(total * percentage_test)
    test_ids, train_ids = order[:n_test], order[n_test:]

    x_test = [sentences_original[k] for k in test_ids]
    y_test = [next_original[k] for k in test_ids]
    x_train = [sentences_original[k] for k in train_ids]
    y_train = [next_original[k] for k in train_ids]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)

(sentences_train, next_words_train), (sentences_test, next_words_test) = split_training_and_test_set(sentences_dataset, next_words_dataset)  

sentences_train[1], next_words_train[1]

TODO : écrire une fonction (ou un script) qui génère automatiquement le train_set et le test_set

In [None]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the resulting distribution with NumPy's
    global random generator.

    Args:
        preds: 1-D array-like of non-negative scores (typically a softmax
            output); it does not need to sum to one.
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. ``0`` selects the most likely index deterministically.

    Returns:
        The sampled index (NumPy integer).

    Raises:
        ValueError: if ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be >= 0")

    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of temperature -> 0: greedy choice, avoids dividing by zero.
        return np.argmax(preds)

    # log(0) = -inf is the intended value for impossible words; silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0: otherwise, at low temperature, every
    # exponential underflows to 0 and the normalisation becomes 0/0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## Model Building


### ...

La fonction `data_generator` sert à générer les données d'entraînement du réseau de neurones LSTM. Elle prend en entrée la liste des séquences et celle des mots suivants, ainsi qu'une taille de batch, la longueur des séquences et le dictionnaire mot → indice, puis produit indéfiniment des couples (entrées, sorties) encodés en one-hot pour l'entraînement.

In [None]:
def data_generator(sentence_list, next_word_list, batch_size, sequence_length, words_dict):
    """Endlessly yield one-hot encoded ``(inputs, targets)`` batches.

    Args:
        sentence_list: list of token sequences (model inputs).
        next_word_list: list of target tokens, aligned with ``sentence_list``.
        batch_size: number of samples per yielded batch.
        sequence_length: size of the time axis of the input tensor.
        words_dict: mapping token -> integer id; its length is the vocabulary size.

    Yields:
        ``(x, y)`` boolean arrays of shape
        ``(batch_size, sequence_length, len(words_dict))`` and
        ``(batch_size, len(words_dict))``.

    The read position wraps around to the first sample once the end of
    ``sentence_list`` is reached, so the generator never terminates.
    """
    vocab_size = len(words_dict)
    n_samples = len(sentence_list)
    cursor = 0
    while True:
        inputs = np.zeros((batch_size, sequence_length, vocab_size), dtype=bool)
        targets = np.zeros((batch_size, vocab_size), dtype=bool)
        for row in range(batch_size):
            # One-hot encode each token of the current sequence at its time step
            for step, token in enumerate(sentence_list[cursor]):
                inputs[row, step, words_dict[token]] = True
            targets[row, words_dict[next_word_list[cursor]]] = True

            # Advance, restarting from the first sample after the last one
            cursor = (cursor + 1) % n_samples
        yield inputs, targets

partial_data_generator = partial(data_generator,sequence_length=SEQUENCE_LEN,words_dict=word_indices)

for x,y in partial_data_generator(sentences_train, next_words_train, batch_size=128):
    print(x.shape)
    print(y.shape)
    break

### Model 1 : LSTM

Building the LSTM Model

In [None]:
# Single-layer LSTM language model: a one-hot window of SEQUENCE_LEN words goes
# in, a softmax distribution over the whole vocabulary comes out.
model_lstm_1 = Sequential(name="model_lstm_1")
model_lstm_1.add(LSTM(128, input_shape=(SEQUENCE_LEN, len(words))))
model_lstm_1.add(Dense(len(words)))
model_lstm_1.add(Activation('softmax'))
# `learning_rate` is the supported keyword; the legacy `lr` alias is deprecated
# and rejected by recent Keras releases.
model_lstm_1.compile(loss='categorical_crossentropy', optimizer=RMSprop(learning_rate=0.01), metrics=['accuracy'])
model_lstm_1.summary()

In [None]:
word_indices['\n']

In [None]:
SEQUENCE_LEN

In [None]:
def generate_text(model, starting_words,window_size, words_indices, indices_words, max_length=50, diversity=1):
    """Generate a token sequence by repeatedly sampling the model's next word.

    Args:
        model: object whose ``predict(x, verbose=0)`` returns, for a one-hot
            input of shape ``(1, window_size, vocab_size)``, a batch of
            next-word probability vectors.
        starting_words: list of seed tokens; it is left unmodified.
        window_size: number of most recent tokens fed to the model.
        words_indices: mapping token -> integer id.
        indices_words: mapping integer id -> token.
        max_length: total length of the returned sequence, seed included.
        diversity: sampling temperature forwarded to ``sample``.

    Returns:
        A new list made of the seed tokens followed by the generated ones.

    Raises:
        AssertionError: if ``max_length`` is not greater than the seed length.
        KeyError: if a seed token is missing from ``words_indices``.
    """
    assert max_length > len(starting_words),"max_length must > len(starting_words)"
    length = max_length - len(starting_words)

    # Copy the seed so the caller's list is not extended in place.
    generated = list(starting_words)
    sentence = generated[-window_size:]
    # Size the one-hot axis from the mapping that was passed in, not from a global.
    vocab_size = len(words_indices)
    for _ in range(length):
        x_pred = np.zeros((1, window_size, vocab_size))
        for t, word in enumerate(sentence):
            x_pred[0, t, words_indices[word]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]

        next_index = sample(preds, diversity)
        next_word = indices_words[next_index]
        generated.append(next_word)
        # Append first, then truncate: slicing with -(window_size - 1) would keep
        # the whole list when window_size == 1 and let the window grow.
        sentence = (sentence + [next_word])[-window_size:]
    return generated

generate_text(model_lstm_1, ['je',"suis"],window_size=SEQUENCE_LEN, words_indices=word_indices, indices_words=indices_word, diversity=1.0, max_length=50)

In [None]:
def on_epoch_end(epoch, logs, model, sequence_len, words_indices, indices_words, max_diversity=1.0, max_length=10,epochs_looked = (0,25,50)):
    """Print sample generations at selected epochs (Keras ``on_epoch_end`` hook).

    Args:
        epoch: index of the epoch that just finished.
        logs: metrics dictionary supplied by Keras (unused).
        model: model used for generation.
        sequence_len: window size forwarded to ``generate_text``.
        words_indices: mapping token -> integer id.
        indices_words: mapping integer id -> token.
        max_diversity: upper bound of the three sampling temperatures, which
            are evenly spaced starting from 0.2.
        max_length: total length of each generated sequence.
        epochs_looked: epochs at which text is generated; nothing happens for
            any other epoch.
    """
    if epoch not in epochs_looked:
        return

    print(f'\n----- Generating text after Epoch: {epoch}')
    for diversity in np.linspace(0.2, max_diversity, 3):
        print(f'----- Diversity: {diversity}')
        # Use the mappings received as arguments rather than notebook globals,
        # so the callback stays correct when bound to another vocabulary.
        generated_text = generate_text(model, ['je',"suis"],window_size=sequence_len, words_indices=words_indices, indices_words=indices_words, diversity=diversity, max_length=max_length)

        print(' '.join(generated_text))

"""
def on_epoch_end_orig(epoch, logs):
    print()
    if(epoch > 45):
        print('----- Generating text after Epoch: %d\n' % epoch)
        for diversity in [0.2, 0.5, 1.0]:
            
            print('----- Diversity:', diversity, ' -----')
            generated = ['je','suis', 'né']
            sentence = generated
            for i in range(100):
                
                x_pred = np.zeros((1, SEQUENCE_LEN, len(words)))
                for t, word in enumerate(sentence):
                    x_pred[0, t, word_indices[word]] = 1.
        
                preds = model_lstm_1.predict(x_pred, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_word = indices_word[next_index]
                generated.append(next_word)
                sentence = sentence[1:] + [next_word]
                sys.stdout.write(next_word)
                sys.stdout.flush()
                
            print()

"""


#### Pre-training 

on fr rappers corpus

In [None]:
batch_size = 128
epochs = 50

##### checkpoint

In [None]:
# Location of this model's pre-training checkpoint on the shared drive.
weights_file = f"{LSTM_MODELS_DIR_PATH}/pre-trained/{model_lstm_1.name}.hdf5"
# Keep only the best model seen so far (full model, not just weights),
# judged on the validation loss.
checkpoint = ModelCheckpoint(
    weights_file,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
)

# Resume from an existing checkpoint when there is one.
if not os.path.exists(weights_file):
    print("Pas de poids précédents trouvés, entraînement à partir de zéro")
else:
    model_lstm_1.load_weights(weights_file)
    print("Chargé les poids précédents")
                                               

##### early stopping

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

##### print on epoch end

In [None]:
print_callback_orig = LambdaCallback(on_epoch_end=partial(on_epoch_end, model=model_lstm_1, sequence_len=SEQUENCE_LEN, words_indices=word_indices, indices_words=indices_word, max_diversity=1.0, max_length=50,epochs_looked = [0,5,10,25,50])
)    

##### tensorboard

In [None]:
!mkdir logs

In [None]:
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

In [None]:
%tensorboard --logdir logs

In [None]:
# One epoch is one pass over the *training* split. Counting steps on the full
# dataset (train + test) would replay part of the training data every epoch.
# -(-n // b) is ceiling division: a final partial batch is covered without
# adding an extra step when n is a multiple of the batch size.
train_steps = max(1, -(-len(sentences_train) // batch_size))
validation_steps = max(1, -(-len(sentences_test) // batch_size))

history_lstm_1 = model_lstm_1.fit(partial_data_generator(sentences_train, next_words_train, batch_size=batch_size),
                              steps_per_epoch=train_steps,
                              epochs=epochs,  # defined with batch_size in the pre-training settings cell
                              callbacks=[print_callback_orig,checkpoint,early_stopping,tensorboard_callback],
                              validation_data=partial_data_generator(sentences_test, next_words_test, batch_size=batch_size), 
                              validation_steps=validation_steps)


In [None]:
def plot_learning_curves(history, title=""):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    Args:
        history: object exposing a ``history`` mapping with the keys
            ``"accuracy"``, ``"loss"``, ``"val_accuracy"`` and ``"val_loss"``,
            each holding one value per epoch (e.g. the return value of
            ``model.fit``).
        title: figure-level title.
    """
    acc      = history.history["accuracy"]
    loss     = history.history["loss"]
    val_acc  = history.history["val_accuracy"]
    val_loss = history.history["val_loss"]
    epochs = range(len(acc))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5))

    fig.suptitle(title, fontsize="x-large")

    ax1.plot(epochs, acc, label="Entraînement")
    ax1.plot(epochs, val_acc, label="Validation")
    ax1.set_title("Accuracy - Données entraînement vs. validation.")
    # Keras reports accuracy as a fraction in [0, 1], not as a percentage.
    ax1.set_ylabel("Accuracy")
    ax1.set_xlabel("Epoch")
    ax1.legend()

    ax2.plot(epochs, loss, label="Entraînement")
    ax2.plot(epochs, val_loss, label="Validation")
    ax2.set_title("Perte - Données entraînement vs. validation.")
    ax2.set_ylabel('Perte')
    ax2.set_xlabel('Epoch')
    ax2.legend()

    # plt.show() works with the notebook's inline backend; Figure.show() is
    # meant for GUI backends and only emits a warning here.
    plt.show()

plot_learning_curves(history_lstm_1, title="Modèle LSTM (simple)")

##### save

In [None]:
weights_file = LSTM_MODELS_DIR_PATH + "/pre-trained/" + f"{model_lstm_1.name}.hdf5"
model_lstm_1.save(weights_file)
del model_lstm_1

##### load and testing

In [None]:
# The save cell deletes `model_lstm_1` right after writing it, so its `.name`
# attribute can no longer be read here (NameError). The file name is rebuilt
# from the literal name the model was constructed with.
weights_file = LSTM_MODELS_DIR_PATH + "/pre-trained/" + "model_lstm_1.hdf5"
model_lstm_1 = load_model(weights_file)

<keras.engine.sequential.Sequential at 0x7f36a6338700>

----- Generating text -----

----- Diversity: 0.2  -----

le rap vaincu baisser bouffe mieux pleines jungle pavé détenu succès innocence meurt permet mange rg perm tempête tieks vivra système aies knight prenne effacer kicker lopes salles étouffe rebeu comoco horizon décor reup enterré miami anciens clés gorge descendre dépensé pouvait chiennes soin souffre dégomme favori part si rôle dédicace testament

----- Diversity: 0.5  -----

le rap piles échelle réagit hi bingo décolle choisir verras renards mia geyser alerte péchés meurtres cramé monté rumeurs dise tension magie début sacré échappe impasse pouvoir original g.a.v z incarne feux baiser tendre payes vi ailleurs hosto arriver défoncer real hit solides foutre leçons valide ramadan rêve peureux appeler pince préfères

----- Diversity: 1.0  -----

le rap traquent shut rivaliser bus 24 { dépôt écrase mal-être conforme émissions ouverts profond tentations enfoiré sauter celles ha cjd fenêtres us affaires garçons bizz en inch' couille p

#### Fine-tuning
 on 1 artist corpus 

In [None]:
# NOTE(review): template cell. `model`, `artist_data_generator`, `artist_sentences`
# and `artist_sentences_test` are not defined anywhere in the visible notebook, and
# the weights path is a placeholder; all of them must be provided before running.

# Load the weights of the previously trained model
model.load_weights("path/to/pre-trained/weights.h5")

# Re-compile the model for fine-tuning, with a learning rate ten times smaller
# than the 0.01 used at pre-training. `learning_rate` replaces the deprecated
# `lr` keyword, which recent Keras releases reject.
model.compile(optimizer=RMSprop(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model on the data of the selected artist
# NOTE(review): validation_data is the same object as the training generator, so
# the validation metrics would not be measured on held-out data. TODO: pass a
# generator built from the artist's test split.
history = model.fit(artist_data_generator,
                    steps_per_epoch=int(len(artist_sentences)/batch_size) + 1,
                    epochs=50,
                    callbacks=[early_stopping],
                    validation_data=artist_data_generator,
                    validation_steps=int(len(artist_sentences_test)/batch_size) + 1)