<a href="https://colab.research.google.com/github/KlaidasKaralevicius/NLP_lab2/blob/main/lab2_Klaidas_Karalevicius.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [1]:
import re
import numpy as np
import random
import keras
!pip install markovify -q
import markovify
!pip install num2words -q
from num2words import num2words
!pip install pronouncing -q
import pronouncing
from nltk.stem import PorterStemmer

keras.backend.clear_session()
keras.utils.set_random_seed(3645)
random.seed(3645)

# 1. Įkelti dainų tekstus

Pakeitimas nr.1, įkeliami savo duomenys susidarantys iš apie 70 dainų ir 124676 simbolių.

In [2]:
!wget https://raw.githubusercontent.com/KlaidasKaralevicius/NLP_lab2/refs/heads/main/Arknights.txt

artist_file = 'Arknights.txt'
with open(artist_file, 'r') as f:
  lyrics = f.read()

# 2. Duomenų normalizavimas

Normalizuojant duomenis papildomai išfiltruojami elementai tarp "[ ]" kartu su šiais skliausteliais. Išfiltruojami tokie elementai kaip: [x2], [Chorus], [Verse], [Pre-chorus]...

In [3]:
def normalize_word_line(line: str) -> list[str]:
  line = re.sub(r"\[.*?\]", "", line)
  row = [x.lower() for x in re.findall(r"\w+'?\w*", line)]
  new_row = []
  for word in row:
    numbers = re.findall(r"\d+", word)
    for n in numbers:
      word = word.replace(n, num2words(int(n)))
    new_row.append(word)
  return new_row

# 3. Markovo grandinės

In [4]:
markov_model = markovify.NewlineText(lyrics)

# 4. Ritmo ištraukimas

In [5]:
def n_syllables(word_line: list[str]):
    vowels = 'aeiouy'
    syllable_count = 0

    for word in word_line:
        for i, char in enumerate(word):
            if char in vowels:
                if (i == 0) or (word[i-1] not in vowels):
                    syllable_count += 1
        word_vowels_count = sum([x in vowels for x in word])
        if word_vowels_count == 0:
            syllable_count = len(word)
        elif word_vowels_count > 1 and (word[-1] == 'e') and (word[-2] not in vowels):
            syllable_count -= 1

    return syllable_count

In [6]:
def get_rhyme(line: list[str]) -> str:
    last_word = re.sub('\W+', '', line[-1])
    all_rhymes = pronouncing.rhymes(last_word)
    if all_rhymes:
        rhyming_ends = [x[-2:] for x in all_rhymes]
        most_common_rhyme = max(set(rhyming_ends), key = rhyming_ends.count)
    else:
        most_common_rhyme = last_word[-2:]
    return most_common_rhyme


def get_rhyme_list(normalized_lyrics: list[list[str]]):
  rhyme_set = set()
  for row in normalized_lyrics:
    most_common_rhyme = get_rhyme(row)
    rhyme_set.add(most_common_rhyme)

  sorted_rhyme_set = sorted(list(rhyme_set), key = lambda x: x[-1])
  return sorted_rhyme_set

lyrics = [normalize_word_line(x) for x in lyrics.splitlines()]
lyrics = [x for x in lyrics if x]

Pakeitimas nr.2, naudojant NLTK biblioteką mokymo tekstui panaudojama stemming - randamos žodžių šaknys, todėl panašūs žodžiai (pavyzždiui bėga, bėgo, bėgti) skaitomi kaip tas pats žodis.

Papildomai randama maksimalus skiemenų kiekis, kad get_line_features skiemenų kiekis būtų ne integer, o float nuo 0 iki 1.

In [7]:
max_syll = max(n_syllables(line) for line in lyrics)
stemmer = PorterStemmer()
lyrics = [[stemmer.stem(word) for word in lyrics] for lyrics in lyrics]
rhymes = get_rhyme_list(lyrics)

# 5. Duomenų rinkinio paruošimas

In [8]:
def get_rhyme_float(line: list[str], rhyme_list: list[str]) -> float | None:
  rhyme = get_rhyme(line)
  if rhyme in rhyme_list:
    return rhyme_list.index(rhyme) / len(rhyme_list)
  else:
    return None

In [9]:
def get_random_lines(markov_model, n_rows: int) -> list[list[str]]:
  lines = []
  last_words = []

  while len(lines) < n_rows:
    line = markov_model.make_sentence(max_overlap_ratio = .49, tries = 100)
    if (line is not None) and (line not in lines):
      last_word = normalize_word_line(line)[-1]
      if last_words.count(last_word) < 3:
        lines.append(normalize_word_line(line))
        last_words.append(last_word)

  return lines

def get_line_features(line: list[str], rhyme_list: list[str]) -> tuple:
  return (line, n_syllables(line)/max_syll, get_rhyme_float(line, rhyme_list))

In [10]:
def build_dataset(lines: list[list[str]], rhyme_list: list[str]):
	features = [get_line_features(x, rhyme_list) for x in lines]
	x_data, y_data = [], []

	for i in range(len(features) - 3):
		line1, line2 = features[i    ][1:], features[i + 1][1:]
		line3, line4 = features[i + 2][1:], features[i + 3][1:]
		x_data.append(np.array([line1, line2]))
		y_data.append(np.array([line3, line4]))
	return np.array(x_data), np.array(y_data)

dataset = build_dataset(lyrics, rhymes)

# 6. RNN modelio inicializavimas

Pakeitimas nr.3, LTSM sluoksniai pakeičiami į GRU, pridedami dropout sluokniai ir pakeičiami neuronų kiekiai.

In [12]:
def create_gru(depth: int):

  model = keras.Sequential(name = 'GRU-based_lyrics_generator')
  model.add(keras.layers.Input((2, 2)))
  model.add(keras.layers.GRU(6, return_sequences = True))
  model.add(keras.layers.Dropout(0.3))
  for i in range(depth):
    model.add(keras.layers.GRU(8, return_sequences = True))
    model.add(keras.layers.Dropout(0.5))
  model.add(keras.layers.GRU(2, return_sequences = True))

  model.compile(optimizer = keras.optimizers.RMSprop(learning_rate = 0.001),
                loss = 'mse')

  return model

model = create_gru(depth = 1)
model.summary()

In [13]:
def compose(starting_input: np.ndarray, rnn_model, n_line_groups: int):
	prev_vectors = starting_input
	final_vectors = []
	for i in range(n_line_groups):
		final_vectors.append(rnn_model.predict(prev_vectors).flatten().reshape(1, 2, 2))
		prev_vectors = final_vectors[-1]
	return final_vectors


def last_word_compare(prev_lines: list[list[str]], new_line: list[str], penalty: float = 0.2) -> float:
	sum_penalty = 0.0
	for line in prev_lines:
		if line[-1] == new_line[-1]:
			sum_penalty += penalty
	return sum_penalty


def calculate_score(features, n_syllables, rhyme, penalty: float, rhyme_list, maxsyllables):
	if maxsyllables > max_syll:
		maxsyllables = max_syll
	desired_n_syllables = features[0] * (maxsyllables / max_syll)
	desired_rhyme = features[1] * len(rhyme_list)
	if rhyme is None:
		rhyme = 0.0
	syllable_score = - abs(float(desired_n_syllables) - float(n_syllables))
	rhyme_score = abs(float(desired_rhyme) - float(rhyme))
	score = 1.0 + syllable_score + rhyme_score - penalty
	return score


def vectors_into_song(vectors, generated_lyrics, rhyme_list, maxsyllables: int):
	song = []
	generated_features = [get_line_features(x, rhyme_list) for x in generated_lyrics]

	vector_halves = []
	for vector in vectors:
		vector_halves.extend(vector[0].tolist())

	for vector in vector_halves:
		scorelist = []

		for (line, n_syllables, rhyme) in generated_features:
			if len(song) != 0:
				penalty = last_word_compare(song, line)
			else:
				penalty = 0

			total_score = calculate_score(vector, n_syllables, rhyme, penalty, rhyme_list, maxsyllables)
			scorelist.append([line, total_score])

		best_line_index = np.argmax([float(x[1]) for x in scorelist])
		best_line = scorelist[best_line_index][0]
		song.append(best_line)

		generated_features = [x for x in generated_features if x[0] != best_line]

	return [' '.join(x) for x in song]

# 7. Modelio apmokymas

In [14]:
x_data, y_data = dataset

model.fit(
    x_data, y_data,
    batch_size = 2,
    epochs = 4
)

Epoch 1/4
[1m1989/1989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 5ms/step - loss: 0.0798
Epoch 2/4
[1m1989/1989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 0.0515
Epoch 3/4
[1m1989/1989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - loss: 0.0508
Epoch 4/4
[1m1989/1989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - loss: 0.0504


<keras.src.callbacks.history.History at 0x79d951dad6f0>

# 8. Modelio testavimas

In [18]:
start_i = np.random.choice(range(len(x_data)))
start = np.array([x_data[start_i]])
vectors = compose(start, model, 6)
some_lyrics = get_random_lines(markov_model, 80)
vectors_into_song(vectors, some_lyrics, rhymes, maxsyllables = 8)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


['let your heart on my soul',
 "you won't be in this silence",
 'even if the sky on a tight rope',
 "was to find a way i'm still upright",
 "now that it's not so nice to me",
 "i'm one with the flick of the night i'll find",
 'you would choose to face with the high price',
 'constellations come to be in vain',
 "won their respect back in where i'm born to lead",
 "cause i've laid it out in the depths of my code",
 "i'm the one i'm here and i'm turning",
 'voices from the day that i could and i promise you sunrise']