<a href="https://colab.research.google.com/github/KlaidasKaralevicius/NLP_lab2/blob/main/lab2_Klaidas_Karalevicius.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [1]:
import os, re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import random
from google.colab import files
import keras
!pip install markovify -q
import markovify
!pip install num2words -q
from num2words import num2words
!pip install pronouncing -q
import pronouncing

keras.backend.clear_session()
keras.utils.set_random_seed(226)
random.seed(226)

# 1. Įkelti dainų tekstus

Pakeitimas nr.1, įkeliami savo duomenys susidarantys iš apie 70 dainų ir 124676 simbolių.

In [14]:
#!wget https://raw.githubusercontent.com/KlaidasKaralevicius/NLP_lab2/refs/heads/main/Arknights.txt

artist_file = 'Arknights.txt'
with open(artist_file, 'r') as f:
  lyrics = f.read()

# 2. Duomenų normalizavimas

Normalizuojant duomenis papildomai išfiltruojami elementai tarp "[ ]" kartu su šiais skliausteliais. Išfiltruojami tokie elementai kaip: [x2], [Chorus], [Verse], [Pre-chorus]...

In [15]:
def normalize_word_line(line: str) -> list[str]:
  line = re.sub(r"\[.*?\]", "", line)
  row = [x.lower() for x in re.findall(r"\w+'?\w*", line)]
  new_row = []
  for word in row:
    numbers = re.findall(r"\d+", word)
    for n in numbers:
      word = word.replace(n, num2words(int(n)))
    new_row.append(word)
  return new_row

# 3. Duomenų apdorojimas (word-embedding)



# 3. Markovo grandinės

In [16]:
markov_model = markovify.NewlineText(lyrics)

# 4. Ritmo ištraukimas

In [17]:
def n_syllables(word_line: list[str]):
    vowels = 'aeiouy'
    syllable_count = 0

    for word in word_line:
        for i, char in enumerate(word):
            if char in vowels:
                if (i == 0) or (word[i-1] not in vowels):
                    syllable_count += 1
        word_vowels_count = sum([x in vowels for x in word])
        if word_vowels_count == 0:
            syllable_count = len(word)
        elif word_vowels_count > 1 and (word[-1] == 'e') and (word[-2] not in vowels):
            syllable_count -= 1

    return syllable_count

In [18]:
def get_rhyme(line: list[str]) -> str:
    last_word = re.sub('\W+', '', line[-1])
    all_rhymes = pronouncing.rhymes(last_word)
    if all_rhymes:
        rhyming_ends = [x[-2:] for x in all_rhymes]
        most_common_rhyme = max(set(rhyming_ends), key = rhyming_ends.count)
    else:
        most_common_rhyme = last_word[-2:]
    return most_common_rhyme


def get_rhyme_list(normalized_lyrics: list[list[str]]):
  rhyme_set = set()
  for row in normalized_lyrics:
    most_common_rhyme = get_rhyme(row)
    rhyme_set.add(most_common_rhyme)

  sorted_rhyme_set = sorted(list(rhyme_set), key = lambda x: x[-1])
  return sorted_rhyme_set

lyrics = [normalize_word_line(x) for x in lyrics.splitlines()]
lyrics = [x for x in lyrics if x]
rhymes = get_rhyme_list(lyrics)

# 5. Duomenų rinkinio paruošimas

In [19]:
def get_rhyme_float(line: list[str], rhyme_list: list[str]) -> float | None:
  rhyme = get_rhyme(line)
  if rhyme in rhyme_list:
    return rhyme_list.index(rhyme) / len(rhyme_list)
  else:
    return None

In [20]:
def get_random_lines(markov_model, n_rows: int) -> list[list[str]]:
  lines = []
  last_words = []

  while len(lines) < n_rows:
    line = markov_model.make_sentence(max_overlap_ratio = .49, tries = 100)
    if (line is not None) and (line not in lines):
      last_word = normalize_word_line(line)[-1]
      if last_words.count(last_word) < 3:
        lines.append(normalize_word_line(line))
        last_words.append(last_word)

  return lines

def get_line_features(line: list[str], rhyme_list: list[str]) -> tuple:
  return (line, n_syllables(line), get_rhyme_float(line, rhyme_list))

In [21]:
# test_lines = get_random_lines(markov_model, 2)
# for line in test_lines:
#   print(get_line_features(line, rhymes))

(['but', 'i', 'get', 'to', 'know', 'everything'], 9, 0.2925170068027211)
(['not', 'slowing', 'down', "won't", 'stop', 'the', 'search', 'for', 'the', 'thrill', 'of', 'it'], 13, 0.8163265306122449)


In [22]:
def build_dataset(lines: list[list[str]], rhyme_list: list[str]):
	features = [get_line_features(x, rhyme_list) for x in lines]
	x_data, y_data = [], []

	for i in range(len(features) - 3):
		line1, line2 = features[i    ][1:], features[i + 1][1:]
		line3, line4 = features[i + 2][1:], features[i + 3][1:]
		x_data.append(np.array([line1, line2]))
		y_data.append(np.array([line3, line4]))
	return np.array(x_data), np.array(y_data)

dataset = build_dataset(lyrics, rhymes)

In [23]:
# len(dataset[0]), len(dataset[1])
# dataset[0][0], dataset[1][0]

(array([[5.        , 0.17006803],
        [7.        , 0.17006803]]),
 array([[ 5.        ,  0.15646259],
        [11.        ,  0.23809524]]))

## 6. RNN modelio inicializavimas

Pakeitimas nr.3, LTSM sluoksniai pakeičiami į GRU, pridedami dropout sluokniai ir pakeičiami neuronų kiekiai.

In [24]:
def create_gru(depth: int):

  model = keras.Sequential(name = 'GRU-based_lyrics_generator')
  model.add(keras.layers.Input((2, 2)))
  model.add(keras.layers.GRU(6, return_sequences = True))
  model.add(keras.layers.Dropout(0.3))
  for i in range(depth):
    model.add(keras.layers.GRU(8, return_sequences = True))
    model.add(keras.layers.Dropout(0.5))
  model.add(keras.layers.GRU(2, return_sequences = True))

  model.compile(optimizer = keras.optimizers.RMSprop(learning_rate = 0.001),
                loss = 'mse')

  return model

model = create_gru(depth = 1)
model.summary()

In [25]:
def compose(starting_input: np.ndarray, rnn_model, n_line_groups: int):
	prev_vectors = starting_input
	final_vectors = []
	for i in range(n_line_groups):
		final_vectors.append(rnn_model.predict(prev_vectors).flatten().reshape(1, 2, 2))
		prev_vectors = final_vectors[-1]
	return final_vectors


def last_word_compare(prev_lines: list[list[str]], new_line: list[str], penalty: float = 0.2) -> float:
	sum_penalty = 0.0
	for line in prev_lines:
		if line[-1] == new_line[-1]:
			sum_penalty += penalty
	return sum_penalty


def calculate_score(features, n_syllables, rhyme, penalty: float, rhyme_list, maxsyllables):
	desired_n_syllables = features[0] * maxsyllables
	desired_rhyme = features[1] * len(rhyme_list)
	syllable_score = - abs(float(desired_n_syllables) - float(n_syllables))
	rhyme_score = abs(float(desired_rhyme) - float(rhyme))
	score = 1.0 + syllable_score + rhyme_score - penalty
	return score


def vectors_into_song(vectors, generated_lyrics, rhyme_list, maxsyllables: int):
	song = []
	generated_features = [get_line_features(x, rhyme_list) for x in generated_lyrics]

	vector_halves = []
	for vector in vectors:
		vector_halves.extend(vector[0].tolist())

	for vector in vector_halves:
		scorelist = []

		for (line, n_syllables, rhyme) in generated_features:
			if len(song) != 0:
				penalty = last_word_compare(song, line)
			else:
				penalty = 0

			total_score = calculate_score(vector, n_syllables, rhyme, penalty, rhyme_list, maxsyllables)
			scorelist.append([line, total_score])

		best_line_index = np.argmax([float(x[1]) for x in scorelist])
		best_line = scorelist[best_line_index][0]
		song.append(best_line)

		generated_features = [x for x in generated_features if x[0] != best_line]

	return [' '.join(x) for x in song]

In [26]:
# start = np.array([dataset[0][0]])
# vectors = compose(start, model, 4)
# some_lyrics = get_random_lines(markov_model, 20)
# vectors_into_song(vectors, some_lyrics, rhymes, maxsyllables=12)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 728ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


["don't think for a new day is mine",
 "run run run you're stuck in the grey",
 'and through your eyes and heal the damage',
 'i wanna be in your heart of the night',
 'oh did you see the fear in your heart speak up',
 "but i can't let it go let it dance again",
 "it's reflected in your heart in a second",
 'you were in the shallow we are all we need']

## 7. Modelio apmokymas

In [27]:
x_data, y_data = dataset

model.fit(
    x_data, y_data,
    batch_size = 2,
    epochs = 4
)

Epoch 1/4
[1m1989/1989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - loss: 28.2541
Epoch 2/4
[1m1989/1989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - loss: 26.3490
Epoch 3/4
[1m1989/1989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 26.3424
Epoch 4/4
[1m1989/1989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 26.3411


<keras.src.callbacks.history.History at 0x7fbeb18d6ad0>

# 8. Modelio testavimas

In [28]:
start_i = np.random.choice(range(len(x_data)))
start = np.array([x_data[start_i]])
vectors = compose(start, model, 6)
some_lyrics = get_random_lines(markov_model, 40)
vectors_into_song(vectors, some_lyrics, rhymes, maxsyllables = 8)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step


["break through the fire that's the worst plan that you speak of",
 'victoria for you to get it over too soon',
 'to keep you safe and hard to be on the beach now',
 'but i just wanna look at this world feels so unknown',
 "there's no point trying to find something to stand tall",
 'for the ones who will be the one who will rise up',
 'i wake up but the time is now or never',
 'you thought that i was full of hot air balloons',
 'go ahead and try to see the light of your era era',
 "see through a mirror feel the pain can't stop thinking about what if",
 'this could be the heat in your view',
 'color me and my own path']