### Folder, modules and data import

In [1]:
# Accedemos a nuestras carpetas en drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/7.1 MB[0m [31m12.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m4.8/7.1 MB[0m [31m68.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.1/7.1 MB[0m [31m86.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.2


In [3]:
import os
import pathlib
import pandas as pd
import re
from spellchecker import SpellChecker

from collections import Counter

Specify the path for the json containing the scraped lyrics, the export path for the cleaned lyrics to be used at the end of the nb and the path for the words in the Spanish dictionary.

In [4]:
current_path = pathlib.Path().absolute()
lyrics_path = str(current_path) + '/drive/MyDrive/Lenguaje Natural/Letra narcocorridos/Letras_json_10-03/lyrics_expanded2.json'
clean_lyrics_path = str(current_path) + '/drive/MyDrive/Lenguaje Natural/Letra narcocorridos/Letras_json_10-03/clean_lyrics_extended.csv'
spanish_dict_path = str(current_path) + '/drive/MyDrive/Lenguaje Natural/Letra narcocorridos/Letras_json_10-03/spanish_dict.txt'

In [5]:
with open(spanish_dict_path, encoding='utf8') as f:
  spanish_dictionary = f.read() #IMport the words in the Spanish dictionary
  spanish_dictionary = spanish_dictionary.split('\n')

Import the data in the lyrics json into a pandas df

In [6]:
songs_df = pd.read_json(lyrics_path, orient='records')
songs_df = songs_df.dropna()
# Transform from single-element-list to string for titles and artist columns
songs_df['title'] = [l[0] for l in songs_df['title']]
songs_df['artist'] = [l[0] for l in songs_df['artist']]

In [7]:
songs_df.size

9330

In [8]:
songs_df['title']

Unnamed: 0,title
0,A Caballo
1,A Cada Instante
2,A Capelinha do Arraiá
3,A Casa Caiu 2017
4,A Casa Caiu
...,...
3105,Ámame O Déjame Ya
3106,Ángel de Virtudes
3107,ÁNGEL part Romeo Santos
3108,Èl No Eres Tù


In [9]:
songs_df['lyrics'] = songs_df['lyrics'].apply(lambda x: x[0] if isinstance(x, list) else x)

# Data cleaning

## Removing artist insertion and punctuation elements

Many lyrics have insertions like: (artist_name) or [artist_name] indicating the singer switch between sentences, which are not actually lyrics. After removing them, the punctuation signs will be removed since they are interpretations from the person uploading the lyrics. Only the end of question (?) will be left as an objective sign (i.e. amount of questions in a song could be a valid feature).

In [10]:
import re
import pandas as pd

def remove_artist_insertion(lyrics: pd.Series, artists: pd.Series, spanish_dict: list):
    for index in range(len(lyrics)):
        dirty_text = lyrics.iloc[index]  # Obtener la letra original
        artist_name = artists.iloc[index]  # Obtener el nombre del artista

        print(f"Procesando artista: {artist_name}")  # Depuración

        # Construir una expresión regular para eliminar la estructura completa [artista] o (artista)
        re_str = rf"[\[\(]{re.escape(artist_name)}[\]\)]"
        print(f"Expresión regular: {re_str}")  # Ver la expresión generada

        # Reemplazar la coincidencia completa en el texto
        dirty_text = re.sub(re_str.lower(), '', dirty_text.lower(), flags=re.IGNORECASE).strip()

        # Guardar la letra corregida en el DataFrame
        lyrics.iloc[index] = dirty_text

    return lyrics

songs_df['lyrics'] = remove_artist_insertion(songs_df['lyrics'], songs_df['artist'], spanish_dictionary)

def remove_special_signs(lyrics:pd.Series):
  signs_correction = [(',', ''), ('¿', ''), ('?', ' ? '), ('¡', ''), ('!',''),
                    ('(', ''), (')', ''), ('[', ''), (']', ''), ('-', ''),
                    ('.', ''), ('.', ''), (":", ""), (";", ""), ("_", ""), ('...', '')]
  for replacement in signs_correction:
    lyrics = [l.replace(replacement[0], replacement[1]) for l in lyrics]
  return lyrics

songs_df['lyrics'] = remove_special_signs(songs_df['lyrics'])

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
Procesando artista: La Morocha
Expresión regular: [\[\(]La\ Morocha[\]\)]
Procesando artista: Banda Todo Terreno
Expresión regular: [\[\(]Banda\ Todo\ Terreno[\]\)]
Procesando artista: Kevin Ortiz
Expresión regular: [\[\(]Kevin\ Ortiz[\]\)]
Procesando artista: Morro da Crioula
Expresión regular: [\[\(]Morro\ da\ Crioula[\]\)]
Procesando artista: Corridos Mexicanos
Expresión regular: [\[\(]Corridos\ Mexicanos[\]\)]
Procesando artista: Los Jilgueros Del Pico Real
Expresión regular: [\[\(]Los\ Jilgueros\ Del\ Pico\ Real[\]\)]
Procesando artista: Los Jilgueros Del Pico Real
Expresión regular: [\[\(]Los\ Jilgueros\ Del\ Pico\ Real[\]\)]
Procesando artista: Corridos Mexicanos
Expresión regular: [\[\(]Corridos\ Mexicanos[\]\)]
Procesando artista: Wences Romo
Expresión regular: [\[\(]Wences\ Romo[\]\)]
Procesando artista: Corridos Mexicanos
Expresión regular: [\[\(]Corridos\ Mexicanos[\]\)]
Procesando artista: Corridos 

We can see the result of this preliminar cleaning:

In [11]:
songs_df.head(10)

Unnamed: 0,title,artist,lyrics
0,A Caballo,José Manuel Figueroa,soy de a caballo señores\nmas no vengo a presu...
1,A Cada Instante,Leandro Ríos,a cada\ninstante\ntengo recuerdos\ntuyos\na ca...
2,A Capelinha do Arraiá,Rolando Boldrin,na minha terra a capelinha\né simples pequenin...
3,A Casa Caiu 2017,Boi Garantido,a cunhã tu despachou\ntua sinhazinha cansou\ne...
4,A Casa Caiu,Boi Garantido,olha contrário\ntua história é de marmota derr...
5,A Cascavé,Rolando Boldrin,tuda vez que ele ia na estação\ne arrecebia ca...
6,A Chama Azul,Boi Caprichoso,a chama azul é a mais completa\nsão quarenta e...
7,A Chegada do Meu Boi 2,Boi Caprichoso,trago a magia do lugar\na força da cultura des...
8,A Coisa Mais Linda do Mundo,Boi Garantido,a coisa mais linda do mundo\né quando no mês d...
9,A Conquista,Boi Garantido,um dia chegou nessa terra um conquistador\nman...


## Spelling correction

The next step is to try to reduce as much as possible the amount of misspelled words. This is a complex task due to several reasons:
* Slang and geographical variations not recognized in the
reference dictionaries are common.
* A mix of terms in English are widely used, sometimes with semantic variation from the original English word.
Slang and English words are left untouched since they might be significant features.

A feedback process has been manually performed to identify the most common mispellings.These have been manually incorporated to the function *refine_spelling* by the tuned_spelling list to be corrected. A subseequent correction is also performed by the candidate proposed by the SpellChecker.correction() method for those words ending with a "'", since it was verified manually that the accuracy rate of the candidate proposed for this misspelling was quite high.

In [12]:
def refine_spelling(lyrics: pd.Series):
    spelling_check = SpellChecker(language='es')  # Usar diccionario en español

    tuned_spelling = [
        ("pa’", "para"), ("to'", "todo"), ("vamo'", "vamos"), ("e'", "es"),
        ("na'", "nada"), ("pa'", "para"), ("lo'", "los"), ("to'a", "toda"),
        ("ere'", "eres"), ("quiere'", "quieres"), ("pa'l", "para el"),
        ("ma'", "más"), ("po'", "por"), ("tiene'", "tienes"), ("la'", "las"),
        ("sabe'", "sabes"), ("va'", "vas"), ("estamo'", "estamos"),
        ("hacemo'", "hacemos"), ("somo'", "somos"), ("está'", "estás"),
        ("no'", "nos"), ("haga'", "hagas"), ("llama'", "llamas"),
        ("yo'", "yo"), ("la'o", "lado"), ("perriarte", "perrear"),
        ("toa", "toda"), ("amigo'", "amigos"), ("claro'", "claro"),
        ("partío'", "partido"), ("má'", "más"), ("mojaíta", "mojada"),
        ("to’", "todo"), ("oí'te", "oíste"), ("beso'", "beso"),
        ("nosotro'", "nosotros"), ("rompe'", "romper"), ("dize", "dice"),
        ("dio'", "dios"), ("vece'", "veces"), ("ve'", "ves"),
        ("cosa'", "cosas"), ("pa'lante", "para adelante"),
        ("sabe’", "sabes"), ("perrearte", "perrear"), ("perriando", "perreando"),
        ("hora'", "horas"), ("diga'", "digas"), ("pá'", "papá"),
        ("toa'", "toda"), ("despué'", "después"), ("andamo'", "andamos"),
        ("pasa'o", "pasado"), ("entonce'", "entonces"),("pa", "para"),
        ("ay", " "),("oh", " "),("qe", "que"),("yah", " "),("uh", " "),
        ("ah", " "),("tatara", " "),("parara", " "),("zaza", " "),("-ta'", " ")
    ]

    # Aplicar correcciones manuales
    for replacement in tuned_spelling:
        lyrics = lyrics.str.replace(replacement[0], replacement[1], regex=False)

    adhoc_dict = {}  # Diccionario para almacenar correcciones manuales

    # Recorrer cada línea de la columna 'lyrics'
    for index in range(len(lyrics)):
        for word in re.findall(r"(\w*)\'\B", lyrics.iloc[index]):
            word = word + "'"  # Restaurar el apóstrofe en la palabra

            if word not in adhoc_dict:
                correction = spelling_check.correction(word)

                # 🔹 Si la corrección es None, usar la palabra original
                if correction is None:
                    correction = word  # Mantiene la palabra sin cambios

                lyrics.iloc[index] = re.sub(re.escape(word), correction, lyrics.iloc[index])
                adhoc_dict[word] = correction
            else:
                correction = adhoc_dict[word]
                lyrics.iloc[index] = re.sub(re.escape(word), correction, lyrics.iloc[index])

    return lyrics, adhoc_dict

# Aplicar la función en el DataFrame
songs_df['lyrics'], words_corrected = refine_spelling(songs_df['lyrics'])

As an interesting side detail, the dictionary of corrected words is shown hereunder. The automatic correction was not perfect in some cases, but the overall accuracy is high.

In [13]:
print('The amount of words corrected was: {0}'.format(len(words_corrected)))

The amount of words corrected was: 132


In [14]:
words_corrected

{"cumêr'": 'cumbre',
 "l'": 'la',
 "paral'": 'paral',
 "hal'": 'halo',
 "lao'": 'laos',
 "todita'": 'todito',
 "ubi'": 'ubio',
 "r'": 're',
 "plebi'": 'plebe',
 "cami'": 'camia',
 "mari'": 'morir',
 "pieda'": 'piedad',
 "'": "'",
 "seguimo'": 'seguido',
 "escucha'": 'escucha',
 "negro'": 'negro',
 "dejas'": 'dejar',
 "m'": 'me',
 "vert'": 'ver',
 "apararezca'": "apararezca'",
 "esta'": 'esta',
 "terrorista'": 'terrorista',
 "ta'": 'tan',
 "botellita'": "botellita'",
 "rista'": 'lista',
 "juntin'": 'juncina',
 "trai'": 'trata',
 "lambo'": 'ambos',
 "atra'": 'otra',
 "estuvimo'": "estuvimo'",
 "cora'": 'coral',
 "cruisin'": "cruisin'",
 "ol'": 'ola',
 "gata'": 'gata',
 "otra'": 'otra',
 "quiera'": 'quieta',
 "tiramo'": 'tirado',
 "vai'": 'vale',
 "escuchamo'": 'escuchador',
 "maquillá'": "maquillá'",
 "buscando'": 'buscador',
 "acá'": 'acá',
 "salvavida'": 'salvavidas',
 "mon'": 'mono',
 "fumemo'": 'fumero',
 "hombro'": 'hombro',
 "sabes'": 'saber',
 "cel'": 'celo',
 "tryin'": 'trina',
 

As a final sanity check, we can peak into the non-recognised words still present in the lyrics, as well as their occurence count. This can give a quick intuition of the *dirtiness* of our lyrics dataset. The candidate proposed by SpellChecker.correction() method is also added, to give an intuition of how many of these words could be corrected with a drastic automatic correction of all the non-recognised terms.

In [None]:
def count_ocurrences(df, spanish_dict_path=None):
    spelling_check = SpellChecker(language='es')  # Usar diccionario en español

    # Cargar diccionario adicional si se proporciona
    if spanish_dict_path:
        spelling_check.word_frequency.load_text_file(spanish_dict_path)

    word_count = {}  # Diccionario para almacenar la frecuencia de palabras

    # Recorrer cada fila de la columna 'lyrics'
    for index, lyrics in df["lyrics"].items():
        words = lyrics.lower().split()  # Convertir a minúsculas y dividir en palabras
        for word in words:
            word_count[word] = word_count.get(word, 0) + 1  # ✅ Incrementar conteo correctamente

    # Ordenar el diccionario de mayor a menor frecuencia
    word_count = dict(sorted(word_count.items(), key=lambda item: item[1], reverse=True))

    print("Frecuencia de palabras:", word_count)  # ✅ Verificar resultados intermedios

    no_dict_words = {}  # Diccionario para palabras no reconocidas

    # Identificar palabras fuera del diccionario
    for word, count in word_count.items():
        if spelling_check.unknown([word]):  # Si la palabra no está en el diccionario
            if count > 3:
                dict_correction = spelling_check.correction(word)  # Sugerir corrección
            else:
                dict_correction = 'unknown'  # Marcar como desconocida
            no_dict_words[word] = {'count': count, 'Dict_candidate': dict_correction}

    return no_dict_words  # ✅ Devolver palabras no reconocidas

# Llamar a la función y almacenar resultados
no_dict_word = count_ocurrences(songs_df, spanish_dict_path)

Frecuencia de palabras: {'que': 28431, 'y': 18187, 'de': 16295, 'no': 16051, 'me': 15818, 'a': 13637, 'la': 13081, 'te': 11051, 'el': 11045, 'mi': 9310, 'en': 9183, 'se': 6991, 'por': 6824, 'lo': 6694, 'yo': 6662, 'tu': 6165, 'un': 5927, 'es': 5642, 'si': 5170, 'con': 5121, 'parara': 4971, 'ya': 4642, 'amor': 4341, 'los': 3828, 'las': 3174, 'como': 3075, 'pero': 2991, 'una': 2884, 'quiero': 2875, 'le': 2702, 'al': 2664, 'ti': 2568, 'tú': 2549, 'más': 2526, 'vida': 2435, 'porque': 2389, 'todo': 2165, 'cuando': 2111, 'su': 2086, 'voy': 1979, 'sin': 1958, 'soy': 1819, 'solo': 1802, 'h': 1700, 'bien': 1691, 'siempre': 1681, 'mis': 1664, 'del': 1626, 'tus': 1593, 'qué': 1592, 'o': 1498, 'corazón': 1389, 'ser': 1376, 'tengo': 1374, 'estoy': 1353, 'muy': 1343, 'nada': 1330, '?': 1321, 'nunca': 1287, 'ni': 1270, 'aquí': 1253, 'mí': 1226, 'tan': 1210, 'eso': 1190, 'eres': 1185, 'ora': 1183, 'hoy': 1166, 'esta': 1164, 'nos': 1152, 'vez': 1150, 'sé': 1070, 'fue': 1049, 'va': 1031, 'aunque': 1029,

In [None]:
print('The amount of non-recognised words in the dataset is: {0}'.format(len(no_dict_word)))

The amount of non-recognised words in the dataset is: 19132


In [None]:
no_dict_word

{'parara': {'count': 4971, 'Dict_candidate': 'parar'},
 'quieres': {'count': 549, 'Dict_candidate': 'quienes'},
 'ando': {'count': 515, 'Dict_candidate': 'mando'},
 'estás': {'count': 505, 'Dict_candidate': 'estas'},
 'í': {'count': 498, 'Dict_candidate': 'a'},
 'sabes': {'count': 462, 'Dict_candidate': 'saber'},
 'corazon': {'count': 451, 'Dict_candidate': 'corazón'},
 'dime': {'count': 427, 'Dict_candidate': 'dije'},
 'é': {'count': 419, 'Dict_candidate': 'a'},
 'eu': {'count': 416, 'Dict_candidate': 'el'},
 'traigo': {'count': 391, 'Dict_candidate': 'trago'},
 'compara': {'count': 384, 'Dict_candidate': 'compra'},
 'paraso': {'count': 315, 'Dict_candidate': 'parado'},
 'nomás': {'count': 298, 'Dict_candidate': 'más'},
 'parasado': {'count': 269, 'Dict_candidate': 'pasado'},
 'quieras': {'count': 267, 'Dict_candidate': 'quiera'},
 'dia': {'count': 234, 'Dict_candidate': 'día'},
 'asi': {'count': 233, 'Dict_candidate': 'si'},
 'parasa': {'count': 214, 'Dict_candidate': 'parada'},
 'mu

As it can be seen below, many of the words still remaining are self-mentions of the artists to themselves (quite common in reggaeton), English terms, and representations of singing bits (i.e. uah, aah, ohohoh). One tempting idea would be to replace the self-mentions by a common token, representing a self-mentioning feature. However the cleaning of this elements will be left as a *might do* after the results of the model are obtained.

In [15]:
songs_df.to_csv(path_or_buf=clean_lyrics_path) #Export the clean set to a csv file