# Ampliación de datos auxiliar

En este notebook se aplica la ampliación de datos, mediante las técnicas EDA y NLP albumentation, a un conjunto de reseñas de Google Maps separadas en dos ficheros: uno con las reseñas que se van a considerar válidas y otro con las inválidas. Cada línea de un fichero es una reseña distinta. Este notebook complementa al otro notebook con otras técnicas para la ampliación de datos sobre las reseñas.


### Imports

In [82]:
import pandas as pd
import copy
import time
import unicodedata
import re
import random
from stop_words import get_stop_words
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\franp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\franp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Directorio de datos

In [2]:
# Absolute paths of the two labeled input files: one with the valid reviews
# and one with the invalid reviews.
# NOTE(review): these are machine-specific Windows paths; adjust them before
# running the notebook on another machine.
validReviewsSource = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\1. Data\\4. Labeled Reviews\\2. Without Emojis\\ValidReviews.txt"
invalidReviewsSource = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\1. Data\\4. Labeled Reviews\\2. Without Emojis\\InvalidReviews.txt"

## Funciones auxiliares
Aquí se definen unas funciones auxiliares para la ampliación del conjunto de datos

In [3]:
def importFromTxtToList(source):
    """Load a UTF-8 text file into a list with one entry per line.

    Args:
        source: Path of the text file to read.

    Returns:
        A list of strings in file order, one per line, with leading and
        trailing whitespace (including the line break) removed. Blank lines
        are kept as empty strings.
    """
    entries = []
    with open(source, mode='r', encoding="utf-8") as handle:
        # Iterating the handle yields one line at a time
        for raw_line in handle:
            entries.append(raw_line.strip())
    return entries

In [4]:
# Load the valid reviews into a list (one list entry per line of the file)
validReviewsList = importFromTxtToList(validReviewsSource)
# Load the invalid reviews the same way
invalidReviewsList = importFromTxtToList(invalidReviewsSource)

In [5]:
# Normalise a text: lowercase it, strip accent marks and drop punctuation
def cleanText(text):
    """Return a lowercase, ASCII-only, punctuation-free version of *text*.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string. Characters with no ASCII equivalent after
        NFKD decomposition are dropped; word characters (letters, digits,
        underscore) and whitespace are kept.
    """
    lowered = text.lower()
    # NFKD splits accented letters into base letter + combining mark, so the
    # ASCII round-trip below discards only the mark ("é" -> "e")
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Anything that is neither a word character nor whitespace is punctuation
    return re.sub(r'[^\w\s]', '', ascii_only)

In [6]:
# Clean up the text removing spanish stop words
def _strip_accents(word):
    """Return *word* without accent marks (e.g. "según" -> "segun")."""
    return unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')


def _build_spanish_stop_words():
    """Build the stop-word set: every listed word plus its accent-free form."""
    determinantes = {"el", "la", "los", "las", "un", "una", "unos", "unas", "este", "esta", "estos", "estas",
                 "ese", "esa", "esos", "esas", "aquel", "aquella", "aquellos", "aquellas", "mi", "mis",
                 "tu", "tus", "su", "sus", "nuestro", "nuestra", "nuestros", "nuestras", "vuestro",
                 "vuestra", "vuestros", "vuestras", "primer", "primero", "primera", "segundo", "segunda"}

    preposiciones = {"a", "ante", "bajo", "cabe", "con", "contra", "de", "desde", "durante", "en", "entre",
                 "hacia", "hasta", "mediante", "para", "por", "según", "sin", "sobre", "tras", "versus", "vía"}

    # NOTE: the multi-word entries can never match a single whitespace-split
    # token; they are kept only to preserve the original word list.
    conjunciones = {"y", "e", "ni", "o", "u", "pero", "sino", "sino que", "mas", "aunque", "que", "porque",
                "como", "cuando", "donde", "mientras", "para que", "a fin de que", "puesto que", "ya que",
                "si", "siempre que"}
    pronombres = {
        # Personal pronouns
        "yo", "tú", "vos", "él", "ella", "nosotros", "nosotras",
        "vosotros", "vosotras", "ellos", "ellas", "usted", "ustedes",
        "me", "te", "lo", "la", "nos", "os", "los", "las", "le", "les", "se",

        # Possessive pronouns
        "mío", "mía", "míos", "mías",
        "tuyo", "tuya", "tuyos", "tuyas",
        "suyo", "suya", "suyos", "suyas",
        "nuestro", "nuestra", "nuestros", "nuestras",
        "vuestro", "vuestra", "vuestros", "vuestras",

        # Demonstrative pronouns
        "este", "esta", "estos", "estas",
        "ese", "esa", "esos", "esas",
        "aquel", "aquella", "aquellos", "aquellas",

        # Relative pronouns
        "que", "cual", "cuales", "quien", "quienes",
        "cuyo", "cuya", "cuyos", "cuyas", "donde",

        # Interrogative and exclamatory pronouns
        "qué", "quién", "quiénes", "cuál", "cuáles",
        "cuánto", "cuánta", "cuántos", "cuántas",
        "dónde", "cómo", "cuándo",

        # Indefinite pronouns
        "alguien", "algo", "nadie", "nada", "cualquiera",
        "todos", "todas", "varios", "varias", "muchos",
        "muchas", "pocos", "pocas", "alguno", "alguna",
        "algunos", "algunas", "ninguno", "ninguna",
        "uno", "una", "unos", "unas", "demás"
    }

    listed = determinantes | preposiciones | conjunciones | pronombres
    # Also register the accent-free spelling of every stop word so that text
    # whose accents were already stripped still matches ("segun", "via",
    # "demas"). Only the stop words are folded, never the input, so accented
    # content words such as "té" are not mistaken for "te".
    return frozenset(listed | {_strip_accents(word) for word in listed})


# Built once at definition time instead of on every call
_SPANISH_STOP_WORDS = _build_spanish_stop_words()


def revomeSpanishStopWords(text):
    """Remove Spanish determiners, prepositions, conjunctions and pronouns.

    Matching is case-insensitive and accepts each stop word both with and
    without its accent marks.

    Args:
        text: The string to filter; it is split on whitespace.

    Returns:
        The remaining words, in their original form and order, joined by
        single spaces. Returns an empty string when nothing remains.
    """
    kept = [word for word in text.split() if word.lower() not in _SPANISH_STOP_WORDS]
    return " ".join(kept)

In [7]:
# Wrap the given text in double quotes
def add_quotes(text):
    """Return *text* with a double-quote character added at each end."""
    return '"{}"'.format(text)

In [71]:
# Strip one pair of surrounding double quotes from the given text
def remove_quotes(text):
    """Return *text* without its enclosing double quotes, if it has them.

    The text is returned unchanged unless it both starts and ends with a
    double-quote character.
    """
    is_quoted = text[:1] == '"' and text[-1:] == '"'
    return text[1:-1] if is_quoted else text

## Synonym Replacement
Este método consiste en elegir aleatoriamente n palabras de la reseña que no sean palabras vacías y reemplazar cada una de ellas por uno de sus sinónimos, elegido al azar.

In [9]:
# Spanish stop words as returned by the `stop_words` package. Later cells use
# this collection for membership tests during synonym replacement and
# random deletion.
stop_words = get_stop_words('spanish')

In [10]:
# Swap the given word for one of its Spanish WordNet synonyms
def swap_synonym(word):
    """Return a random Spanish synonym of *word*, or *word* itself.

    Only the first synset that WordNet reports for the word is consulted.
    Underscores in multi-word lemmas are turned into spaces.

    Args:
        word: The word to look up in the Spanish WordNet.

    Returns:
        A randomly chosen lemma of the first synset that differs from *word*
        (compared case-insensitively). *word* is returned unchanged when it
        has no synset or the synset offers no different lemma.
    """
    # Query WordNet once and reuse the result
    synsets = wordnet.synsets(word, lang='spa')
    if not synsets:
        return word

    target = word.lower()
    candidates = []
    for lemma in synsets[0].lemma_names('spa'):
        readable = lemma.replace('_', ' ').strip()
        # Keep only lemmas that are a different word from the input
        if readable.lower() != target:
            candidates.append(readable)

    return random.choice(candidates) if candidates else word

In [11]:
def swapBySynonymLine(line, probability=0.4):
    """Replace some non-stop words of *line* with a synonym.

    Args:
        line: The review text; it is split on whitespace.
        probability: Chance, between 0 and 1, that each non-stop word is
            sent to ``swap_synonym``. Defaults to 0.4.

    Returns:
        The words joined by single spaces, with the selected words replaced
        by whatever ``swap_synonym`` returned for them.
    """
    new_words = []
    for word in line.split():
        # Compare in lowercase so capitalised stop words ("La", "El") are
        # left alone too; the stop-word list is expected to be lowercase.
        # The random draw happens only for non-stop words.
        if word.lower() not in stop_words and random.random() <= probability:
            new_words.append(swap_synonym(word))
        else:
            new_words.append(word)
    return ' '.join(new_words)

In [12]:
# Synonym replacement method
def synonymReplacement(list):
    """Apply ``swapBySynonymLine`` to every review.

    Args:
        list: Iterable of review strings.

    Returns:
        A new list with one transformed review per input review, in order.
    """
    return [swapBySynonymLine(review) for review in list]

In [13]:
def fromListToTxt(file, path):
    """Write every string of *file* to *path*, one per line, as UTF-8.

    Args:
        file: Iterable of strings to write (despite the name, not a file).
        path: Destination path; an existing file is overwritten.
    """
    with open(path, mode='w', encoding='utf-8') as out:
        # Each entry is terminated with a newline
        out.writelines(entry + '\n' for entry in file)

In [14]:
# Output files for the synonym-replaced reviews.
# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
valid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\3. Synonym Replacement\\validSynonymsReviews.txt"
invalid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\3. Synonym Replacement\\invalidSynonymsReviews.txt"

# Run synonym replacement over both review sets
validSynonymsReviews = synonymReplacement(validReviewsList).copy()
invalidSynonymsReviews = synonymReplacement(invalidReviewsList).copy()

# Write each augmented set to its file, one review per line
fromListToTxt(validSynonymsReviews, valid_path)
fromListToTxt(invalidSynonymsReviews, invalid_path)

## Random Insertion
Este método consiste en encontrar un sinónimo aleatorio de una palabra aleatoria en la oración que no sea una palabra vacía e insertar ese sinónimo en una posición aleatoria de la oración.

In [15]:
# Number of insertions, swaps or deletions to perform on a line
def calculateInsertions(line):
    """Return how many edit operations to apply to *line*.

    The count is 10% of the number of whitespace-separated words, truncated
    to an integer, and never less than 1.
    """
    word_count = len(line.split())
    scaled = int(word_count * 0.1)
    return scaled if scaled > 1 else 1

In [16]:
def wordInsertion(line):
    """Insert synonyms of random non-stop words at random positions.

    The line is first normalised with ``cleanText``. For each insertion, a
    random non-stop word is picked and looked up with ``swap_synonym``. If
    the lookup returns the word itself, another word is tried, up to a fixed
    number of attempts, so that a real synonym is inserted rather than a
    duplicate of an existing word. An insertion is skipped when no attempt
    yields a synonym.

    Args:
        line: The review text.

    Returns:
        The cleaned line with the inserted synonyms, wrapped in double
        quotes. If the line has no non-stop words, the cleaned line is
        returned quoted and otherwise unchanged.
    """
    max_attempts = 10  # tries per insertion before giving up on it

    newLine = cleanText(line)
    words = newLine.split()
    # Candidate words whose synonyms may be inserted
    withoutStopWords = revomeSpanishStopWords(newLine).split()

    if withoutStopWords:
        for _ in range(calculateInsertions(line)):
            sinonimo = None
            for _attempt in range(max_attempts):
                elegida = random.choice(withoutStopWords)
                propuesto = swap_synonym(elegida).strip()
                # swap_synonym returns the input word when it finds nothing
                if propuesto and propuesto.lower() != elegida.lower():
                    sinonimo = propuesto
                    break
            if sinonimo is None:
                continue
            # Any position from the start to the end of the list is valid
            pos = random.randint(0, len(words))
            words.insert(pos, sinonimo)

    return add_quotes(' '.join(words))

In [17]:
# Random insertion method
def randomInsertion(list):
    """Apply ``wordInsertion`` to every review.

    Args:
        list: Iterable of review strings.

    Returns:
        A new list with one transformed review per input review, in order.
    """
    return [wordInsertion(review) for review in list]

In [18]:
# Output files for the random-insertion reviews.
# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
valid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\4. Random Insertion\\validInsertionsReviews.txt"
invalid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\4. Random Insertion\\invalidInsertionsReviews.txt"

# Run random insertion over both review sets
validRandomInsertionReviews = randomInsertion(validReviewsList).copy()
invalidRandomInsertionReviews = randomInsertion(invalidReviewsList).copy()

# Write each augmented set to its file, one review per line
fromListToTxt(validRandomInsertionReviews, valid_path)
fromListToTxt(invalidRandomInsertionReviews, invalid_path)

## Random Swap
Este método consiste en cambiar dos palabras aleatoriamente en la frase n veces.

In [19]:
def posNotStopWord(words, pos):
    """Return a random index into *words* that is different from *pos*.

    Despite its name, this function does not look at stop words; it only
    avoids the given position.

    Args:
        words: The sequence to pick an index from.
        pos: The index to exclude. Pass a value outside the valid range
            (e.g. -1) to allow every index.

    Returns:
        An index chosen uniformly among all indices of *words* except *pos*.

    Raises:
        ValueError: If no such index exists (empty *words*, or a single
            element whose index is *pos*).
    """
    # Choosing from the allowed indices directly always terminates, unlike
    # redrawing until the result differs from pos
    candidates = [index for index in range(len(words)) if index != pos]
    if not candidates:
        raise ValueError("no index other than pos is available in words")
    return random.choice(candidates)

In [20]:
def wordSwap(line):
    """Swap random pairs of words in *line*.

    The line is normalised with ``cleanText`` first. The number of swaps
    comes from ``calculateInsertions`` applied to the original line.

    Args:
        line: The review text.

    Returns:
        The cleaned, possibly reordered line wrapped in double quotes. Lines
        with fewer than two words are returned cleaned and quoted, without
        any swap.
    """
    tokens = cleanText(line).split()

    # Nothing to swap with fewer than two words
    if len(tokens) <= 1:
        return add_quotes(' '.join(tokens))

    for _ in range(calculateInsertions(line)):
        # Two distinct random positions
        first = posNotStopWord(tokens, -1)
        second = posNotStopWord(tokens, first)
        tokens[first], tokens[second] = tokens[second], tokens[first]

    return add_quotes(' '.join(tokens))

In [21]:
# Random swap method
def randomSwap(list):
    """Apply ``wordSwap`` to every review.

    Args:
        list: Iterable of review strings.

    Returns:
        A new list with one transformed review per input review, in order.
    """
    return [wordSwap(review) for review in list]

In [22]:
# Output files for the random-swap reviews.
# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
valid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\5. Random Swap\\validSwapReviews.txt"
invalid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\5. Random Swap\\invalidSwapReviews.txt"

# Run random swap over both review sets
validRandomSwapReviews = randomSwap(validReviewsList).copy()
invalidRandomSwapReviews = randomSwap(invalidReviewsList).copy()

# Write each augmented set to its file, one review per line
fromListToTxt(validRandomSwapReviews, valid_path)
fromListToTxt(invalidRandomSwapReviews, invalid_path)

## Random Deletion
Este método consiste en eliminar una palabra aleatoria en la oración que no sea una palabra vacía.

In [23]:
def wordDelete(line):
    """Delete random non-stop words from *line*.

    The line is normalised with ``cleanText`` first. The number of deletions
    comes from ``calculateInsertions`` applied to the original line. Each
    deletion picks directly among the words that are not in ``stop_words``,
    so no attempt is wasted on a stop word. Deleting stops early once a
    single word is left or only stop words remain, so the review is never
    emptied.

    Args:
        line: The review text.

    Returns:
        The cleaned line without the deleted words, wrapped in double
        quotes. Lines with fewer than two words are returned cleaned and
        quoted, without any deletion.
    """
    words = cleanText(line).split()

    for _ in range(calculateInsertions(line)):
        # The raw line can have more tokens than the cleaned text, so check
        # the remaining length on every pass
        if len(words) <= 1:
            break
        deletable = [index for index, word in enumerate(words) if word not in stop_words]
        if not deletable:
            break
        words.pop(random.choice(deletable))

    return add_quotes(' '.join(words))

In [24]:
def randomDeletion(list):
    """Apply ``wordDelete`` to every review.

    Args:
        list: Iterable of review strings.

    Returns:
        A new list with one transformed review per input review, in order.
    """
    return [wordDelete(review) for review in list]

In [25]:
# Output files for the random-deletion reviews.
# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
valid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\6. Random Deletion\\validDeletionReviews.txt"
invalid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\6. Random Deletion\\invalidDeletionReviews.txt"

# Run random deletion over both review sets
validRandomDeletionReviews = randomDeletion(validReviewsList).copy()
invalidRandomDeletionReviews = randomDeletion(invalidReviewsList).copy()

# Write each augmented set to its file, one review per line
fromListToTxt(validRandomDeletionReviews, valid_path)
fromListToTxt(invalidRandomDeletionReviews, invalid_path)

## Mixed EDA Methods
Para intentar que haya la mayor cantidad de variaciones posible, se van a pasar las reseñas por todas las técnicas de EDA realizadas anteriormente y, de esta manera, evitar un posible overfitting cuando se entrene al modelo con estos datos.

In [26]:
# Chain every EDA transformation over the reviews
def mixedEDAMethods(list):
    """Run the four EDA methods one after another on the reviews.

    The order is synonym replacement, random insertion, random swap and
    random deletion; each stage receives the output of the previous one.

    Args:
        list: The review strings to augment.

    Returns:
        The list produced by the last stage.
    """
    stages = (synonymReplacement, randomInsertion, randomSwap, randomDeletion)
    reviews = list
    for stage in stages:
        reviews = stage(reviews).copy()
    return reviews

In [72]:
# Output files for the reviews that went through all EDA methods.
# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
valid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\7. Mixed EDA Methods\\validMixedReviews.txt"
invalid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\7. Mixed EDA Methods\\invalidMixedReviews.txt"

# Run the combined EDA pipeline over both review sets
validMixedReviews = mixedEDAMethods(validReviewsList).copy()
invalidMixedReviews = mixedEDAMethods(invalidReviewsList).copy()

# Write each augmented set to its file, one review per line
fromListToTxt(validMixedReviews, valid_path)
fromListToTxt(invalidMixedReviews, invalid_path)

## NLP Albumentation
Consiste en cambiar de orden las frases de la reseña en caso de que tengan más de una frase y eliminar las frases que sean iguales.

In [74]:
def getUniqueSentences(line):
    """Split a review into its distinct sentences.

    Surrounding double quotes are removed first; the text is then split on
    '.' and every non-blank piece is trimmed and given a trailing period.

    Args:
        line: The review text.

    Returns:
        A set with the distinct sentences, each ending in '.'.
    """
    unique = set()
    for fragment in remove_quotes(line).split('.'):
        trimmed = fragment.strip()
        # Skip the empty pieces left by consecutive or trailing periods
        if trimmed:
            unique.add(trimmed + ".")
    return unique

In [79]:
def mixSentences(sentences):
    """Join the given sentences in random order.

    Args:
        sentences: Collection of sentence strings (typically a set).

    Returns:
        The sentences shuffled, joined by single spaces and wrapped in
        double quotes.
    """
    # Iteration order of a set of strings can change between interpreter
    # runs (string hash randomization). Sorting first gives the shuffle a
    # fixed starting order, so a given random seed always yields the same
    # result.
    ordered = sorted(sentences)
    random.shuffle(ordered)
    return add_quotes(' '.join(ordered))

In [80]:
# NLP albumentation method
def albumentation(list):
    """Deduplicate and shuffle the sentences of every review.

    Args:
        list: Iterable of review strings.

    Returns:
        A new list where each review has been passed through
        ``getUniqueSentences`` and then ``mixSentences``, in input order.
    """
    return [mixSentences(getUniqueSentences(review)) for review in list]

In [81]:
# Output files for the sentence-shuffled (NLP albumentation) reviews.
# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
valid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\8. NLP Albumentation\\validAlbumentationReviews.txt"
invalid_path = "C:\\Users\\franp\\OneDrive - Universidad Complutense de Madrid (UCM)\\Escritorio\\tfg\\TFG\\2. Review Classifier\\1. Data Augmentation\\8. NLP Albumentation\\invalidAlbumentationReviews.txt"

# Run the albumentation method over both review sets
validAlbumentationReviews = albumentation(validReviewsList).copy()
invalidAlbumentationReviews = albumentation(invalidReviewsList).copy()

# Write each augmented set to its file, one review per line
fromListToTxt(validAlbumentationReviews, valid_path)
fromListToTxt(invalidAlbumentationReviews, invalid_path)