# Levenshtein con palabras

Este enfoque ocupa la idea de Levenshtein pero con palabras en lugar de carácteres es decir que buscará dentro de cada video los subtítulos que cambien menos con la query

In [1]:
import numpy as np
from collections import defaultdict
import pickle
from tqdm import tqdm
import pandas as pd
import sys
sys.path.append('..')
from src.utils import *
from src.data import Data 

[nltk_data] Downloading package punkt to /Users/valencia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valencia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/valencia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/valencia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cargamos el corpus tokenizado y limpio

In [2]:
videos = Data("../pkl/clean_videos.pkl")
videos.corpus[0][0]

{'id': 'L9YhoRatRzE',
 'original_title': 'Siempre Fui Yo | Adelanto | Disney+',
 'subtitles': [{'start': '0.13',
   'dur': '3.77',
   'text': ['tu', 'papá', 'tuvo', 'un', 'accidente']},
  {'start': '12.5', 'dur': '5.939', 'text': ['te', 'recuerdo', 'que', 'está']},
  {'start': '15.59', 'dur': '5.339', 'text': ['aquí']},
  {'start': '18.439',
   'dur': '6.361',
   'text': ['estaba', 'como', 'rabioso', 'con']},
  {'start': '20.929',
   'dur': '7.65',
   'text': ['especial', 'con', 'lucas', 'martín']},
  {'start': '24.8',
   'dur': '3.779',
   'text': ['necesito', 'saber', 'qué', 'fue', 'lo', 'que', 'pasó']},
  {'start': '29.42', 'dur': '2.479', 'text': ['aplausos']}]}

In [3]:
def findIx(video, ix):
    """
    video: video en el que se buscara
    ix: indice de la palabra que buscamos
    regresa el subtitulo y su indice de la palabra buscada
    """
    subtitles = video["subtitles"]
    r = -1
    while ix>0:
        r += 1
        ix -= len(subtitles[r]["text"])
    return (r, subtitles[r])

In [4]:
videos.get_all_subtitles()
videos.all_subtitles[0]

100%|█████████████████████████████████████████| 9/9 [00:00<00:00, 25.00it/s]


['tu', 'papá', 'tuvo', 'un', 'accidente']

In [5]:
def getVideosDict():
    """
    regresa un diccionario donde el id de un video es su llave
    """
    return {v['id']:v for v in flatten(videos.corpus)}
videosDict = getVideosDict()

In [6]:
def printDistances(distances, token1Length, token2Length):
    """
    Función auxiliar para imprimir la matriz de las distancias de Levenshtein
    """
    for t1 in range(token1Length + 1):
        for t2 in range(token2Length + 1):
            print(int(distances[t1][t2]), end=" ")
        print()

In [7]:
def faux_levenshtein(q, d, verbose=False):
    """
    q: query a buscar
    d: string en la que buscamos
    verbose: si la función debe imprimir la matrix
    faux leveshtein ejecuta leveshtein ejecuta leveshtein con matrix, pero toda la columna 
    del espacio de busqueda es inicalizada con 0 lo que nos permite encontrar coincidencias
    como subcandena y comparadas desde el inico.
    Regresa la distancia minima de leveshtein entre nuestra query y una subcadena de d y el 
    indice de dicha subcadena.
    """
    distances = np.zeros((len(q) + 1, len(d) + 1))
    for t1 in range(len(q) + 1):
        distances[t1][0] = t1
    a = 0
    b = 0
    c = 0
    
    for t1 in range(1, len(q) + 1):
        for t2 in range(1, len(d) + 1):
            if (q[t1-1] == d[t2-1]):
                distances[t1][t2] = distances[t1 - 1][t2 - 1]
            else:
                a = distances[t1][t2 - 1]
                b = distances[t1 - 1][t2]
                c = distances[t1 - 1][t2 - 1]
                if (a <= b and a <= c):
                    distances[t1][t2] = a + 1
                elif (b <= a and b <= c):
                    distances[t1][t2] = b + 1
                else:
                    distances[t1][t2] = c + 1
    if(verbose):
        printDistances(distances, len(q), len(d))
    
    min_arg = np.argmin(distances[len(q)])
    return (distances[len(q)][min_arg], min_arg-len(q))

In [8]:
def search_faux_levenshtein(q):
    """
    q: arreglo de cadenas que queremos encontrar en el corpues
    regresa un dataframe ordenado por su destancia de faux levenshtein
    """
    indexes = []
    minvals = []
    for (k,d) in tqdm(videos.documents.items()):
        minval, mix =faux_levenshtein(q, d)
        indexes.append(mix)
        minvals.append(minval)
    df = pd.DataFrame(data={"id":videos.documents.keys(), 'faux leveshtein':minvals, 'fl index':indexes})
    return df.sort_values(by=['faux leveshtein'])

In [9]:
def search(q):
    """
    q: cadena que queremos encontrar en el corpus
    regresa un generador que podemos iterar para ir encontrando los resultados de la busqueda
    """
    def iterator(top):
        for _,serie in top.iterrows():
            ix = serie['fl index']
            video = videosDict[serie["id"]]
            subtitle_id, subtitle = findIx(video, ix)
            next_sub = None
            if (subtitle_id+1) < len(video["subtitles"]):
                next_sub = video["subtitles"][subtitle_id+1]["text"]
            yield {
                "id":serie["id"], 
                "subtitle": subtitle, 
                "next_subtitle": next_sub,
                "subtitle_id":subtitle_id, 
                'faux leveshtein':serie['faux leveshtein'] 
            }
    q = q.split(" ")
    top = search_faux_levenshtein(q)
    return iterator(top)

In [11]:
r = search("grande y gordo")

100%|███████████████████████████████████████| 3278/3278 [00:15<00:00, 205.98it/s]


In [12]:
next(r)

{'id': '6PDVRDv-nik',
 'subtitle': {'start': '872.71',
  'dur': '4.14',
  'text': ['veis', 'el', 'error', 'no', 'fue', 'ni', 'mucho', 'menos', 'tan']},
 'next_subtitle': ['grande',
  'y',
  'eso',
  'teniendo',
  'en',
  'cuenta',
  'que',
  'las'],
 'subtitle_id': 382,
 'faux leveshtein': 1.0}

In [10]:
def top5(query):
    fl = search("grande y gordo")
    return [next(fl)["subtitle"]["text"] for i in range(5)]

In [11]:
with open("../pkl/test.pkl", "rb") as f:
    test_all = pickle.load(f)

len(test_all)

4535

In [19]:
for ix, q in enumerate(test_all[1814*2:]):
    if ix%9==0:
        pickle.dump(resp, open("../pkl/faux_levenshtein.pkl", "wb"))
        print(f"salvando {ix} elementos")
    resp.append(top5(q))

salvando 0 elementos


100%|██████████████████████████████████████████| 3278/3278 [00:14<00:00, 225.70it/s]
100%|██████████████████████████████████████████| 3278/3278 [00:14<00:00, 223.27it/s]
100%|██████████████████████████████████████████| 3278/3278 [00:14<00:00, 228.22it/s]
100%|██████████████████████████████████████████| 3278/3278 [00:14<00:00, 230.41it/s]
100%|██████████████████████████████████████████| 3278/3278 [00:14<00:00, 226.35it/s]
100%|██████████████████████████████████████████| 3278/3278 [00:14<00:00, 224.29it/s]
100%|██████████████████████████████████████████| 3278/3278 [00:14<00:00, 225.91it/s]
100%|██████████████████████████████████████████| 3278/3278 [00:14<00:00, 219.01it/s]
100%|██████████████████████████████████████████| 3278/3278 [00:14<00:00, 228.41it/s]


salvando 9 elementos


 13%|█████▍                                     | 418/3278 [00:01<00:12, 222.25it/s]


KeyboardInterrupt: 

In [16]:
1814
resp = []