# Segmentation as a filling task  (Masked Language Modeling)
https://huggingface.co/transformers/usage.html#language-modeling


---

# Imports, Downloads, etc.

In [1]:
# Shell escape: enable the `widgetsnbextension` notebook extension for this Jupyter install.
!jupyter nbextension enable --py widgetsnbextension


Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [95]:
import pandas as pd
from tqdm import tqdm

In [3]:
import os
# Set before the tokenizer is created in a later cell.
# NOTE(review): presumably this switches off the tokenizers library's internal
# parallelism (and its fork-related warning) - confirm against the library docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import torch
from transformers import AutoModelForMaskedLM, AutoModelWithLMHead, AutoTokenizer

# Load the tokenizer and the masked-language-model head from the same checkpoint
# so that token ids produced by one are valid for the other.
# AutoModelForMaskedLM is the task-specific replacement for the deprecated
# AutoModelWithLMHead; for this checkpoint it resolves to the same fill-mask model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased")

2021-11-24 02:23:19.386443: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-24 02:23:19.386463: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


---
# Tatoeba Data
https://tatoeba.org/pt-br/

1. Download the sentences export from Tatoeba and save it as `data/sentences.csv`:

In [5]:
# The file is tab-separated and has no header row, so columns get integer labels.
df = pd.read_csv('data/sentences.csv', sep='\t', header=None)

# Keep only the rows whose column 1 equals 'eng' (presumably the language code).
# `.copy()` detaches the result from the original frame so that adding a column
# below does not trigger SettingWithCopyWarning.
df = df[df[1] == 'eng'].copy()

# Column 3 = column 2 with every character that is not a word character,
# whitespace or an apostrophe removed, then lower-cased.
# `regex=True` must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the punctuation in place.
df[3] = df[2].str.replace(r"[^\w\s']", "", regex=True).str.lower()

  df[3] = df[2].str.replace('[^\w\s\']',"").str.lower()


---
# Methods

In [87]:
def get_top_k_tokens(sentence, model, tokenizer, k=5):
    """Return the ids of the `k` most likely tokens for the first mask in `sentence`.

    Parameters
    ----------
    sentence : str
        Text containing the tokenizer's mask token.
    model : callable
        Called as `model(input_ids)`; element 0 of its result is used as the
        logits and indexed as `[batch, position, vocabulary]`.
    tokenizer : object
        Must provide `encode(...)` (returning a 2-D tensor when
        `return_tensors="pt"`) and `mask_token_id`.
    k : int, default 5
        Number of candidate token ids to return.

    Returns
    -------
    list[int]
        Token ids ordered from highest to lowest logit.

    Raises
    ------
    IndexError
        If the encoded input contains no mask token, e.g. because the sentence
        has none or because truncation to 500 tokens cut it off.
    """
    input_ids = tokenizer.encode(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=500,
        add_special_tokens=True,
    )

    # torch.where on a 2-D tensor yields (row_indices, column_indices);
    # the columns are the positions of the mask inside the sequence.
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1]
    if mask_positions.numel() == 0:
        # Fail before paying for a forward pass, with a message that names the cause.
        raise IndexError(
            "no mask token found in the encoded input "
            "(missing from the sentence or removed by truncation)"
        )

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        token_logits = model(input_ids)[0]

    mask_token_logits = token_logits[0, mask_positions, :]

    # Row 0 corresponds to the first mask position.
    return torch.topk(mask_token_logits, k, dim=1).indices[0].tolist()
        
def tokens_to_strings(tokens, tokenizer=None):
    """Decode each token id separately and return the resulting strings.

    Parameters
    ----------
    tokens : iterable of int
        Token ids to decode.
    tokenizer : object, optional
        Object providing `decode(list_of_ids)`. When omitted, the module-level
        `tokenizer` is used, which keeps existing one-argument calls working.

    Returns
    -------
    list[str]
        One decoded string per input id, in the same order.
    """
    if tokenizer is None:
        # Backward-compatible fallback to the notebook-global tokenizer.
        tokenizer = globals()["tokenizer"]
    # Each id is wrapped in its own list so it is decoded in isolation.
    return [tokenizer.decode([token]) for token in tokens]

In [74]:
def perform_punctuation(i, sent):
    """Append a period to the word at index `i` of the space-separated `sent`.

    The sentence is split on single spaces, a '.' is attached to the last word
    of the slice ending at index `i`, and the words are joined back with
    single spaces. If `i` is past the end, the final word receives the period.
    """
    words = sent.split(' ')
    cut = i + 1
    before = words[:cut]
    after = words[cut:]
    before[-1] = f"{before[-1]}."
    return " ".join([*before, *after])

## Method for segmentation

In [96]:
def perform_segmentation(sent, model, tokenizer, k=5, puncts=('.', ',', '-', '!', '?', ':', ';'), logic='any_punct'):
    """Split unpunctuated text into segments by probing a masked language model.

    A mask token is inserted after every space-separated word in turn and the
    model's top-`k` predictions for that mask are decoded. Under the
    'any_punct' logic, if any prediction is one of `puncts`, a period is
    attached to the word. Periods placed earlier remain in the text for all
    later probes. The text is finally split on '.'.

    Parameters
    ----------
    sent : str
        Text to segment; it is converted to a string first.
    model, tokenizer :
        Forwarded to `get_top_k_tokens`. The tokenizer also supplies
        `mask_token` and decodes the predicted ids.
    k : int, default 5
        Number of predictions inspected at each position.
    puncts : iterable of str
        Decoded tokens that count as a segment boundary.
    logic : str, default 'any_punct'
        Decision rule; only 'any_punct' is implemented.

    Returns
    -------
    list[str]
        Non-empty segments with surrounding whitespace removed; `[]` when
        `sent` contains nothing but spaces.

    Raises
    ------
    ValueError
        If `logic` is not a supported rule (previously this returned None).
    """
    sent = f"{sent}"

    if len(sent.replace(' ', '')) == 0:
        return []

    if logic != 'any_punct':
        raise ValueError(f"unsupported logic {logic!r}; expected 'any_punct'")

    boundary_tokens = set(puncts)

    # Split once and edit the word list in place; appending '.' to a word never
    # changes the number of words, so the indices stay valid for the whole loop.
    words = sent.split(' ')
    for i in tqdm(range(len(words))):
        masked = " ".join(words[:i + 1] + [tokenizer.mask_token] + words[i + 1:])
        candidate_ids = get_top_k_tokens(masked, model, tokenizer, k)
        # Decode with the tokenizer that was passed in, one id at a time.
        candidates = [tokenizer.decode([token_id]) for token_id in candidate_ids]

        if boundary_tokens.intersection(candidates):
            words[i] = words[i] + '.'

    punctuated = " ".join(words)
    # Strip after splitting and drop anything that ends up empty, so stray
    # spaces never produce blank or padded segments.
    pieces = (piece.strip() for piece in punctuated.split('.'))
    return [piece for piece in pieces if piece]
        

### Example

In [97]:
# Join the first five cleaned sentences into one unpunctuated string and check
# how the segmenter splits it back up.
first_sentences = df[3].str.lower().iloc[:5]
sent = " ".join(first_sentences)

perform_segmentation(sent, model=model, tokenizer=tokenizer)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:01<00:00, 22.71it/s]


["let's try something",
 'i have to go to sleep',
 'today is june 18th',
 "and it is muiriel's birthday",
 'muiriel is 20',
 'now',
 'the password is',
 'muiriel']

---
# Segmenting our data - Subtitles

### Reading

In [98]:
# Read every file in the raw-subtitles folder into one frame:
#   id       -> file name
#   text_raw -> full file contents
RAW_DIR = "../data/00_raw/"

records = []
# os.listdir returns names in arbitrary order; sorting keeps the row order
# (and therefore `subtitles.iloc[0]`) stable between runs.
for file in sorted(os.listdir(RAW_DIR)):
    # `with` closes each handle instead of leaving it to the garbage collector.
    # NOTE(review): files are read with the platform default encoding, as before.
    with open(os.path.join(RAW_DIR, file)) as handle:
        records.append({'id': file, 'text_raw': handle.read()})

# Build the frame once from the collected records. DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every iteration.
subtitles = pd.DataFrame(records, columns=['id', 'text_raw'])

In [99]:
# Normalise the raw subtitle text into the unpunctuated, lower-case form the
# segmenter is fed with. `regex` is passed explicitly on every replace because
# its default differs between pandas versions (literal matching since 2.0).
# NOTE(review): newlines are deleted rather than replaced by a space, so words
# separated only by a line break get glued together - confirm this is intended.
text = (
    subtitles.text_raw
    .str.lower()
    .str.replace('\n', '', regex=False)
    # Drop every run of characters that is not a word character, whitespace or apostrophe.
    .str.replace(r"[^\w\s']+", '', regex=True)
    # Collapse runs of two or more spaces into a single space.
    .str.replace(r" {2,}", ' ', regex=True)
)
subtitles['text'] = text

  text = text.str.replace("[^\w\s\']+",'')


In [102]:
# Preview the first cleaned subtitle, then segment its first 1000 characters
# (the segmenter runs one model forward pass per word, so the slice keeps it quick).
first_text = subtitles.iloc[0]['text']

print("Our sentence: ")
print(first_text[:200], '[...]')

segments = perform_segmentation(first_text[:1000], model, tokenizer)
segments

Our sentence: 
hey oh hey what's up bradley just want to take another look at you all right music wow i really need to shave hey guys carsten rehnquist year if you've been to any movie in the last few months you've  [...]


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 203/203 [00:55<00:00,  3.66it/s]


["hey oh hey what's up",
 'bradley just want to take another look at you all right',
 'music',
 'wow',
 'i really need to shave',
 'hey guys',
 "carsten rehnquist year if you've been to any movie in the last few months",
 "you've probably seen the trailer for a little movie called a star is born",
 'this film was been getting a lot of buzz and it stars lady',
 'gaga and bradley cooper',
 'who i believe also directed',
 "it it's a remake of the original 1977 film",
 "and it's been getting a lot of buzz like i just said",
 'so',
 'of course',
 'i had to see it now based on the trailer',
 "and the poster this really didn't seem like",
 'my type of thing',
 'it seemed more like a film for people that live in the suburbs',
 'who also have waller above their leather couches',
 'that says live laugh',
 'love and thing',
 'is this movie',
 'is not at all like that',
 "it's not at all what",
 'it was promoted',
 'as in my opinion',
 'it seems like a story',
 "that's all about this woman's rise"

## Observações

### Limitações

1. Demora muito
2. Limitação no número máximo de tokens
3. Ao pontuar, não considera nunca mais mudar aquele ponto

### Notas

- É preciso remover tokens de música etc. das legendas do YouTube
- Talvez valha a pena não considerar a vírgula
- Como medir se é bom?