<a href="https://colab.research.google.com/github/HamdanXI/nlp_adventure/blob/main/daily_dialog_gloss_Oct19_adj_adv_verb_noun_capital.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset
import re
import spacy

dataset = load_dataset("HamdanXI/cleaned_daily_dialog_sentence")
nlp = spacy.load("en_core_web_sm")

def remove_spaces_before_punctuation(example):
    example['dialogue'] = re.sub(r'\s([?.!"](?:\s|$))', r'\1', example['dialogue'])
    return example

def add_space_after_full_stop(example):
    # Add space after full stop if there isn't one already
    example['dialogue'] = re.sub(r'(?<=[.?!])(?=[^\s])', r' ', example['dialogue'])
    return example

def replace_first_person_pronouns(example):
    # Define a pattern for first-person pronouns (case insensitive)
    pattern = r'\b(I|me|my|mine|myself|we|us|our|ours|ourselves)\b'
    example['dialogue'] = re.sub(pattern, "PRO.1", example['dialogue'], flags=re.IGNORECASE)
    return example

def replace_second_person_pronouns(example):
    # Define a pattern for second-person pronouns (case insensitive)
    pattern = r'\b(you|your|yours|yourself)\b'
    example['dialogue'] = re.sub(pattern, "PRO.2", example['dialogue'], flags=re.IGNORECASE)
    return example

def replace_third_person_pronouns(example):
    pattern = r'\b(he|him|his|she|her|hers|it|its|they|them|their|theirs|themselves)\b'
    example['dialogue'] = re.sub(pattern, "PRO.3", example['dialogue'], flags=re.IGNORECASE)
    return example

def replace_question_mark(example):
    example['dialogue'] = re.sub(r'\?', ' QM-W ', example['dialogue'])
    return example

def extracted_adj_adv_n_v(text):
    doc = nlp(text)
    extracted_tokens = [token.text for token in doc if token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV'] or token.text in ['PRO.1', 'PRO.2', 'PRO.3']]

    # Append 'QM-W' if it appears in the original text
    if 'QM-W' in doc.text:
        extracted_tokens.append('QM-W')

    return ' '.join(extracted_tokens)

# Use the map function to remove the dialogue feature
def remove_dialogue(example):
    return {
        "extracted_adj_adv_n_v": example["extracted_adj_adv_n_v"]
    }

# Apply the transformation
dataset_1 = dataset.map(remove_spaces_before_punctuation)
dataset_2 = dataset_1.map(add_space_after_full_stop)
dataset_3 = dataset_2.map(replace_question_mark)
dataset_4 = dataset_3.map(replace_first_person_pronouns)
dataset_5 = dataset_4.map(replace_second_person_pronouns)
dataset_6 = dataset_5.map(replace_third_person_pronouns)
dataset_7 = dataset_6.map(lambda example: {'extracted_adj_adv_n_v': extracted_adj_adv_n_v(example['dialogue'])})
dataset_7["train"] = dataset_7["train"].map(remove_dialogue, remove_columns=['dialogue'])

# To view the changes, you can print out the first few entries
print(dataset_1['train'][:5])
print(dataset_2['train'][:5])
print(dataset_3['train'][:5])
print(dataset_4['train'][:5])
print(dataset_5['train'][:5])
print(dataset_6['train'][:5])
print(dataset_7['train'][:5])

Map:   0%|          | 0/77350 [00:00<?, ? examples/s]

Map:   0%|          | 0/77350 [00:00<?, ? examples/s]

{'dialogue': ['Say , Jim , how about going for a few beers after dinner?', 'You know that is tempting but is really not good for our fitness.', 'What do you mean? It will help us to relax.', "Do you really think so? I don't. It will just make us fat and act silly. Remember last time?", "I guess you are right.But what shall we do? I don't feel like sitting at home."]}
{'dialogue': ['Say , Jim , how about going for a few beers after dinner?', 'You know that is tempting but is really not good for our fitness.', 'What do you mean? It will help us to relax.', "Do you really think so? I don't. It will just make us fat and act silly. Remember last time?", "I guess you are right. But what shall we do? I don't feel like sitting at home."]}
{'dialogue': ['Say , Jim , how about going for a few beers after dinner QM-W ', 'You know that is tempting but is really not good for our fitness.', 'What do you mean QM-W  It will help us to relax.', "Do you really think so QM-W  I don't. It will just make u

In [3]:
dataset_7

DatasetDict({
    train: Dataset({
        features: ['extracted_adj_adv_n_v'],
        num_rows: 77350
    })
})

In [8]:
def merge_datasets(example, idx, dataset_2=dataset_2):
    example['text'] = dataset_2['train'][idx]['dialogue']
    example['gloss'] = example['extracted_adj_adv_n_v']
    return example

merged_dataset = dataset_7.map(merge_datasets, with_indices=True)

# Step 3: Remove the old 'dialogue' column
merged_dataset = merged_dataset.remove_columns(["extracted_adj_adv_n_v"])

# Your dataset should now have the desired structure
print(merged_dataset['train'][:5])

{'text': ['Say , Jim , how about going for a few beers after dinner?', 'You know that is tempting but is really not good for our fitness.', 'What do you mean? It will help us to relax.', "Do you really think so? I don't. It will just make us fat and act silly. Remember last time?", "I guess you are right. But what shall we do? I don't feel like sitting at home."], 'gloss': ['Say going few beers dinner QM-W', 'PRO.2 know tempting really good PRO.1 fitness', 'PRO.2 mean PRO.3 help PRO.1 relax QM-W', 'PRO.2 really think so PRO.1 do PRO.3 just make PRO.1 fat act silly Remember last time W QM-W', 'PRO.1 guess PRO.2 right PRO.1 do PRO.1 feel sitting home QM-W']}


In [5]:
def capitalize_feature(example):
    example["gloss"] = example["gloss"].upper()
    return example

merged_dataset["train"] = merged_dataset["train"].map(capitalize_feature)

print(merged_dataset['train'][:5])

Map:   0%|          | 0/77350 [00:00<?, ? examples/s]

{'text': ['Say , Jim , how about going for a few beers after dinner?', 'You know that is tempting but is really not good for our fitness.', 'What do you mean? It will help us to relax.', "Do you really think so? I don't. It will just make us fat and act silly. Remember last time?", "I guess you are right. But what shall we do? I don't feel like sitting at home."], 'gloss': ['SAY GOING FEW BEERS DINNER QM-W', 'PRO.2 KNOW TEMPTING REALLY GOOD PRO.1 FITNESS', 'PRO.2 MEAN PRO.3 HELP PRO.1 RELAX QM-W', 'PRO.2 REALLY THINK SO PRO.1 DO PRO.3 JUST MAKE PRO.1 FAT ACT SILLY REMEMBER LAST TIME W QM-W', 'PRO.1 GUESS PRO.2 RIGHT PRO.1 DO PRO.1 FEEL SITTING HOME QM-W']}


In [6]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /roo

In [9]:
merged_dataset.push_to_hub("daily_dialog_gloss_Oct19_adj_adv_verb_noun")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/78 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/505 [00:00<?, ?B/s]