# Tugas 1 : Basic NLP Tools
Oleh : 
- Safiq Faray (13519145)

In [1]:
import spacy
# Load the `en_core_web_sm` spaCy pipeline once and run it over the sample
# paragraph. Despite its name, `text` is the processed document returned by
# `nlp(...)`, not a raw string; every cell below reads from it.
# NOTE: the custom-entity cell further down assigns to `text.ents`, so re-run
# this cell to get the unmodified document back.
nlp = spacy.load("en_core_web_sm")
text = nlp(
    "Hello bro, i live in Indonesia and i love lasagna. "
    "Also, i love burgers and pizzas. "
    "Man, i wish i could get some pizza right now. "
    "I worked at itb"
)

## Sentence Splitter

In [2]:
def sentence_split(text):
    """Return the sentences of a processed document as a list.

    Parameters
    ----------
    text : object exposing a ``sents`` iterable (the notebook passes the
        document returned by ``nlp(...)``).

    Returns
    -------
    list
        One element per item yielded by ``text.sents``, in order.
    """
    return [sentence for sentence in text.sents]
sentence_split(text)

[Hello bro, i live in Indonesia and i love lasagna.,
 Also, i love burgers and pizzas.,
 Man, i wish i could get some pizza right now.,
 I worked at itb]

## Tokenization

In [3]:
def tokenization(text):
    """Return the surface string of every token in ``text``.

    Parameters
    ----------
    text : iterable of tokens, each exposing a ``text`` attribute.

    Returns
    -------
    list of str
        The ``text`` attribute of each token, in iteration order.
    """
    words = []
    for token in text:
        words.append(token.text)
    return words
tokenization(text)

['Hello',
 'bro',
 ',',
 'i',
 'live',
 'in',
 'Indonesia',
 'and',
 'i',
 'love',
 'lasagna',
 '.',
 'Also',
 ',',
 'i',
 'love',
 'burgers',
 'and',
 'pizzas',
 '.',
 'Man',
 ',',
 'i',
 'wish',
 'i',
 'could',
 'get',
 'some',
 'pizza',
 'right',
 'now',
 '.',
 'I',
 'worked',
 'at',
 'itb']

## Stemming

In [4]:
def stemming(token):
    """Trim a token's text using the lengths of its ``prefix_`` and ``suffix_``.

    The first ``len(token.prefix_) - 1`` characters are dropped from the front
    of ``token.text`` and then ``len(token.suffix_)`` characters are dropped
    from the end. For the notebook's sample token "writing" this yields "writ".

    NOTE(review): spaCy documents ``prefix_`` as the first character and
    ``suffix_`` as the last three characters of the token, so this presumably
    removes a fixed-length tail rather than a linguistic suffix (a token of
    three characters or fewer would come back empty) -- confirm this is the
    intended behaviour.

    Parameters
    ----------
    token : object exposing ``prefix_``, ``suffix_`` and ``text`` strings.

    Returns
    -------
    str
        The trimmed text.
    """
    leading = token.prefix_
    trailing = token.suffix_
    word = token.text
    # Drop all but one character's worth of the prefix length from the front.
    without_prefix = word[len(leading) - 1:]
    # Drop as many characters from the end as the suffix is long.
    keep = len(without_prefix) - len(trailing)
    return without_prefix[:keep]
tes = nlp("writing")
stemming(tes[0])

'writ'

## Lemmatization

In [5]:
def lemmatization(token):
    """Return the lemma string stored on ``token``.

    Parameters
    ----------
    token : object exposing a ``lemma_`` attribute.

    Returns
    -------
    The value of ``token.lemma_`` (e.g. 'write' for the notebook's sample
    token "writing").
    """
    lemma = token.lemma_
    return lemma
lemmatization(tes[0])

'write'

## Entity Masking

In [6]:
def mask_entity(text):
    """Return the document as a string with entity tokens replaced by "MASK".

    Every token whose ``ent_type_`` is non-empty is replaced by the literal
    word "MASK"; all other tokens contribute their ``text``. The pieces are
    joined with single spaces, so punctuation ends up space-separated and a
    multi-token entity produces one "MASK" per token.

    Parameters
    ----------
    text : iterable of tokens exposing ``text`` and ``ent_type_``.

    Returns
    -------
    str
        The space-joined, masked token sequence.
    """
    pieces = []
    for token in text:
        if token.ent_type_ == "":
            pieces.append(token.text)
        else:
            pieces.append("MASK")
    return ' '.join(pieces)
mask_entity(text)

'Hello bro , i live in MASK and i love lasagna . Also , i love burgers and pizzas . Man , i wish i could get some pizza right now . I worked at MASK'

## Entity Masking with Custom Words

In [7]:
from enum import Enum
from spacy.tokens import Span
# Entity categories that can be assigned to a custom word. The upper-cased
# member name is what create_new_entity uses as the span label.
class EntityType(Enum):
    Person = 1
    Norp = 2
    Fac = 3
    Org = 4
    Gpe = 5
    Loc = 6
    Product = 7
    Event = 8
    Work_of_art = 9
    Law = 10
    Language = 11
    Date = 12
    Time = 13
    Percent = 14
    Money = 15
    Quantity = 16
    Ordinal = 17
    Cardinal = 18

def get_token_index(text, word):
    """Locate every token whose text equals ``word`` exactly.

    Parameters
    ----------
    text : iterable of tokens exposing a ``text`` attribute.
    word : str
        The string to compare against each token's text (case-sensitive,
        whole-token match).

    Returns
    -------
    list of [int, int]
        One ``[start, end]`` pair per match, where ``start`` is the token's
        position in ``text`` and ``end`` is ``start + 1``. Empty when nothing
        matches.
    """
    return [
        [position, position + 1]
        for position, token in enumerate(text)
        if token.text == word
    ]

def filter_unique_entities(new_ents, original_ents):
    """Keep only the new spans that do not collide with an existing entity.

    A new span is dropped when it shares at least one token position with any
    span in ``original_ents``. This covers exact duplicates as well as partial
    overlaps (a word inside a longer entity, or the same tokens carrying a
    different label). Filtering on plain equality let such overlapping spans
    through, and spaCy rejects overlapping spans when they are assigned to
    ``doc.ents``.

    Parameters
    ----------
    new_ents : iterable of spans exposing integer ``start`` / ``end`` token
        offsets (``end`` exclusive).
    original_ents : iterable of spans already present on the document, with
        the same ``start`` / ``end`` attributes.

    Returns
    -------
    list
        The spans from ``new_ents``, in their original order, whose token range
        is disjoint from every span in ``original_ents``.
    """
    # Token positions already claimed by an existing entity.
    occupied = set()
    for ent in original_ents:
        occupied.update(range(ent.start, ent.end))
    return [
        ent for ent in new_ents
        if occupied.isdisjoint(range(ent.start, ent.end))
    ]

def create_new_entity(text, entity_word, entity_type: EntityType):
    """Tag every occurrence of ``entity_word`` in ``text`` as an entity.

    Each token whose text equals ``entity_word`` is wrapped in a one-token
    ``Span`` labelled with the upper-cased name of ``entity_type``. The spans
    that survive ``filter_unique_entities`` are appended to the document's
    existing entities. The document is modified in place via ``text.ents``.

    Parameters
    ----------
    text : the processed document to annotate.
    entity_word : str
        The exact token text to look for.
    entity_type : EntityType
        Member whose ``name.upper()`` becomes the span label.

    Raises
    ------
    ValueError
        If no token in ``text`` matches ``entity_word``. (ValueError is an
        ``Exception`` subclass, so handlers catching ``Exception`` still work.)
    """
    idx_tokens = get_token_index(text, entity_word)
    if not idx_tokens:
        raise ValueError('Custom word to be defined as entity not found in text')
    label = entity_type.name.upper()
    org_ents = list(text.ents)
    candidates = [Span(text, start, end, label=label) for start, end in idx_tokens]
    text.ents = org_ents + filter_unique_entities(candidates, org_ents)

def mask_entity_custom(text, entity_word, entity_type: EntityType):
    """Mark custom words as entities, then return the masked document text.

    Parameters
    ----------
    text : the processed document. Its ``ents`` are updated in place by
        ``create_new_entity``, so the change persists after this call.
    entity_word : list of str, or a single str
        The word(s) to tag. A bare string is treated as one word rather than
        being iterated character by character.
    entity_type : EntityType
        Entity category applied to every word in ``entity_word``.

    Returns
    -------
    str
        The result of ``mask_entity(text)`` after the custom entities are set.

    Raises
    ------
    Exception
        Propagated from ``create_new_entity`` when a word does not occur in
        ``text``.
    """
    words = [entity_word] if isinstance(entity_word, str) else entity_word
    for word in words:
        create_new_entity(text, word, entity_type)
    return mask_entity(text)

mask_entity_custom(text, ["bro"], EntityType.Law)


'Hello MASK , i live in MASK and i love lasagna . Also , i love burgers and pizzas . Man , i wish i could get some pizza right now . I worked at MASK'

## POS Tagger

In [8]:
def pos_tagger(text):
    """Collect the part-of-speech related attributes of every token.

    Parameters
    ----------
    text : iterable of tokens exposing ``text``, ``lemma_``, ``pos_``,
        ``tag_``, ``dep_``, ``shape_``, ``is_alpha`` and ``is_stop``.

    Returns
    -------
    list of dict
        One dictionary per token, in order, with the keys 'text', 'lemma',
        'pos', 'tag', 'dep', 'shape', 'is_alpha' and 'is_stop'.
    """
    return [
        {
            'text': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'dep': token.dep_,
            'shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
        }
        for token in text
    ]
pos_tagger(text)

[{'text': 'Hello',
  'lemma': 'hello',
  'pos': 'INTJ',
  'tag': 'UH',
  'dep': 'intj',
  'shape': 'Xxxxx',
  'is_alpha': True,
  'is_stop': False},
 {'text': 'bro',
  'lemma': 'bro',
  'pos': 'NOUN',
  'tag': 'NN',
  'dep': 'npadvmod',
  'shape': 'xxx',
  'is_alpha': True,
  'is_stop': False},
 {'text': ',',
  'lemma': ',',
  'pos': 'PUNCT',
  'tag': ',',
  'dep': 'punct',
  'shape': ',',
  'is_alpha': False,
  'is_stop': False},
 {'text': 'i',
  'lemma': 'I',
  'pos': 'PRON',
  'tag': 'PRP',
  'dep': 'nsubj',
  'shape': 'x',
  'is_alpha': True,
  'is_stop': True},
 {'text': 'live',
  'lemma': 'live',
  'pos': 'VERB',
  'tag': 'VBP',
  'dep': 'ROOT',
  'shape': 'xxxx',
  'is_alpha': True,
  'is_stop': False},
 {'text': 'in',
  'lemma': 'in',
  'pos': 'ADP',
  'tag': 'IN',
  'dep': 'prep',
  'shape': 'xx',
  'is_alpha': True,
  'is_stop': True},
 {'text': 'Indonesia',
  'lemma': 'Indonesia',
  'pos': 'PROPN',
  'tag': 'NNP',
  'dep': 'pobj',
  'shape': 'Xxxxx',
  'is_alpha': True,
  'i

## Phrase Chunking

In [9]:
def noun_phrase_chunk(text):
    """Return the noun chunks of a processed document as a list.

    Parameters
    ----------
    text : object exposing a ``noun_chunks`` iterable.

    Returns
    -------
    list
        Every item yielded by ``text.noun_chunks``, in order.
    """
    return list(text.noun_chunks)
noun_phrase_chunk(text)

[i, Indonesia, i, lasagna, i, burgers, pizzas, i, i, some pizza, I, itb]