## Preprocessing 

In [1]:
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords

### 1. Read in csv file and create Dataframe & check shape.

In [2]:
str_data = """<html><h2>What is nlp??? </h2></html> 
Natural Language Processing, or NLP for short, is broadly defined as the automatic manipulation of natural language, like speech and text, by software.
The study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers.
(In this post), you will discover what natural language processing is and why it is so important.
After reading this post, you will know => What natural language is and how it is different from other types of data."""
str_data

'<html><h2>What is nlp??? </h2></html> \nNatural Language Processing, or NLP for short, is broadly defined as the automatic manipulation of natural language, like speech and text, by software.\nThe study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers.\n(In this post), you will discover what natural language processing is and why it is so important.\nAfter reading this post, you will know => What natural language is and how it is different from other types of data.'

### 2-1. Cleaning - Remove HTML

In [3]:
def remove_html(text_data):
    """
    WRITE THE CODE
    """

    soup = BeautifulSoup(text_data, 'lxml')
    return soup.get_text()

processed_text = remove_html(str_data)
print(processed_text)

What is nlp???  
Natural Language Processing, or NLP for short, is broadly defined as the automatic manipulation of natural language, like speech and text, by software.
The study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers.
(In this post), you will discover what natural language processing is and why it is so important.
After reading this post, you will know => What natural language is and how it is different from other types of data.


### 2-2. Cleaning - Remove punctuation(구두점) & Lower case

In [4]:
## Check English's punctuation
print('Punctuation: ', string.punctuation)

Punctuation:  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
def remove_punctuation(text):
    sent = []
    for t in text.split(' '):
        no_punct = "".join([c for c in t if c not in string.punctuation ])
        sent.append(no_punct)
    
    sentence = " ".join(s for s in sent)
    return sentence

In [6]:
rmv_punc_sentence = remove_punctuation(processed_text)
print(rmv_punc_sentence)

What is nlp  
Natural Language Processing or NLP for short is broadly defined as the automatic manipulation of natural language like speech and text by software
The study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers
In this post you will discover what natural language processing is and why it is so important
After reading this post you will know  What natural language is and how it is different from other types of data


In [7]:
lower_sentence = rmv_punc_sentence.lower()
print(lower_sentence)

what is nlp  
natural language processing or nlp for short is broadly defined as the automatic manipulation of natural language like speech and text by software
the study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers
in this post you will discover what natural language processing is and why it is so important
after reading this post you will know  what natural language is and how it is different from other types of data


### 3. Lemmatization & Tokenization with spacy library

In [8]:
## using "spacy" library
import spacy

## Load the installed model "en_core_web_sm" into "nlp"
nlp = spacy.load('en_core_web_sm')

In [9]:
## 'doc' is a sequence of Token objects 
## it holds all information about the tokens, their linguistic features and their relationships.
doc = nlp(lower_sentence.strip())

In [11]:
tok_lem_sentence = [(token.text, token.lemma_) for token in doc ]
tok_lem_sentence[:15]

[('what', 'what'),
 ('is', 'be'),
 ('nlp', 'nlp'),
 (' \n', ' \n'),
 ('natural', 'natural'),
 ('language', 'language'),
 ('processing', 'processing'),
 ('or', 'or'),
 ('nlp', 'nlp'),
 ('for', 'for'),
 ('short', 'short'),
 ('is', 'be'),
 ('broadly', 'broadly'),
 ('defined', 'define'),
 ('as', 'as')]

In [12]:
tok_lem_sentence = [token.lemma_ for token in doc]
tok_lem_sentence[:15]

['what',
 'be',
 'nlp',
 ' \n',
 'natural',
 'language',
 'processing',
 'or',
 'nlp',
 'for',
 'short',
 'be',
 'broadly',
 'define',
 'as']

### 4. Remove stop words(불용어: 큰 의미가 없는 단어)

In [13]:
# if you do not have 'stopwords' then run the below statement.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/chu-
[nltk_data]     ingyu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
from nltk.corpus import stopwords

In [17]:
stop_words = set(stopwords.words('english'))

print(tok_lem_sentence, '\n')
rmv_sw_sentence = [w for w in tok_lem_sentence if not w in stop_words]
print(rmv_sw_sentence)
removed_word = [word for word in tok_lem_sentence if not word in rmv_sw_sentence]
print("\nRemoved word: ", set(removed_word))

['what', 'be', 'nlp', ' \n', 'natural', 'language', 'processing', 'or', 'nlp', 'for', 'short', 'be', 'broadly', 'define', 'as', 'the', 'automatic', 'manipulation', 'of', 'natural', 'language', 'like', 'speech', 'and', 'text', 'by', 'software', '\n', 'the', 'study', 'of', 'natural', 'language', 'processing', 'have', 'be', 'around', 'for', 'more', 'than', '50', 'year', 'and', 'grow', 'out', 'of', 'the', 'field', 'of', 'linguistic', 'with', 'the', 'rise', 'of', 'computer', '\n', 'in', 'this', 'post', 'you', 'will', 'discover', 'what', 'natural', 'language', 'processing', 'be', 'and', 'why', 'it', 'be', 'so', 'important', '\n', 'after', 'read', 'this', 'post', 'you', 'will', 'know', ' ', 'what', 'natural', 'language', 'be', 'and', 'how', 'it', 'be', 'different', 'from', 'other', 'type', 'of', 'datum'] 

['nlp', ' \n', 'natural', 'language', 'processing', 'nlp', 'short', 'broadly', 'define', 'automatic', 'manipulation', 'natural', 'language', 'like', 'speech', 'text', 'software', '\n', 'stu

### 5. Make dictionary 

In [None]:
import numpy as np

dictionary = {}

def make_frequency_dict(text):
    """
    WRITE THE CODE
    """
            
make_frequency_dict(rmv_sw_sentence)

In [None]:
len(dictionary)

32

In [None]:
dictionary

In [None]:
vocab_sorted = sorted(dictionary.items(), key=lambda x:x[1], reverse = True)
vocab_sorted

In [None]:
word_to_index = {}
i = 0

for (word, frequency) in vocab_sorted :
    """
    WRITE THE CODE
    """
    
print(word_to_index)

In [None]:
"""
WRITE THE CODE
"""

print(word_to_index)

### 6. Encoding

In [None]:
encoded = []

print(rmv_sw_sentence)

for w in rmv_sw_sentence:
    """
    WRITE THE CODE
    """
print(encoded)

## THE END 🌟