<a href="https://colab.research.google.com/github/GouthamVicky/TextGeneration/blob/main/textpreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Sample paragraph passed to both spacy_process() and nltk_process() below.
Example_Sentence = "Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance on a specific task. Machine learning algorithms build a mathematical model of sample data, known as “training data”, in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a field of study within machine learning and focuses on exploratory data analysis through unsupervised learning. In its application across business problems, machine learning is also referred to as predictive analytics."

In [None]:
# spaCy initialization: load the 'en_core_web_sm' pipeline once at module
# level so that spacy_process() can reuse it through the global `nlp`.
import spacy
nlp = spacy.load('en_core_web_sm')

## Text Preprocessing using spaCy

In [None]:
def spacy_process(text):
    """Tokenize, lemmatize and clean *text* with the spaCy pipeline.

    The text is run through the module-level ``nlp`` pipeline, every token is
    replaced by its lemma, lemmas flagged as stopwords are dropped, and
    tokens made up solely of punctuation characters are removed. The
    intermediate lemma list and the final list are printed.

    Args:
        text: The raw string to preprocess.

    Returns:
        The list of lemmas that remain after stopword and punctuation
        filtering, in their original order.
    """
    doc = nlp(text)

    # Tokenization and lemmatization are done by the spaCy pipeline call above.
    lemma_list = [token.lemma_ for token in doc]
    print("Tokenize+Lemmatize:")
    print(lemma_list)

    # Filter the stopwords: the stop flag is read from the vocabulary entry
    # of the lemma, not from the original token.
    without_stopwords = [
        word for word in lemma_list if not nlp.vocab[word].is_stop
    ]

    # Remove punctuation. A new list is built instead of calling
    # list.remove() while iterating, which skipped the element that followed
    # every removed token. A token is dropped when all of its characters are
    # punctuation, so runs such as "!!" are handled as well as single marks.
    punctuations = "?:!.,;()@#$%^&*}{"
    filtered_sentence = [
        word
        for word in without_stopwords
        if not all(char in punctuations for char in word)
    ]
    print(" ")
    print("Remove stopword & punctuation: ")
    print(filtered_sentence)

    # Hand the result back so callers that assign the return value get the
    # cleaned tokens rather than None.
    return filtered_sentence

In [None]:
# Run the spaCy preprocessing on the sample paragraph; the function prints
# the lemma list and the filtered token list.
spacy_cleaned_text=spacy_process(Example_Sentence)

Tokenize+Lemmatize:
['machine', 'learning', '(', 'ML', ')', 'be', 'the', 'scientific', 'study', 'of', 'algorithm', 'and', 'statistical', 'model', 'that', 'computer', 'system', 'use', 'to', 'progressively', 'improve', 'their', 'performance', 'on', 'a', 'specific', 'task', '.', 'machine', 'learning', 'algorithm', 'build', 'a', 'mathematical', 'model', 'of', 'sample', 'datum', ',', 'know', 'as', '"', 'training', 'datum', '"', ',', 'in', 'order', 'to', 'make', 'prediction', 'or', 'decision', 'without', 'be', 'explicitly', 'program', 'to', 'perform', 'the', 'task', '.', 'machine', 'learning', 'algorithm', 'be', 'use', 'in', 'the', 'application', 'of', 'email', 'filtering', ',', 'detection', 'of', 'network', 'intruder', ',', 'and', 'computer', 'vision', ',', 'where', 'it', 'be', 'infeasible', 'to', 'develop', 'an', 'algorithm', 'of', 'specific', 'instruction', 'for', 'perform', 'the', 'task', '.', 'machine', 'learning', 'be', 'closely', 'relate', 'to', 'computational', 'statistic', ',', 'whi

## Text Preprocessing using NLTK

In [None]:
import nltk
# Fetch the NLTK data packages used by the cells below.
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual WordNet data (presumably needed alongside 'wordnet' - confirm for the installed NLTK version)
nltk.download('stopwords')  # stopword lists read via stopwords.words("english")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

# Module-level Porter stemmer shared by nltk_process().
p_stemmer = PorterStemmer()
# Alternative stemmer (currently disabled):
#s_stemmer = SnowballStemmer(language='english')

def nltk_process(text):
    """Tokenize, stem, lemmatize and clean *text* with NLTK.

    The text is split with ``word_tokenize``, each token is stemmed with the
    module-level ``p_stemmer``, each stem is passed through
    ``WordNetLemmatizer``, English stopwords are dropped, and tokens made up
    solely of punctuation characters are removed. The intermediate and final
    token lists are printed.

    Args:
        text: The raw string to preprocess.

    Returns:
        The list of processed tokens that remain after stopword and
        punctuation filtering, in their original order.
    """
    # Tokenization
    nltk_tokenList = word_tokenize(text)

    # Stemming
    nltk_stemedList = [p_stemmer.stem(word) for word in nltk_tokenList]

    # Lemmatization
    # NOTE(review): the lemmatizer receives stems rather than the original
    # tokens; the order is kept as-is to preserve the existing output.
    wordnet_lemmatizer = WordNetLemmatizer()
    nltk_lemmaList = [
        wordnet_lemmatizer.lemmatize(word) for word in nltk_stemedList
    ]

    print("Stemming + Lemmatization")
    print(nltk_lemmaList)

    # Filter stopwords
    nltk_stop_words = set(stopwords.words("english"))
    without_stopwords = [
        word for word in nltk_lemmaList if word not in nltk_stop_words
    ]

    # Removing punctuation. A new list is built instead of calling
    # list.remove() while iterating, which skipped the element that followed
    # every removed token. A token is dropped when all of its characters are
    # punctuation, so runs such as "..." are handled as well as single marks.
    punctuations = "?:!.,;"
    filtered_sentence = [
        word
        for word in without_stopwords
        if not all(char in punctuations for char in word)
    ]
    print(" ")
    print("Remove stopword & Punctuation")
    print(filtered_sentence)

    # Hand the result back so callers that assign the return value get the
    # cleaned tokens rather than None.
    return filtered_sentence
# Run the NLTK preprocessing on the sample paragraph; the function prints
# the stemmed/lemmatized list and the filtered token list.
nltk_cleaned_text=nltk_process(Example_Sentence)

Stemming + Lemmatization
['machin', 'learn', '(', 'ml', ')', 'is', 'the', 'scientif', 'studi', 'of', 'algorithm', 'and', 'statist', 'model', 'that', 'comput', 'system', 'use', 'to', 'progress', 'improv', 'their', 'perform', 'on', 'a', 'specif', 'task', '.', 'machin', 'learn', 'algorithm', 'build', 'a', 'mathemat', 'model', 'of', 'sampl', 'data', ',', 'known', 'a', '“', 'train', 'data', '”', ',', 'in', 'order', 'to', 'make', 'predict', 'or', 'decis', 'without', 'be', 'explicitli', 'program', 'to', 'perform', 'the', 'task', '.', 'machin', 'learn', 'algorithm', 'are', 'use', 'in', 'the', 'applic', 'of', 'email', 'filter', ',', 'detect', 'of', 'network', 'intrud', ',', 'and', 'comput', 'vision', ',', 'where', 'it', 'is', 'infeas', 'to', 'develop', 'an', 'algorithm', 'of', 'specif', 'instruct', 'for', 'perform', 'the', 'task', '.', 'machin', 'learn', 'is', 'close', 'relat', 'to', 'comput', 'statist', ',', 'which', 'focus', 'on', 'make', 'predict', 'use', 'comput', '.', 'the', 'studi', 'of',