# There are three pre-processing steps before feeding the speech/sentences into the algorithm for training and testing

In [41]:
# nltk.download() # run this line (after `import nltk`) if you don't have the NLTK data packages yet
import nltk
import re
from nltk.stem import PorterStemmer # need for stemm process
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer # need for lemmatization process


In [42]:
# Sample paragraph that the sentence and word tokenization cells operate on.
# The line breaks and indentation inside the triple-quoted literal are part of the string value.
paragraph =  """Human beings have an innate inner drive to be autonomous, self-determined and connected to one another.
                And, when that drive is liberated, people achieve more and live richer lives. 
                The stars will never align, and the traffic lights of life will never all be green at the same time. 
                The universe doesn't conspire against you, but it doesn't go out of its way to line up the pins either. 
                Conditions are never perfect.
                'Someday' is a disease that will take your dreams to the grave with you. 
                Pro and con lists are just as bad. 
                If it's important to you and you want to do it 'eventually,' just do it and correct course along the way."""

# Tokenization (Step-1)

In [43]:
# Sentence tokenization: split the paragraph into a list of sentence strings.
sentences = nltk.sent_tokenize(paragraph, language='english')
print(sentences)  # display the resulting list of sentences

['Human beings have an innate inner drive to be autonomous, self-determined and connected to one another.', 'And, when that drive is liberated, people achieve more and live richer lives.', 'The stars will never align, and the traffic lights of life will never all be green at the same time.', "The universe doesn't conspire against you, but it doesn't go out of its way to line up the pins either.", 'Conditions are never perfect.', "'Someday' is a disease that will take your dreams to the grave with you.", 'Pro and con lists are just as bad.', "If it's important to you and you want to do it 'eventually,' just do it and correct course along the way."]


In [44]:
# Word tokenization: split the paragraph into a flat list of word and punctuation tokens.
words = nltk.word_tokenize(paragraph, language='english')
print(words)  # display the resulting list of tokens

['Human', 'beings', 'have', 'an', 'innate', 'inner', 'drive', 'to', 'be', 'autonomous', ',', 'self-determined', 'and', 'connected', 'to', 'one', 'another', '.', 'And', ',', 'when', 'that', 'drive', 'is', 'liberated', ',', 'people', 'achieve', 'more', 'and', 'live', 'richer', 'lives', '.', 'The', 'stars', 'will', 'never', 'align', ',', 'and', 'the', 'traffic', 'lights', 'of', 'life', 'will', 'never', 'all', 'be', 'green', 'at', 'the', 'same', 'time', '.', 'The', 'universe', 'does', "n't", 'conspire', 'against', 'you', ',', 'but', 'it', 'does', "n't", 'go', 'out', 'of', 'its', 'way', 'to', 'line', 'up', 'the', 'pins', 'either', '.', 'Conditions', 'are', 'never', 'perfect', '.', "'Someday", "'", 'is', 'a', 'disease', 'that', 'will', 'take', 'your', 'dreams', 'to', 'the', 'grave', 'with', 'you', '.', 'Pro', 'and', 'con', 'lists', 'are', 'just', 'as', 'bad', '.', 'If', 'it', "'s", 'important', 'to', 'you', 'and', 'you', 'want', 'to', 'do', 'it', "'eventually", ',', "'", 'just', 'do', 'it', 

# Stemming or Lemmatization (Step-2)

In [62]:
# Stemming: clean every sentence and reduce each remaining word to its stem.

stemmer = PorterStemmer()

# Build the English stop-word set once, instead of rebuilding it for every word.
stop_words = set(stopwords.words('english'))

# Cleaned sentences are collected here so that the cleaned text can later be
# compared with the original paragraph.
corpus = []
for sentence in sentences:
    review = re.sub(r'[^a-zA-Z]', ' ', sentence)  # replace every non-letter character with a space
    review = review.lower()  # lower-case all letters
    # A plain whitespace split is used for tokenization here, since the
    # punctuation has already been replaced by spaces.
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stop_words]  # drop stop words, stem the rest
    review = ' '.join(review)
    print(review)
    corpus.append(review)
    
    

# Observe the stemming output carefully: many of the stems are not proper, meaningful words.
# This is a drawback of stemming. It can be solved by using lemmatization instead of stemming.
# Stemming still has good applications, such as spam mail detection, and it takes less time than lemmatization.

human innat inner drive autonom self determin connect one anoth
drive liber peopl achiev live richer life
star never align traffic light life never green time
univers n conspir n go way line pin either
condit never perfect
someday diseas take dream grave
pro con list bad
import want eventu correct cours along way


In [65]:
# Lemmatization: clean every sentence and reduce each remaining word to its lemma.

lemmatizer = WordNetLemmatizer()

# Build the English stop-word set once, instead of rebuilding it for every word.
stop_words = set(stopwords.words('english'))

# Cleaned sentences are collected here so that the cleaned text can later be
# compared with the original paragraph. This rebinds `corpus` to a new list.
corpus = []
for sentence in sentences:
    review = re.sub(r'[^a-zA-Z]', ' ', sentence)  # replace every non-letter character with a space
    review = review.lower()  # lower-case all letters
    # A plain whitespace split is used for tokenization here, since the
    # punctuation has already been replaced by spaces.
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]  # drop stop words, lemmatize the rest
    review = ' '.join(review)
    print(review)
    corpus.append(review)
    
# Observe the lemmatizer output carefully: most of the words are proper, meaningful words.
# Try to compare the output of the two processes.
# Lemmatization has good applications such as Google Home, Alexa, Amazon Echo and Siri, which need to process and understand language,
# but it takes longer than stemming.

human innate inner drive autonomous self determined connected one another
drive liberated people achieve live richer life
star never align traffic light life never green time
universe n conspire n go way line pin either
condition never perfect
someday disease take dream grave
pro con list bad
important want eventually correct course along way


# Bag of Words or TF-IDF(Step-3)

In [66]:
# Bag of Words model: convert the cleaned corpus into a matrix of word counts,
# keeping at most 1500 vocabulary terms.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)
bow_sparse = cv.fit_transform(corpus)  # learn the vocabulary and count the terms in one pass
X = bow_sparse.toarray()  # dense array form of the count matrix
# Bag of Words has a drawback: the model gives the same importance to all the words.
# To overcome this drawback, the TF-IDF model is used.

In [67]:
# TF-IDF model: convert the cleaned corpus into a matrix of TF-IDF weights.
# NOTE: this rebinds `cv` and `X`.
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
tfidf_sparse = cv.fit_transform(corpus)  # learn vocabulary and IDF, then weight the terms
X = tfidf_sparse.toarray()  # dense array form of the TF-IDF matrix