# Natural Language Processing

In [1]:
text = "The design thinking process should not be seen as a concrete and inflexible approach to design; the component stages identified should serve as a guide to the activities you carry out. The stages might be switched, conducted concurrently or repeated several times to gain the most informative insights about your users, expand the solution space and hone in on innovative solutions. This is one of the main benefits of the five-stage model. Knowledge acquired in the latter stages of the process can inform repeats of earlier stages. Information is continually used to inform the understanding of the problem and solution spaces, and to redefine the problem itself. This creates a perpetual loop, in which the designers continue to gain new insights, develop new ways to view the product (or service) and its possible uses and develop a far more profound understanding of their real users and the problems they face."
text

'The design thinking process should not be seen as a concrete and inflexible approach to design; the component stages identified should serve as a guide to the activities you carry out. The stages might be switched, conducted concurrently or repeated several times to gain the most informative insights about your users, expand the solution space and hone in on innovative solutions. This is one of the main benefits of the five-stage model. Knowledge acquired in the latter stages of the process can inform repeats of earlier stages. Information is continually used to inform the understanding of the problem and solution spaces, and to redefine the problem itself. This creates a perpetual loop, in which the designers continue to gain new insights, develop new ways to view the product (or service) and its possible uses and develop a far more profound understanding of their real users and the problems they face.'

## Segmentation

In [4]:
# import
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/dell/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Split text into sentences
sentences = sent_tokenize(text)
sentences

['The design thinking process should not be seen as a concrete and inflexible approach to design; the component stages identified should serve as a guide to the activities you carry out.',
 'The stages might be switched, conducted concurrently or repeated several times to gain the most informative insights about your users, expand the solution space and hone in on innovative solutions.',
 'This is one of the main benefits of the five-stage model.',
 'Knowledge acquired in the latter stages of the process can inform repeats of earlier stages.',
 'Information is continually used to inform the understanding of the problem and solution spaces, and to redefine the problem itself.',
 'This creates a perpetual loop, in which the designers continue to gain new insights, develop new ways to view the product (or service) and its possible uses and develop a far more profound understanding of their real users and the problems they face.']

In [8]:
# Punctuation removal
import re

# Remove punctuation characters
text = []
for item in sentences:
    sentence= re.sub(r"[^a-zA-Z0-9]", " ", item)
    text.append(sentence)

text

['The design thinking process should not be seen as a concrete and inflexible approach to design  the component stages identified should serve as a guide to the activities you carry out ',
 'The stages might be switched  conducted concurrently or repeated several times to gain the most informative insights about your users  expand the solution space and hone in on innovative solutions ',
 'This is one of the main benefits of the five stage model ',
 'Knowledge acquired in the latter stages of the process can inform repeats of earlier stages ',
 'Information is continually used to inform the understanding of the problem and solution spaces  and to redefine the problem itself ',
 'This creates a perpetual loop  in which the designers continue to gain new insights  develop new ways to view the product  or service  and its possible uses and develop a far more profound understanding of their real users and the problems they face ']

## Tokenization

In [9]:
from nltk.tokenize import word_tokenize

In [17]:
tokens = []
for item in text:
    words = word_tokenize(item)
    tokens += words

print(tokens)

['The', 'design', 'thinking', 'process', 'should', 'not', 'be', 'seen', 'as', 'a', 'concrete', 'and', 'inflexible', 'approach', 'to', 'design', 'the', 'component', 'stages', 'identified', 'should', 'serve', 'as', 'a', 'guide', 'to', 'the', 'activities', 'you', 'carry', 'out', 'The', 'stages', 'might', 'be', 'switched', 'conducted', 'concurrently', 'or', 'repeated', 'several', 'times', 'to', 'gain', 'the', 'most', 'informative', 'insights', 'about', 'your', 'users', 'expand', 'the', 'solution', 'space', 'and', 'hone', 'in', 'on', 'innovative', 'solutions', 'This', 'is', 'one', 'of', 'the', 'main', 'benefits', 'of', 'the', 'five', 'stage', 'model', 'Knowledge', 'acquired', 'in', 'the', 'latter', 'stages', 'of', 'the', 'process', 'can', 'inform', 'repeats', 'of', 'earlier', 'stages', 'Information', 'is', 'continually', 'used', 'to', 'inform', 'the', 'understanding', 'of', 'the', 'problem', 'and', 'solution', 'spaces', 'and', 'to', 'redefine', 'the', 'problem', 'itself', 'This', 'creates',

## Removal of Stop Words

In [14]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/dell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Remove stop words
words = [w for w in tokens if w not in stopwords.words("english")]
print(words)

['The', 'design', 'thinking', 'process', 'seen', 'concrete', 'inflexible', 'approach', 'design', 'component', 'stages', 'identified', 'serve', 'guide', 'activities', 'carry', 'The', 'stages', 'might', 'switched', 'conducted', 'concurrently', 'repeated', 'several', 'times', 'gain', 'informative', 'insights', 'users', 'expand', 'solution', 'space', 'hone', 'innovative', 'solutions', 'This', 'one', 'main', 'benefits', 'five', 'stage', 'model', 'Knowledge', 'acquired', 'latter', 'stages', 'process', 'inform', 'repeats', 'earlier', 'stages', 'Information', 'continually', 'used', 'inform', 'understanding', 'problem', 'solution', 'spaces', 'redefine', 'problem', 'This', 'creates', 'perpetual', 'loop', 'designers', 'continue', 'gain', 'new', 'insights', 'develop', 'new', 'ways', 'view', 'product', 'service', 'possible', 'uses', 'develop', 'far', 'profound', 'understanding', 'real', 'users', 'problems', 'face']


## Stemming and Lemmatization

In [19]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/dell/nltk_data...
[nltk_data] Downloading package omw-1.4 to /home/dell/nltk_data...


True

In [20]:
# Stemming
from nltk.stem.porter import PorterStemmer

# Reduce words to their stems
stemmed = [PorterStemmer().stem(w) for w in words]
print(stemmed)

['the', 'design', 'think', 'process', 'seen', 'concret', 'inflex', 'approach', 'design', 'compon', 'stage', 'identifi', 'serv', 'guid', 'activ', 'carri', 'the', 'stage', 'might', 'switch', 'conduct', 'concurr', 'repeat', 'sever', 'time', 'gain', 'inform', 'insight', 'user', 'expand', 'solut', 'space', 'hone', 'innov', 'solut', 'thi', 'one', 'main', 'benefit', 'five', 'stage', 'model', 'knowledg', 'acquir', 'latter', 'stage', 'process', 'inform', 'repeat', 'earlier', 'stage', 'inform', 'continu', 'use', 'inform', 'understand', 'problem', 'solut', 'space', 'redefin', 'problem', 'thi', 'creat', 'perpetu', 'loop', 'design', 'continu', 'gain', 'new', 'insight', 'develop', 'new', 'way', 'view', 'product', 'servic', 'possibl', 'use', 'develop', 'far', 'profound', 'understand', 'real', 'user', 'problem', 'face']


In [21]:
# Lemmatize
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form
lemmatized = [WordNetLemmatizer().lemmatize(w) for w in words]
print(lemmatized)

['The', 'design', 'thinking', 'process', 'seen', 'concrete', 'inflexible', 'approach', 'design', 'component', 'stage', 'identified', 'serve', 'guide', 'activity', 'carry', 'The', 'stage', 'might', 'switched', 'conducted', 'concurrently', 'repeated', 'several', 'time', 'gain', 'informative', 'insight', 'user', 'expand', 'solution', 'space', 'hone', 'innovative', 'solution', 'This', 'one', 'main', 'benefit', 'five', 'stage', 'model', 'Knowledge', 'acquired', 'latter', 'stage', 'process', 'inform', 'repeat', 'earlier', 'stage', 'Information', 'continually', 'used', 'inform', 'understanding', 'problem', 'solution', 'space', 'redefine', 'problem', 'This', 'creates', 'perpetual', 'loop', 'designer', 'continue', 'gain', 'new', 'insight', 'develop', 'new', 'way', 'view', 'product', 'service', 'possible', 'us', 'develop', 'far', 'profound', 'understanding', 'real', 'user', 'problem', 'face']


## Part of Speech Tagging

In [22]:
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/dell/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/dell/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [23]:
from nltk import pos_tag

In [25]:
# Tag each word with part of speech
pos_tag(words)

[('The', 'DT'),
 ('design', 'NN'),
 ('thinking', 'VBG'),
 ('process', 'NN'),
 ('seen', 'VBN'),
 ('concrete', 'NN'),
 ('inflexible', 'JJ'),
 ('approach', 'NN'),
 ('design', 'NN'),
 ('component', 'JJ'),
 ('stages', 'NNS'),
 ('identified', 'VBN'),
 ('serve', 'VBP'),
 ('guide', 'JJ'),
 ('activities', 'NNS'),
 ('carry', 'VBP'),
 ('The', 'DT'),
 ('stages', 'NNS'),
 ('might', 'MD'),
 ('switched', 'VB'),
 ('conducted', 'VBN'),
 ('concurrently', 'RB'),
 ('repeated', 'VBN'),
 ('several', 'JJ'),
 ('times', 'NNS'),
 ('gain', 'VBP'),
 ('informative', 'JJ'),
 ('insights', 'NNS'),
 ('users', 'NNS'),
 ('expand', 'VBP'),
 ('solution', 'NN'),
 ('space', 'NN'),
 ('hone', 'NN'),
 ('innovative', 'JJ'),
 ('solutions', 'NNS'),
 ('This', 'DT'),
 ('one', 'CD'),
 ('main', 'JJ'),
 ('benefits', 'NNS'),
 ('five', 'CD'),
 ('stage', 'NN'),
 ('model', 'NN'),
 ('Knowledge', 'NNP'),
 ('acquired', 'VBD'),
 ('latter', 'JJ'),
 ('stages', 'NNS'),
 ('process', 'NN'),
 ('inform', 'NN'),
 ('repeats', 'VBZ'),
 ('earlier', 'JJR

## Named Entity Recognition

In [26]:
from nltk import ne_chunk
nltk.download('words')

[nltk_data] Downloading package words to /home/dell/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [28]:
ner_tree = ne_chunk(pos_tag(lemmatized))
print(ner_tree)

(S
  The/DT
  design/NN
  thinking/VBG
  process/NN
  seen/VBN
  concrete/NN
  inflexible/JJ
  approach/NN
  design/NN
  component/JJ
  stage/NN
  identified/VBD
  serve/JJ
  guide/NN
  activity/NN
  carry/VBP
  The/DT
  stage/NN
  might/MD
  switched/VB
  conducted/VBN
  concurrently/RB
  repeated/VBN
  several/JJ
  time/NN
  gain/NN
  informative/JJ
  insight/NN
  user/NN
  expand/NN
  solution/NN
  space/NN
  hone/NN
  innovative/JJ
  solution/NN
  This/DT
  one/CD
  main/JJ
  benefit/NN
  five/CD
  stage/NN
  model/NN
  (PERSON Knowledge/NNP)
  acquired/VBD
  latter/JJ
  stage/NN
  process/NN
  inform/NN
  repeat/NN
  earlier/RBR
  stage/NN
  Information/NNP
  continually/RB
  used/VBD
  inform/NN
  understanding/NN
  problem/NN
  solution/NN
  space/NN
  redefine/NN
  problem/NN
  This/DT
  creates/VBZ
  perpetual/JJ
  loop/NN
  designer/NN
  continue/VBP
  gain/VB
  new/JJ
  insight/JJ
  develop/VB
  new/JJ
  way/NN
  view/NN
  product/NN
  service/NN
  possible/JJ
  us/PRP
  dev