#Spacy



In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 70 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


#Explacy.py

In [None]:
# -*- coding: utf-8 -*-
#
# Box-drawing characters are the thin variants, and can be found here:
# https://en.wikipedia.org/wiki/Box-drawing_character
#
""" explacy.py

    This module uses unicode box-drawing characters to draw the spacy-derived
    dependency tree of whichever (unicode) string you provide as input.

    Usage:

        import explacy
        import spacy

        nlp = spacy.load('en_core_web_sm')

        explacy.print_parse_info(nlp, 'The salad was surprisingly tasty.')

        # Use a unicode string as input (eg u'The dog jumped.') in Python 2.

    Example tree rendering:

        Dep tree Token        Dep type Lemma        Part of Sp
        ──────── ──────────── ──────── ──────────── ──────────
            ┌─►  The          det      the          DET
         ┌─►└──  salad        nsubj    salad        NOUN
        ┌┼─────  was          ROOT     be           VERB
        ││  ┌─►  surprisingly advmod   surprisingly ADV
        │└─►└──  tasty        acomp    tasty        ADJ
        └─────►  .            punct    .            PUNCT
"""

import sys
from collections import defaultdict

from pprint import pprint

_do_print_debug_info = False

def _print_table(rows):
    """ Print `rows` as a left-aligned, space-separated text table.

        `rows` is a list of equal-length rows; the first row is treated as the
        header and a rule of box-drawing dashes is printed directly below it.
        Every cell is converted with str() before its width is measured, so
        cells do not need to support len(). The caller's list is left
        untouched, and an empty `rows` prints nothing.
    """
    # Work on a stringified copy so the argument is never mutated.
    table = [[str(cell) for cell in row] for row in rows]
    if not table:
        return

    # Each column is as wide as its widest cell (header included).
    col_widths = [max(len(cell) for cell in col) for col in zip(*table)]
    fmt = ' '.join('%%-%ds' % width for width in col_widths)
    rule = ['─' * width for width in col_widths]

    for row in table[:1] + [rule] + table[1:]:
        # Uncomment this version to see code points printed out (for debugging).
        # print(list(map(hex, map(ord, list(fmt % tuple(row))))))
        print(fmt % tuple(row))

def _start_end(arrow):
    """ Return `(start, end, low, high)` for an arrow dict.

        `start` and `end` are the `.i` attributes of the arrow's 'from' and
        'to' entries; `low` and `high` are the same two values in ascending
        order.
    """
    src_i = arrow['from'].i
    dst_i = arrow['to'].i
    if src_i <= dst_i:
        low, high = src_i, dst_i
    else:
        low, high = dst_i, src_i
    return src_i, dst_i, low, high

def print_parse_info(nlp, sent):
    """ Print the dependency tree of `sent` (sentence), along with the lemmas
        (de-inflected forms) and parts-of-speech of the words.

        The input `sent` is expected to be a unicode string (of type unicode in
        Python 2; of type str in Python 3). The input `nlp` (for natural
        language parser) is expected to be the return value from a call to
        spacy.load(), in other words, it's the callable instance of a spacy
        language model.

        Nothing is returned; the table is written to stdout. An input that
        parses to zero tokens prints only the header and separator rows.
        An AssertionError is raised if `sent` is not a unicode string.
    """

    unicode_type = unicode if sys.version_info[0] < 3 else str
    assert type(sent) is unicode_type

    # Parse our sentence.
    doc = nlp(sent)

    # Build the arrows.

    # Set the from and to tokens for each arrow: one arrow per (head, child)
    # pair. 'underset' will hold the indices of the arrows this one is drawn
    # over, i.e. the arrows that must be rendered before it.
    arrows = [{'from': src, 'to': dst, 'underset': set()}
              for src in doc
              for dst in src.children]

    # Set the base height; these may increase to allow room for arrowheads after this.
    # arrows_with_deps maps "number of not-yet-rendered arrows underneath" to
    # the set of arrow indices currently in that state.
    arrows_with_deps = defaultdict(set)
    for i, arrow in enumerate(arrows):
        if _do_print_debug_info:
            print('Arrow %d: "%s" -> "%s"' % (i, arrow['from'], arrow['to']))
        num_deps = 0
        start, end, mn, mx = _start_end(arrow)
        for j, other in enumerate(arrows):
            if arrow is other:
                continue
            o_start, o_end, o_mn, o_mx = _start_end(other)
            # Arrow i is over arrow j when j shares i's source and ends inside
            # i's span, or has a different source that lies inside i's span.
            if ((start == o_start and mn <= o_end <= mx) or
                (start != o_start and mn <= o_start <= mx)):
                num_deps += 1
                if _do_print_debug_info:
                    print('%d is over %d' % (i, j))
                arrow['underset'].add(j)
        arrow['num_deps_left'] = arrow['num_deps'] = num_deps
        arrows_with_deps[num_deps].add(i)

    if _do_print_debug_info:
        print('')
        print('arrows:')
        pprint(arrows)

        print('')
        print('arrows_with_deps:')
        pprint(arrows_with_deps)

    # Render the arrows in characters. Some heights will be raised to make room for arrowheads.

    # lines[k] is the list of drawing cells for token k, ordered from the cell
    # nearest the token outwards (each list is reversed before printing). A
    # cell is either a literal character or a set of the compass directions
    # ('n', 'e', 's', 'w') that the box-drawing character must connect.
    lines = [[] for _ in doc]
    num_arrows_left = len(arrows)
    while num_arrows_left > 0:

        # There must always be an arrow with nothing left to draw beneath it.
        assert len(arrows_with_deps[0])

        arrow_index = arrows_with_deps[0].pop()
        arrow = arrows[arrow_index]
        src, dst, mn, mx = _start_end(arrow)

        # Check the height needed: at least 3, one more than the tallest arrow
        # underneath, and 3 beyond whatever is already drawn on the dst line.
        height = 3
        if arrow['underset']:
            height = max(arrows[i]['height'] for i in arrow['underset']) + 1
        height = max(height, 3, len(lines[dst]) + 3)
        arrow['height'] = height

        if _do_print_debug_info:
            print('')
            print('Rendering arrow %d: "%s" -> "%s"' % (arrow_index,
                                                        arrow['from'],
                                                        arrow['to']))
            print('  height = %d' % height)

        goes_up = src > dst

        # Draw the outgoing src line.
        if lines[src] and len(lines[src]) < height:
            lines[src][-1].add('w')
        while len(lines[src]) < height - 1:
            lines[src].append({'e', 'w'})
        if len(lines[src]) < height:
            lines[src].append({'e'})
        lines[src][height - 1].add('n' if goes_up else 's')

        # Draw the incoming dst line.
        lines[dst].append(u'►')
        while len(lines[dst]) < height:
            lines[dst].append({'e', 'w'})
        lines[dst][-1] = {'e', 's'} if goes_up else {'e', 'n'}

        # Draw the adjoining vertical line.
        for i in range(mn + 1, mx):
            while len(lines[i]) < height - 1:
                lines[i].append(' ')
            lines[i].append({'n', 's'})

        # Update arrows_with_deps: every arrow drawn over the one just
        # rendered has one fewer arrow left to wait for.
        for arr_i, arr in enumerate(arrows):
            if arrow_index in arr['underset']:
                arrows_with_deps[arr['num_deps_left']].remove(arr_i)
                arr['num_deps_left'] -= 1
                arrows_with_deps[arr['num_deps_left']].add(arr_i)

        num_arrows_left -= 1

    # Keys are the sorted, concatenated direction letters of a cell.
    arr_chars = {'ew'  : u'─',
                 'ns'  : u'│',
                 'en'  : u'└',
                 'es'  : u'┌',
                 'ens' : u'├',
                 'enw' : u'┴',
                 'ensw': u'┼',
                 'esw' : u'┬'}

    # Convert the character lists into strings.
    # `or [0]` keeps max() from raising when the parse produced no tokens.
    max_len = max([len(line) for line in lines] or [0])
    for i, line in enumerate(lines):
        chars = [arr_chars[''.join(sorted(ch))] if isinstance(ch, set) else ch
                 for ch in line]
        text = ''.join(reversed(chars))
        # Left-pad so every tree column has the same width.
        lines[i] = ' ' * (max_len - len(text)) + text

    # Compile full table to print out.
    rows = [['Dep tree', 'Token', 'Dep type', 'Lemma', 'Part of Sp']]
    for i, token in enumerate(doc):
        rows.append([lines[i], token, token.dep_, token.lemma_, token.pos_])
    _print_table(rows)


#Import Python Libraries

In [None]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

import matplotlib.pyplot as plt
%matplotlib inline

#Performing POS tagging, in spaCy, is a cakewalk:

- POS tagging is :

 - the task of automatically assigning POS tags to all the words of a sentence. It is helpful in various downstream tasks in NLP, such as feature engineering, language understanding, and information extraction.

In [None]:
# Create an nlp object
nlp = spacy.load('en_core_web_sm')

In [None]:
doc = nlp("chandan learned rapidly because first training ")

In [None]:
# Iterate over the tokens
for token in doc:
    # Print the token and its part-of-speech tag
    print(token.text, "-->", token.pos_)

chandan --> PROPN
learned --> VERB
rapidly --> ADV
because --> SCONJ
first --> ADJ
training --> NOUN


In [None]:
"""file_name = '/content/oneplus_comment_file.txt' 
introduction_file_text = open(file_name).read()
introduction_file_doc = nlp(introduction_file_text)
# Extract tokens for the given doc
print ([token.text for token in introduction_file_doc])
"""

"file_name = '/content/oneplus_comment_file.txt' \nintroduction_file_text = open(file_name).read()\nintroduction_file_doc = nlp(introduction_file_text)\n# Extract tokens for the given doc\nprint ([token.text for token in introduction_file_doc])\n"

In [None]:
file_name = '/content/oneplus.txt'
# Read the review text inside a context manager so the file handle is closed
# as soon as the contents are in memory. The encoding is spelled out so the
# result does not depend on the platform default
# (assumes the file is UTF-8 — TODO confirm).
with open(file_name, encoding='utf-8') as review_file:
    doc = review_file.read()
# Run the spaCy pipeline over the raw text and collect its sentence spans.
about_doc = nlp(doc)
sentences = list(about_doc.sents)
len(sentences)

79

In [None]:
for sentence in sentences:
  print (sentence)

1.
Battery is worse , you cannot keep on carrying a charger along with you if you are going outdoor for full-day, even at standby mode also battery keeps on draining, pathetic battery life.

2.
I odont know about other one plus phone as this is my first one, so I will say a proper 10000rs cellphone has a better front camera as compared to this one.
Rear camera with 50mp is of low quality, you cannot get details of any pic if you zoom it after capturing.
2mp ultrawide camera is useless , low quality images.


Worst thing is it's battery won't last for even half day, it's good that it has fast charger BUT who carries charger with them all day, atleast One Plus could have increased the battery capacity.


One last thing, please STOP collaborating with OPPO to manufacture cellphones or else you will end up like Nokia in next two three years.


All in all, this is the worst cellphone ever manufactured by One Plus.
Even the side casing is of cheap quality plastic.



Camera doesn't Start whe

#1.Tokenization in spaCy


In [None]:
for token in about_doc:
  print (token, token.idx)


1 0
. 1
Battery 3
is 11
worse 14
, 20
you 22
can 26
not 29
keep 33
on 38
carrying 41
a 50
charger 52
along 60
with 66
you 71
if 75
you 78
are 82
going 86
outdoor 92
for 100
full 104
- 108
day 109
, 112
even 114
at 119
standby 122
mode 130
also 135
battery 140
keeps 148
on 154
draining 157
, 165
pathetic 167
battery 176
life 184
. 188

 189
2 190
. 191
I 193
odont 195
know 201
about 206
other 212
one 218
plus 222
phone 227
as 233
this 236
is 241
my 244
first 247
one 253
, 256
so 258
I 261
will 263
say 268
a 272
proper 274
10000rs 281
cellphone 289
has 299
a 303
better 305
front 312
camera 318
as 325
compared 328
to 337
this 340
one 345
. 348
Rear 350
camera 355
with 362
50mp 367
is 372
of 375
low 378
quality 382
, 389
you 391
can 395
not 398
get 402
details 406
of 414
any 417
pic 421
if 425
you 428
zoom 432
it 437
after 440
capturing 446
. 455
2mp 457
ultrawide 461
camera 471
is 478
useless 481
, 489
low 491
quality 495
images 503
. 509


 510
Worst 512
thing 518
is 524
it 527
's 529
ba

In [None]:
for token in about_doc:
  print (token, token.idx, token.text_with_ws,
         token.is_alpha, token.is_punct, token.is_space,
         token.shape_, token.is_stop)

1 0 1 False False False d False
. 1 .  False True False . False
Battery 3 Battery  True False False Xxxxx False
is 11 is  True False False xx True
worse 14 worse  True False False xxxx False
, 20 ,  False True False , False
you 22 you  True False False xxx True
can 26 can True False False xxx True
not 29 not  True False False xxx True
keep 33 keep  True False False xxxx True
on 38 on  True False False xx True
carrying 41 carrying  True False False xxxx False
a 50 a  True False False x True
charger 52 charger  True False False xxxx False
along 60 along  True False False xxxx True
with 66 with  True False False xxxx True
you 71 you  True False False xxx True
if 75 if  True False False xx True
you 78 you  True False False xxx True
are 82 are  True False False xxx True
going 86 going  True False False xxxx False
outdoor 92 outdoor  True False False xxxx False
for 100 for  True False False xxx True
full 104 full True False False xxxx True
- 108 - False True False - False
day 109 day True Fa

- text_with_ws prints token text with trailing space (if present).
- is_alpha detects if the token consists of alphabetic characters or not.
- is_punct detects if the token is a punctuation symbol or not.
- is_space detects if the token is a space or not.
- shape_ prints out the shape of the word.
- is_stop detects if the token is a stop word or not.

#2. Dependency Parsing using spaCy

Every sentence has a grammatical structure to it and with the help of dependency parsing, we can extract this structure. It can also be thought of as a directed graph, where nodes correspond to the words in the sentence and the edges between the nodes are the corresponding dependencies between the words.

In [None]:

import spacy 
nlp = spacy.load('en_core_web_sm')

# Create an nlp object
doc = nlp("He went to play basketball")
 
# Iterate over the tokens
for token in doc:
    # Print the token and its part-of-speech tag
    print(token.text, "-->", token.pos_)

He --> PRON
went --> VERB
to --> PART
play --> VERB
basketball --> NOUN


In [None]:
# dependency parsing
for token in doc:
    print(token.text, "-->", token.dep_)

He --> nsubj
went --> ROOT
to --> aux
play --> advcl
basketball --> dobj


The dependency tag ROOT denotes the main verb or action in the sentence. The other words are directly or indirectly connected to the ROOT word of the sentence. 

In [None]:
spacy.explain("nsubj"), spacy.explain("ROOT"), spacy.explain("aux"), spacy.explain("advcl"), spacy.explain("dobj")


('nominal subject',
 None,
 'auxiliary',
 'adverbial clause modifier',
 'direct object')

#3. Named Entity Recognition using spaCy

Entities are the words or groups of words that represent information about common things such as persons, locations, organizations, etc. These entities have proper names.

In [None]:
doc = nlp("Indians spent over $71 billion on clothes in 2018")
 
for ent in doc.ents:
    print(ent.text, ent.label_)

Indians NORP
over $71 billion MONEY
2018 DATE


In [None]:
spacy.explain("NORP")

'Nationalities or religious or political groups'

#4. Rule-Based Matching using spaCy !!!!

Rule-based matching is a new addition to spaCy’s arsenal. With this spaCy matcher, you can find words and phrases in the text using user-defined rules.

In [None]:
import spacy
# Import spaCy Matcher
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')

# Initialize the matcher with the spaCy vocabulary
matcher = Matcher(nlp.vocab)

doc = nlp("Some people start their day with lemon water")

# Define rule: two consecutive tokens whose exact text is "lemon" then "water"
pattern = [{'TEXT': 'lemon'}, {'TEXT': 'water'}]

# Add rule. spaCy v3 expects the patterns as a list in the second argument;
# the older v2 call form matcher.add(key, None, pattern) is rejected there.
matcher.add('rule_1', [pattern])

In [None]:
matches = matcher(doc)
matches

[(7604275899133490726, 6, 8)]

In [None]:

# Extract matched text
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

lemon water


#_________________________________________________________________________

#Natural Language Processing With Python's NLTK Package

In [None]:
!python -m pip install nltk==3.5

#Tokenizing

In [None]:
import nltk
nltk.download('punkt')

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
file_name = '/content/oneplus.txt'
# Use a context manager so the file handle is closed once the text is read;
# the encoding is explicit so the result does not depend on the platform
# default (assumes the file is UTF-8 — TODO confirm).
with open(file_name, encoding='utf-8') as review_file:
    doc = review_file.read()

Use sent_tokenize() to split up doc into sentences:

In [None]:
sent_tokenize(doc)

In [None]:
doc_quote = word_tokenize(doc)

In [None]:
doc_quote

#Filtering Stop Words

In [None]:
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
stop_words = set(stopwords.words("english"))

In [None]:
#an empty list to hold the words that make it past the filter:
filtered_list = []

You now have an empty list, filtered_list, to hold all the words in doc_quote that aren’t stop words. Now you can use stop_words to filter doc_quote:

In [None]:
# Keep every token of doc_quote that is not a stop word, preserving order.
for word in doc_quote:
  # casefold() normalises case before the lookup, so a capitalised token such
  # as "The" is compared against stop_words as "the".
  if word.casefold() not in stop_words:
    filtered_list.append(word)

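Alternatively, you could use a list comprehension to make a list of all the words in your text that aren’t stop words, for example `filtered_list = [word for word in doc_quote if word.casefold() not in stop_words]`. Either way, take a look at the result: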

In [None]:
filtered_list

- Content words give you information about the topics covered in the text or the sentiment that the author has about those topics.

- Context words give you information about writing style. You can observe patterns in how authors use context words in order to quantify their writing style. Once you’ve quantified their writing style, you can analyze a text written by an unknown author to see how closely it follows a particular writing style so you can try to identify who the author is.



#Stemming

Stemming is a text processing task in which you reduce words to their root, which is the core part of a word. For example, the words “helping” and “helper” share the root “help.” Stemming allows you to zero in on the basic meaning of a word rather than all the details of how it’s being used. NLTK has more than one stemmer, but you’ll be using the Porter stemmer.

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
stemmer = PorterStemmer()

In [None]:
string_for_stemming = """
Wake up to reality! Nothing ever goes as planned in this world. 
The longer you live, the more you realize that in this reality only 
pain, 
suffering and futility exist. – Madara Uchiha """

Separate all the words in it:

In [None]:
words = word_tokenize(string_for_stemming)

Create a list of the stemmed versions of the words in words by using stemmer.stem() in a list comprehension:

In [None]:
stemmed_words = [stemmer.stem(word) for word in words]

In [None]:
stemmed_words

- Understemming 
 - happens when two related words should be reduced to the same stem but aren’t. This is a false negative.
- Overstemming 
 - happens when two unrelated words are reduced to the same stem even though they shouldn’t be. This is a false positive.

#Tagging Parts of Speech

Part of speech is a grammatical term that deals with the roles words play when you use them together in sentences. Tagging parts of speech, or POS tagging, is the task of labeling the words in your text according to their part of speech.

####NLTK uses the word determiner to refer to articles.

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
string_for_tagging = """
Wake up to reality! Nothing ever goes as planned in this world. 
The longer you live, the more you realize that in this reality only 
pain, 
suffering and futility exist. – Madara Uchiha """

Use word_tokenize to separate the words in that string and store them in a list:

In [None]:
words_in_sagan_quote = word_tokenize(string_for_tagging)

Now call nltk.pos_tag() on your new list of words:



In [None]:
"""
LookupError: 
**********************************************************************
  Resource averaged_perceptron_tagger not found.
  Please use the NLTK Downloader to obtain the resource:

"""
import nltk
nltk.download('averaged_perceptron_tagger')

In [None]:
import nltk
nltk.pos_tag(words_in_sagan_quote)

- Each word in the quote is now in its own tuple, paired with a tag that represents its part of speech.

#How to get a list of tags and their meanings:

In [None]:
"""LookupError: 
**********************************************************************
  Resource tagsets not found.
  Please use the NLTK Downloader to obtain the resource:
"""
import nltk
nltk.download('tagsets')

In [None]:
nltk.help.upenn_tagset()

#What the POS tags mean, you can see that your tagging was fairly successful:

- 'pie' was tagged NN because it’s a singular noun.
- 'you' was tagged PRP because it’s a personal pronoun.
- 'invent' was tagged VB because it’s the base form of a verb.

#Lemmatizing

- Like stemming, lemmatizing reduces words to their core meaning, but it will give you a complete English word that makes sense on its own instead of just a fragment of a word like 'discoveri'.

#Note: A lemma is a word that represents a whole group of words, and that group of words is called a lexeme.

For example, if you were to look up the word “blending” in a dictionary, then you’d need to look at the entry for “blend,” but you would find “blending” listed in that entry.

In this example, “blend” is the lemma, and “blending” is part of the lexeme. So when you lemmatize a word, you are reducing it to its lemma.

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
#Create a lemmatizer to use:

lemmatizer = WordNetLemmatizer()

In [None]:
""" Resource wordnet not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('wordnet')
"""

import nltk
nltk.download('wordnet')
#Let’s start with lemmatizing a plural noun:

lemmatizer.lemmatize("scarves")

In [None]:
string_for_lemmatizing = "When a man learns to love, he must bear the risk of hatred and The friends of DeSoto love scarves."

In [None]:
#Now tokenize that string by word:
words = word_tokenize(string_for_lemmatizing)

In [None]:
words

Create a list containing all the words in words after they’ve been lemmatized:

In [None]:
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]


In [None]:
lemmatized_words

In [None]:
lemmatizer.lemmatize("hatred")

###Treated as an adjective by adding the parameter pos="a". 

In [None]:
lemmatizer.lemmatize("worst", pos="a")

In [None]:
lemmatizer.lemmatize("risk", pos="a")

#Chunking | Chinking !!! 

While tokenizing allows you to identify words and sentences, chunking allows you to identify phrases.

Note: A phrase is a word or group of words that works as a single unit to perform a grammatical function. Noun phrases are built around a noun.

Here are some examples:

- “A planet”
- “A tilting planet”
- “A swiftly tilting planet”

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
#lotr_quote = "If you don’t share someone’s pain, you can never understand them.” “War brings death."
lotr_quote = "It's a dangerous business, Frodo, going out your door." 

In [None]:
#Now tokenize that string by word:


words_in_lotr_quote = word_tokenize(lotr_quote)
words_in_lotr_quote

- tag those words by part of speech:

In [None]:
nltk.download("averaged_perceptron_tagger")
lotr_pos_tags = nltk.pos_tag(words_in_lotr_quote)
lotr_pos_tags

##In order to chunk, you first need to define a chunk grammar.

Note: A chunk grammar is a combination of rules on how sentences should be chunked. It often uses regular expressions, or regexes.

For this tutorial, you don’t need to know how regular expressions work, but they will definitely come in handy for you in the future if you want to process text.

Create a chunk grammar with one regular expression rule:



In [None]:
grammar = "NP: {<DT>?<JJ>*<NN>}"

NP stands for noun phrase. You can learn more about noun phrase chunking in Chapter 7 of Natural Language Processing with Python—Analyzing Text with the Natural Language Toolkit.

According to the rule you created, your chunks:

- Start with an optional (?) determiner (<DT>)
- Can have any number (*) of adjectives (<JJ>)
- End with a noun (<NN>)

Create a chunk parser with this grammar:

In [None]:
chunk_parser = nltk.RegexpParser(grammar)

In [None]:
tree = chunk_parser.parse(lotr_pos_tags)

##A visual representation of this tree: !!!!


In [None]:
"""import os
import matplotlib as mpl
if os.environ.get('DISPLAY','') == '':
    print('no display found. Using non-interactive Agg backend')
    mpl.use('Agg')
import matplotlib.pyplot as plt
"""

In [None]:
### CREATE VIRTUAL DISPLAY ###
!apt-get install -y xvfb # Install X Virtual Frame Buffer
import os
os.system('Xvfb :1 -screen 0 1600x1200x16  &')    # create virtual display with size 1600x1200 and 16 bit color. Color can be changed to 24 or 8
os.environ['DISPLAY']=':1.0'    # tell X clients to use our virtual DISPLAY :1.0.

In [None]:
%matplotlib inline


### INSTALL GHOSTSCRIPT (Required to display NLTK trees) ###


In [None]:
!apt install ghostscript python3-tk


In [None]:
#A visual representation of this tree:

#tree.draw()

#Using Named Entity Recognition (NER)  !!!!

Named entities are noun phrases that refer to specific locations, people, organizations, and so on. With named entity recognition, you can find the named entities in your texts and also determine what kind of named entity they are.

Here’s the list of named entity types from the NLTK book:

You can use nltk.ne_chunk() to recognize named entities. Let’s use lotr_pos_tags again to test it out:

In [None]:
nltk.download("maxent_ne_chunker")
nltk.download("words")
tree = nltk.ne_chunk(lotr_pos_tags)

In [None]:
quote = """The world isn’t perfect. But it’s there for us, doing the best it can….that’s what makes it so damn beautiful.
Fear is not evil. It tells you what your weakness is. And once you know your weakness, you can become stronger as well
To know sorrow is not terrifying. What is terrifying is to know you can’t go back to happiness you could have.
Knowing you’re different is only the beginning. If you accept these differences you’ll be able to get past them"""

###Now create a function to extract named entities:

In [None]:
def extract_ne(quote):
    """Return the set of named-entity strings found in `quote`.

    The text is split into word tokens, tagged by part of speech and passed
    to nltk.ne_chunk() with binary=True, so every entity subtree carries the
    single label "NE". The first element of each leaf in such a subtree is
    joined with single spaces to form one entity string; using a set drops
    repeated entities.
    """
    tokens = word_tokenize(quote, language="english")
    tagged_tokens = nltk.pos_tag(tokens)
    chunked = nltk.ne_chunk(tagged_tokens, binary=True)

    entities = set()
    for node in chunked:
        # Only nodes exposing label() are candidate entity subtrees; anything
        # else at the top level is skipped.
        if not hasattr(node, "label"):
            continue
        if node.label() != "NE":
            continue
        entities.add(" ".join(leaf[0] for leaf in node))
    return entities

With this function, you gather all named entities, with no repeats. In order to do that, you tokenize by word, apply part of speech tags to those words, and then extract named entities based on those tags. Because you included binary=True, the named entities you’ll get won’t be labeled more specifically. You’ll just know that they’re named entities.

In [None]:
extract_ne(quote) # !!!!!

#Getting Text to Analyze

 A group of texts is called a corpus. NLTK provides several corpora covering everything from novels hosted by Project Gutenberg to inaugural speeches by presidents of the United States.

In order to analyze texts in NLTK, you first need to import them. This requires nltk.download("book"), which is a pretty big download:

In [None]:
nltk.download("book")
from nltk.book import *

###Using a Concordance

With a concordance, you can see each time a word is used, along with its immediate context. This can give you a peek into how a word is being used at the sentence level and what words are used with it.

Let’s see what these good people looking for love have to say! The personals corpus is called text8, so we’re going to call .concordance() on it with the parameter "man"

In [None]:
text8.concordance("man")

Interestingly, the last three of those fourteen matches have to do with seeking an honest man, specifically:

SEEKING HONEST MAN
Seeks 35 - 45 , honest man with good SOH & similar interests
genuine , caring , honest and normal man for fship , poss rship

In [None]:
text8.concordance("woman")

Dipping into a corpus with a concordance won’t give you the full picture, but it can still be interesting to take a peek and see if anything stands out.

#Making a Dispersion Plot

In [None]:
text8.dispersion_plot(
    ["woman", "lady", "girl", "gal", "man", "gentleman", "boy", "guy"]
    )

- "lady" was used a lot more than "woman" or "girl". There were no instances of "gal".
- "man" and "guy" were used a similar number of times and were more common than "gentleman" or "boy".

In [None]:
text2.dispersion_plot(["Allenham", "Whitwell", "Cleveland", "Combe"])


- Allenham is the home of Willoughby’s benefactress and comes up a lot when Marianne is first interested in him.
- Cleveland is a home that Marianne stays at after she goes to see Willoughby in London and things go wrong.

- Dispersion plots are just one type of visualization you can make for textual data. The next one you’ll take a look at is frequency distributions

#Making a Frequency Distribution

In [None]:
from nltk import FreqDist

In [None]:
frequency_distribution = FreqDist(text8)
print(frequency_distribution)

In [None]:
frequency_distribution.most_common(20)

 Create a list of all of the words in text8 that aren’t stop words:

In [None]:
 meaningful_words = [
                     word for word in text8 if word.casefold() not in stop_words
                     ]

Now that you have a list of all of the words in your corpus that aren’t stop words, make a frequency distribution:

In [None]:
frequency_distribution = FreqDist(meaningful_words)

Take a look at the 20 most common words:



In [None]:
frequency_distribution.most_common(20)

#Graph:

##frequency_distribution

In [None]:
frequency_distribution.plot(20, cumulative=True)

Some of the most common words are:

- 'lady'
- 'seeks'
- 'ship'
- 'relationship'
- 'fun'
- 'slim'
- 'build'
- 'smoker'
- '50'
- 'non'
- 'movies'
- 'good'
- 'honest'

#Finding Collocations

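A collocation is a sequence of words that shows up often. For example, here are some common English collocations that use the word “tree”:

- Syntax tree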
- Family tree
- Decision tree

To see pairs of words that come up often in your corpus, you need to call .collocations() on it:

In [None]:
text8.collocations()

#Creating a list of the lemmatized versions of all the words in text8:

In [None]:
 lemmatized_words = [lemmatizer.lemmatize(word) for word in text8]

In [None]:
new_text = nltk.Text(lemmatized_words)

Here’s how to see the collocations in your new_text:



In [None]:
new_text.collocations()

Compared to your previous list of collocations, this new one is missing a few:

- weekends away
- poss rship


The idea of quiet nights still shows up in the lemmatized version, quiet night. Your latest search for collocations also brought up a few new ones:

- “year old” suggests that users often mention ages.
- “photo pls” suggests that users often request one or more photos.

That’s how you can find common word combinations to see what people are talking about and how they’re talking about it!