In [None]:
import spacy

In [None]:
# Load spaCy's small English pipeline and display the names of its components.
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
# One output line per detected entity: text | label | spaCy's explanation of the label.
for ent in doc.ents:
    fields = (ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))
    print(*fields)

Tesla Inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [None]:
from spacy import displacy

# Force "Jupyter mode" so the entity visualisation is rendered in the notebook.
# With auto-detection the call returned the raw HTML markup as a string (see the
# recorded output of this cell) instead of displaying it.
displacy.render(doc, style="ent", jupyter=True)

'<div class="entities" style="line-height: 2.5; direction: ltr">\n<mark class="entity" style="background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Tesla Inc\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">ORG</span>\n</mark>\n is going to acquire twitter for \n<mark class="entity" style="background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    $45 billion\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">MONEY</span>\n</mark>\n</div>'

In [None]:
# Show every entity label the pipeline's "ner" component can assign.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [None]:
doc = nlp("Michael Bloomberg founded Bloomberg in 1982")

# Build one row per entity (text | label | label explanation), then print the rows.
rows = [
    (ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))
    for ent in doc.ents
]
for row in rows:
    print(*row)

Michael Bloomberg | PERSON | People, including fictional
Bloomberg | PERSON | People, including fictional
1982 | DATE | Absolute or relative dates or periods


In [None]:
doc = nlp("Tesla is going to acquire Twitter for $45 billion")
# Print each entity's text and label, separated by a padded " | " column marker.
for ent in doc.ents:
    print(" ".join((ent.text, " | ", ent.label_)))

Tesla  |  ORG
Twitter  |  PRODUCT
$45 billion  |  MONEY


## Stop words

In [None]:
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

# Number of entries in spaCy's English stop-word set.
len(STOP_WORDS)

326

In [None]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("We just opened our wings, the flying part is coming soon")

# Print only the tokens that spaCy flags as stop words, one per line.
for token in (candidate for candidate in doc if candidate.is_stop):
    print(token)

We
just
our
the
part
is


In [None]:
def preprocess(text):
    """Return the texts of all tokens in ``text`` that are not stop words.

    ``text`` is run through the notebook-level ``nlp`` pipeline. Only the
    ``is_stop`` flag is checked, so punctuation tokens remain in the result.

    Returns:
        list of str: the surviving token texts, in document order.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept.append(token.text)
    return kept

In [None]:
# Try the stop-word filter on a sample sentence; the result is a list of token texts.
preprocess("We just opened our wings, the flying part is coming soon")

['opened', 'wings', ',', 'flying', 'coming', 'soon']

In [None]:
def preprocess(text):
    """Strip stop words and punctuation from ``text``.

    ``text`` is run through the notebook-level ``nlp`` pipeline; tokens flagged
    ``is_stop`` or ``is_punct`` are dropped.

    Returns:
        str: the remaining token texts joined by single spaces.
    """
    kept = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        kept.append(token.text)
    return " ".join(kept)

In [None]:
# Try the stop-word and punctuation filter on a sample sentence; the result is one string.
preprocess("Musk wants time to prepare for a trial over this")

'Musk wants time prepare trial'

In [None]:
#install kaggle
!pip install kaggle




In [None]:
from google.colab import files

# files.upload() returns a {filename: file bytes} dict. Bind it to a name so it is not
# echoed as the cell result: for kaggle.json the bytes contain the API username and
# key, which would otherwise be saved into the notebook output.
uploaded = files.upload()
# Show only the names of the uploaded files.
list(uploaded)

Saving kaggle (1).json to kaggle (1).json


{'kaggle (1).json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Copy the API token into ~/.kaggle and restrict it to owner read/write (chmod 600).
# NOTE(review): the recorded upload output shows the file was saved as
# "kaggle (1).json", while this copies "kaggle.json" — confirm the intended file is used.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# List datasets through the Kaggle CLI.
!kaggle datasets list


ref                                                           title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
iamsouravbanerjee/customer-shopping-trends-dataset            Customer Shopping Trends Dataset                   146KB  2023-10-05 06:45:37          16095        345  1.0              
nelgiriyewithana/billionaires-statistics-dataset              Billionaires Statistics Dataset (2023)             139KB  2023-09-29 13:39:28           8660        216  1.0              
jocelyndumlao/consumer-review-of-clothing-product             Consumer Review of Clothing Product                  4MB  2023-10-19 04:45:23           1027         28  1.0              
victorahaji/worlds-air-quality-and-water-pollution-dataset    World's Air Q

In [None]:
# Download the Department of Justice press-releases dataset as a zip archive.
!kaggle datasets download -d jbencina/department-of-justice-20092018-press-releases

Downloading department-of-justice-20092018-press-releases.zip to /content
 87% 12.0M/13.8M [00:01<00:00, 16.1MB/s]
100% 13.8M/13.8M [00:01<00:00, 11.2MB/s]


In [None]:
#unzip the dataset

!unzip departmen_of_justice.zip -d example-dataset

Archive:  departmen_of_justice.zip
  inflating: example-dataset/combined.json  


In [None]:
import pandas as pd

# Read the extracted file; lines=True parses it as one JSON object per line.
df = pd.read_json("/content/example-dataset/combined.json", lines=True)
df.shape

(13087, 6)

In [None]:
# Preview the first five rows of the loaded data.
df.head(5)

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [None]:
# Keep only the rows whose `topics` entry has non-zero length.
df = df.loc[df['topics'].str.len().ne(0)]
df.head(5)

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [None]:
# Row/column count after dropping rows without topics.
df.shape

(4688, 6)

In [None]:
# Continue with the first 100 rows only (presumably to keep the preprocessing below fast).
df = df.head(100)
df.shape

(100, 6)

In [None]:
# Character length of the fifth row's raw text, for comparison after preprocessing.
len(df['contents'].iloc[4])

5504

In [None]:
# Add the cleaned text as a new column. DataFrame.assign returns a new frame instead of
# writing into `df`, which at this point is a filtered and truncated selection of the
# loaded data; direct column assignment on such a selection can raise
# SettingWithCopyWarning.
df = df.assign(contents_new=df['contents'].apply(preprocess))
df.head(5)

Unnamed: 0,id,title,contents,date,topics,components,contents_new
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division],U.S. Department Justice U.S. Environmental Pro...
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division],131 count criminal indictment unsealed today B...
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U...",United States Attorney Office Middle District ...
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division],21st Century Oncology LLC agreed pay $ 19.75 m...
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]",21st Century Oncology Inc. certain subsidiarie...


In [None]:
# Character length of the same row's text after preprocessing.
len(df['contents_new'].iloc[4])

4217

In [None]:
# Character length of row 7 before and after preprocessing, one value per line.
for column in ('contents', 'contents_new'):
    print(len(df[column].iloc[7]))

5081
3939


In [None]:
# Inspect the preprocessed text of row 7.
df["contents_new"].iloc[7]

'Federal agents arrested 17 members associates Trey Gangster Bloods criminal street gang charges RICO conspiracy related charges stemming indictment returned federal grand jury Northern District Georgia Oct. 12 unsealed yesterday \xa0  total 30 gang members associates indicted Acting Assistant Attorney General Kenneth A. Blanco Justice Department Criminal Division U.S. Attorney Byung J. BJay Pak Northern District Georgia Special Agent Charge David J. LeValley FBI Atlanta Field Office announcement indictment continues Department efforts bring justice leaders violent members dangerous criminal enterprises like Trey Gangsters said Acting Assistant Attorney General Kenneth A. Blanco \xa0  charging responsible violence drug dealing perpetrated members associates violent street gangs like Trey Gangsters making neighborhoods communities safer alleged crimes relate drug distribution Atlanta area acts violence perpetrated largely gang members said U.S. Attorney Pak \xa0  Shockingly Trey Gangste

Exercise1:

From a given text, count the number of stop words in it.
Print the percentage of stop-word tokens relative to all tokens in the text.

In [None]:
# Sample paragraph used as the input for Exercise 1.
text = '''
Thor: Love and Thunder is a 2022 American superhero film based on Marvel Comics featuring the character Thor, produced by Marvel Studios and
distributed by Walt Disney Studios Motion Pictures. It is the sequel to Thor: Ragnarok (2017) and the 29th film in the Marvel Cinematic Universe (MCU).
The film is directed by Taika Waititi, who co-wrote the script with Jennifer Kaytin Robinson, and stars Chris Hemsworth as Thor alongside Christian Bale, Tessa Thompson,
Jaimie Alexander, Waititi, Russell Crowe, and Natalie Portman. In the film, Thor attempts to find inner peace, but must return to action and recruit Valkyrie (Thompson),
Korg (Waititi), and Jane Foster (Portman)—who is now the Mighty Thor—to stop Gorr the God Butcher (Bale) from eliminating all gods.
'''

In [None]:
# Length of the sample paragraph in characters (not tokens).
len(text)

767

In [None]:
def preprocessing(text):
    """Return ``text`` with stop words and punctuation removed.

    ``text`` is run through the notebook-level ``nlp`` pipeline; every token that
    is neither a stop word nor punctuation is kept.

    Returns:
        str: the kept token texts joined by single spaces.
    """
    content_tokens = (
        token.text
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    )
    return " ".join(content_tokens)

In [None]:
# Character length of the paragraph after stop words and punctuation are removed.
len(preprocessing(text))

602

In [None]:
# Number of stop words in the text: count the tokens that spaCy flags as stop words.
# (Subtracting string lengths would give the number of characters removed, not a
# token count.)
Count_words = sum(1 for token in nlp(text) if token.is_stop)
print("Number of stop words:", Count_words)

Number of stop words: 165


In [None]:
# Percentage of stop-word tokens relative to all tokens in the text.
# Both counts are taken from the spaCy token sequence, not from string lengths.

exercise_doc = nlp(text)
Stp_words_perc = sum(1 for token in exercise_doc if token.is_stop) * 100 / len(exercise_doc)
print("Percentage of stop words:", Stp_words_perc)

Number of count words: 21.51238591916558


Exercise2:

spaCy's default implementation treats "not" as a stop word. In some scenarios, however, removing "not" completely changes the meaning of the statement/text. For example, consider these two statements:

- this is a good movie       ----> Positive Statement
- this is not a good movie   ----> Negative Statement

So, after removing stop words from these two texts, both become "good movie", which no longer reflects the polarity/sentiment of the original text.

Now, your task is to stop "not" from being treated as a stop word in spaCy, so that the two texts can still be told apart.

In [None]:
# Two sentences that differ only by the negation "not".
text1 = "this is not a good movie"
text2 = "this is a good movie"

In [None]:
def preprocess(text):
    """Remove stop words from ``text`` but keep the word "not".

    ``text`` is run through the notebook-level ``nlp`` pipeline. A token is kept
    when it is not a stop word, or when its lower-cased text is "not".
    Punctuation is not filtered by this version.

    Returns:
        str: the kept token texts joined by single spaces.
    """
    kept = []
    for token in nlp(text):
        # "not" carries the negation, so it is kept although it is flagged as a stop word.
        if not token.is_stop or token.text.lower() == 'not':
            kept.append(token.text)
    return " ".join(kept)

In [None]:
# The two results should now differ: only the first keeps "not".
print(preprocess(text1))
print(preprocess(text2))

not good movie
good movie
