# Install

In [None]:
!pip install -U spacy
!python -m spacy download en

In [None]:
!pip install nltk

In [None]:
!pip install prettytable

## Tokenization

In [4]:
import spacy

# Load the small English pipeline and run it over a single example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Data Science Dojo is the leading platforms providing data science training.")

# Iterating a Doc yields its tokens; show each token's text on its own line.
print(*(token.text for token in doc), sep="\n")

Data
Science
Dojo
is
the
leading
platforms
providing
data
science
training
.


## POS Tagging

In [5]:
import spacy
from prettytable import PrettyTable

nlp = spacy.load("en_core_web_sm")
doc = nlp("Data Science Dojo is the leading platforms providing data science training.")

# Build a three-column table with one row per token: its text, its
# part-of-speech label (token.pos_) and its tag (token.tag_).
table = PrettyTable()
table.field_names = ['Token', 'Part of speech', 'Tag']
for row in ([token.text, token.pos_, token.tag_] for token in doc):
    table.add_row(row)

print(table)

+-----------+----------------+-----+
|   Token   | Part of speech | Tag |
+-----------+----------------+-----+
|    Data   |     PROPN      | NNP |
|  Science  |     PROPN      | NNP |
|    Dojo   |     PROPN      | NNP |
|     is    |      AUX       | VBZ |
|    the    |      DET       |  DT |
|  leading  |      VERB      | VBG |
| platforms |      NOUN      | NNS |
| providing |      VERB      | VBG |
|    data   |      NOUN      | NNS |
|  science  |      NOUN      |  NN |
|  training |      NOUN      |  NN |
|     .     |     PUNCT      |  .  |
+-----------+----------------+-----+


## Dependency Parsing

In [6]:
import spacy
# Import displacy explicitly rather than relying on it being reachable as an
# attribute of the top-level spacy package.
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Data Science Dojo is the leading platforms providing data science training.")

# Render the dependency parse of the sentence as an arc diagram.
displacy.render(doc, style="dep")

## Save the dependency plot as an SVG file on the local system

In [None]:
from pathlib import Path

from spacy import displacy

# Reuses `doc` from the dependency-parsing cell above.
# jupyter=False makes render() return the markup as a string instead of
# displaying it inline, so it can be written to disk.
dependency_plot = displacy.render(doc, style="dep", jupyter=False)

# A bare file name is resolved against the current working directory;
# give a longer path here to save the plot somewhere else.
output_path = Path("dependency_plot.svg")

# write_text() opens, writes and closes the file in one call, so no file
# handle is left open. The trailing semicolon keeps the notebook from echoing
# the number of characters written.
output_path.write_text(dependency_plot, encoding="utf-8");

## Lemmatization

In [8]:
import spacy

# Load the pipeline with the 'parser' and 'ner' components disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
doc = nlp("Data Science Dojo is the leading platforms providing data science training.")

# Collect the lemma of every token, in document order.
lemmatized = []
for token in doc:
    lemmatized.append(token.lemma_)

lemma_sentence = " ".join(lemmatized)

print("Original: \n", doc)
print("\nAfter Lemmatization: \n", lemma_sentence)

Original: 
 Data Science Dojo is the leading platforms providing data science training.

After Lemmatization: 
 Data Science Dojo be the lead platform provide datum science training .


In [9]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Stemming with NLTK's Porter stemmer, for comparison with the spaCy
# lemmatization above.

# word_tokenize() needs the Punkt tokenizer data. Recent NLTK releases look it
# up under the name "punkt_tab", older ones under "punkt", so request both.
# NOTE(review): on an NLTK version whose index has no "punkt_tab" the download
# is expected to report an error and return False rather than raise - confirm.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

ps = PorterStemmer()

sentence = "Data Science Dojo is the leading platforms providing data science training."
words = word_tokenize(sentence)
# Reduce every token to its Porter stem.
stemmed = [ps.stem(token) for token in words]

print("Original: \n", " ".join(words))
print("\nAfter Stemming: \n", " ".join(stemmed))

Original: 
 Data Science Dojo is the leading platforms providing data science training .

After Stemming: 
 data scienc dojo is the lead platform provid data scienc train .


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alamf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Stop Word Removal

In [10]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Data Science Dojo is the leading platforms providing data science training.")

token_list = [token.text for token in doc]
# Read the stop-word flag straight from each token instead of looking the
# token's text up in nlp.vocab again and comparing the flag with `== False`.
filtered_sentence = [token.text for token in doc if not token.is_stop]

print("Tokens:\n",token_list)
print("\nAfter stop word removal:\n", filtered_sentence)

Tokens:
 ['Data', 'Science', 'Dojo', 'is', 'the', 'leading', 'platforms', 'providing', 'data', 'science', 'training', '.']

After stop word removal:
 ['Data', 'Science', 'Dojo', 'leading', 'platforms', 'providing', 'data', 'science', 'training', '.']


## Named Entity Recognition

In [11]:
import spacy
from prettytable import PrettyTable
# Import displacy explicitly rather than relying on it being reachable as an
# attribute of the top-level spacy package.
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun in an interview with Recode earlier this week. A little less than a decade later, dozens of self-driving startups have cropped up while automakers around the world clamour, wallet in hand, to secure their place in the fast-moving world of fully automated transportation.")

# One row per entity found in the text: the entity text, its start and end
# character offsets (ent.start_char / ent.end_char) and its label.
table = PrettyTable(["Entity", "Start Position", "End Position", "Label"])

for ent in doc.ents:
    table.add_row([ent.text, ent.start_char, ent.end_char, ent.label_])
print(table)

# Show the same entities highlighted in the running text.
displacy.render(doc, style="ent")

+-------------------+----------------+--------------+----------+
|       Entity      | Start Position | End Position |  Label   |
+-------------------+----------------+--------------+----------+
|  Sebastian Thrun  |       5        |      20      |  PERSON  |
|        2007       |       71       |      75      |   DATE   |
|      American     |      173       |     181      |   NORP   |
|       Thrun       |      271       |     276      |   GPE    |
|       Recode      |      298       |     304      |   ORG    |
| earlier this week |      305       |     322      |   DATE   |
|   a decade later  |      343       |     357      |   DATE   |
|       dozens      |      359       |     365      | CARDINAL |
+-------------------+----------------+--------------+----------+


In [None]:
from pathlib import Path
from spacy import displacy

# Reuses `doc` from the named-entity cell above.
# The "ent" style renders HTML markup rather than SVG, so the result is saved
# with an .html extension. page=True wraps it in a full HTML page so the file
# can be opened on its own, and jupyter=False returns the markup as a string
# instead of displaying it inline.
ent_markup = displacy.render(doc, style="ent", jupyter=False, page=True)

output_path = Path("ent_plot.html")
# write_text() opens, writes and closes the file in one call. The trailing
# semicolon keeps the notebook from echoing the number of characters written.
output_path.write_text(ent_markup, encoding="utf-8");