In [1]:
# IMPORTS AND INITIALISATION
import spacy
import pandas as pd

nlp = spacy.load('en')

In [2]:
# TOKENISATION
doc = nlp('Hey there. I am Ishaan Ohri. I study in Vellore Institute of Technology.')

df = pd.DataFrame()
for i, token in enumerate(doc):
  df.loc[i, 'word'] = token.text

df

Unnamed: 0,word
0,Hey
1,there
2,.
3,I
4,am
5,Ishaan
6,Ohri
7,.
8,I
9,study


In [3]:
# POS TAGS
doc = nlp('The quick brown fox jumps over the lazy dog')

df = pd.DataFrame()
for i, token in enumerate(doc):
  df.loc[i, 'word'] = token.text
  df.loc[i, 'POS'] = token.pos_

df

Unnamed: 0,word,POS
0,The,DET
1,quick,ADJ
2,brown,ADJ
3,fox,NOUN
4,jumps,NOUN
5,over,ADP
6,the,DET
7,lazy,ADJ
8,dog,NOUN


In [4]:
# LEMMATISATION
doc = nlp('I am running in the park. I am practicing for my competition.')

df = pd.DataFrame()
for i, token in enumerate(doc):
  if not token.is_stop:
    df.loc[i, 'word'] = token.text
    df.loc[i, 'Lemma'] = token.lemma_

df

Unnamed: 0,word,Lemma
2,running,run
5,park,park
6,.,.
9,practicing,practice
12,competition,competition
13,.,.


In [5]:
# ENTITY RECOGNITION
doc = nlp("India is a democratic country with a population of 1.3 billion.")

df = pd.DataFrame()
for i, token in enumerate(doc.ents):
  df.loc[i, 'word'] = token.text
  df.loc[i, 'start char'] = token.start_char
  df.loc[i, 'end char'] = token.end_char
  df.loc[i, 'label'] = token.label_

df

Unnamed: 0,word,start char,end char,label
0,India,0.0,5.0,GPE
1,1.3 billion,51.0,62.0,CARDINAL


In [6]:
# STOP WORD REMOVAL
doc = nlp("India is a democratic country with a population of 1.3 billion.")

df = pd.DataFrame()
for i, token in enumerate(doc):
  if not token.is_stop:
    df.loc[i, 'word'] = token.text

df

Unnamed: 0,word
0,India
3,democratic
4,country
7,population
9,1.3
10,billion
11,.
