# Session 3 - working with `spaCy`

In [1]:
import spacy

In [3]:
# loading the spacy model
# define pipeline
nlp = spacy.load("en_core_web_md")

type(nlp) # spacy.lang.en.English - the nlp variable is a spacy model

spacy.lang.en.English

In [21]:
# sample sentence
text = "Hello, I hope you're doing great! I'm Ida"

In [22]:
# create spaCy doc
doc = nlp(text)

In [8]:
type(doc)

spacy.tokens.doc.Doc

In [9]:
print(doc)

Hello, I hope you're doing great!


In [23]:
# go through each token one at a time - using for loop 
for token in doc:
    print(token.text)


# the package is opinnionated - so it has it's own way of tokenizing without we define it 

Hello
,
I
hope
you
're
doing
great
!
I
'm
Ida


In [24]:
# by running it through the spacy pipeline the individual tokens gets a series of attributes attached to each token 
# see some attributes 
for token in doc: 
    #get index, text, label, dependency relations between words, nuance, 
    print(token.i, token.text, token.pos_, token.dep_, token.morph)

0 Hello INTJ intj 
1 , PUNCT punct PunctType=Comm
2 I PRON nsubj Case=Nom|Number=Sing|Person=1|PronType=Prs
3 hope VERB ROOT Tense=Pres|VerbForm=Fin
4 you PRON nsubj Case=Nom|Person=2|PronType=Prs
5 're AUX aux Mood=Ind|Tense=Pres|VerbForm=Fin
6 doing VERB ccomp Aspect=Prog|Tense=Pres|VerbForm=Part
7 great ADV dobj 
8 ! PUNCT punct PunctType=Peri
9 I PRON nsubj Case=Nom|Number=Sing|Person=1|PronType=Prs
10 'm AUX ROOT Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
11 Ida PROPN npadvmod Number=Sing


In [25]:
# NER - named entity recognition

for entity in doc.ents:
    print(entity.text, entity.label_)

Ida PERSON


# Working with ```pandas```

In [26]:

import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [50]:
# make empty list and append the attributes 
annotations = []
for token in doc:
    annotations.append([token.text, token.pos_, token.dep_])


In [51]:
# create a dataframe from the list of lists using pandas -  with column names 
df = pd.DataFrame(annotations, 
                    columns = ["text", "pos", "dep"]) 

In [49]:
df

Unnamed: 0,text,pos,dep
0,Hello,INTJ,intj
1,",",PUNCT,punct
2,I,PRON,nsubj
3,hope,VERB,ROOT
4,you,PRON,nsubj
5,'re,AUX,aux
6,doing,VERB,ccomp
7,great,ADV,dobj
8,!,PUNCT,punct
9,I,PRON,nsubj


In [52]:
# slice data into individual columns and rows
# series = a single column
df["pos"].value_counts() # count how many times each value appears in this column

pos
PRON     3
PUNCT    2
VERB     2
AUX      2
INTJ     1
ADV      1
PROPN    1
Name: count, dtype: int64

In [53]:
# save the output
df.to_csv("annotations.csv")

In [54]:
# read it 
input_df = pd.read_csv("annotations.csv")
input_df

Unnamed: 0.1,Unnamed: 0,text,pos,dep
0,0,Hello,INTJ,intj
1,1,",",PUNCT,punct
2,2,I,PRON,nsubj
3,3,hope,VERB,ROOT
4,4,you,PRON,nsubj
5,5,'re,AUX,aux
6,6,doing,VERB,ccomp
7,7,great,ADV,dobj
8,8,!,PUNCT,punct
9,9,I,PRON,nsubj


# Assignment