In [1]:
!pip install nltk spacy pandas matplotlib textstat
!python -m spacy download en_core_web_sm
!python -m nltk.downloader punkt stopwords wordnet



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 121.2 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

In [6]:
import re
import nltk
import pandas as pd
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer, WordNetLemmatizer

import spacy
from spacy import displacy

import textstat  # instead of readability

# Fetch the NLTK data packages needed by this cell. `nltk` is already imported
# with the other imports at the top of the cell, so it is not re-imported here.
# Each call reports its status in the cell output.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

# First pass: lowercase only, then tokenize. Punctuation is NOT stripped here,
# so the commas come back as separate tokens. `word_tokenize` is imported with
# the other imports at the top of the cell.
text = "Text, text, text"
clean = text.lower()

tokens = word_tokenize(clean)
print(tokens)


# Shared NLTK helpers for the demos further down in this cell.
wn = WordNetLemmatizer()
porter = PorterStemmer()
snowball = SnowballStemmer("english")
# Stored as a set so the stopword filter can use fast membership checks.
stops = set(stopwords.words("english"))

# basic text cleaning: replace every character that is not an ASCII letter
# or whitespace with a space, then lowercase the result
text = "text, text, text"
clean = re.sub(r"[^a-zA-Z\s]", " ", text).lower()

# tokenize and drop anything found in `stops` in a single pass
tokens = [tok for tok in word_tokenize(clean) if tok not in stops]
print("Tokens without stopwords:", tokens)

# spaCy example: load the small English pipeline and parse one sentence
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is buying a UK startup for $1 billion.")

# named entities as (label, text) pairs
entities = [(ent.label_, ent.text) for ent in doc.ents]
print("Entities:", entities)

# one line per token: text, part-of-speech tag, dependency label
for token in doc:
    print(" ".join((token.text, token.pos_, token.dep_)))

# draw the dependency parse inline in the notebook
displacy.render(doc, style="dep", jupyter=True)

# stemming/lemmatization
# Each label is built from the same variable that is passed to the stemmer or
# lemmatizer, so the word shown in the output cannot drift from the real input.
word = "has"
plural_word = "languages"
print(f"Snowball stem of '{word}':", snowball.stem(word))
print(f"Porter stem of '{plural_word}':", porter.stem(plural_word))
# pos="v" tells the WordNet lemmatizer to treat the word as a verb.
print(f"WordNet lemma of '{word}':", wn.lemmatize(word, pos="v"))


# Readability demo: score a short two-sentence sample with textstat.
# NOTE(review): despite its name, `fk_score` holds the value returned by
# flesch_reading_ease; the name is kept in case other cells read it.
sample_text = (
    "This is a simple example sentence. "
    "It should be easy to read."
)
fk_score = textstat.flesch_reading_ease(sample_text)
print("Flesch Reading Ease score:", fk_score)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['text', ',', 'text', ',', 'text']
Tokens without stopwords: ['text', 'text', 'text']
Entities: [('ORG', 'Apple'), ('GPE', 'UK'), ('MONEY', '$1 billion')]
Apple PROPN nsubj
is AUX aux
buying VERB ROOT
a DET det
UK PROPN dobj
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj
. PUNCT punct


Snowball stem of 'has': has
Porter stem of 'languages': languag
WordNet lemma of 'has': have
Flesch Reading Ease score: 80.89500000000001
