In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **What are stop words?**
the most common words in any language and does not add much information to the text. Examples of a few stop words in English are “the”, “a”, “an”, “so”, “what”

# **Why do we remove stop words?**
Stop words are available in any human language. By removing these words, we remove the low-level information from our text

**Removal of stop words definitely reduces the dataset size and thus reduces the training time due to the fewer number of tokens involved in the training. **

# **EX:**
# Movie review: “The movie was not good at all.”
# Text after removal of stop words: “movie good”

In [3]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
sw = stopwords.words('english')
print(sw)
print(len(sw))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'ea

In [4]:
#split this text into words as stop words is a list of words. 
# changed the words to lowercase as all words in the list of stop words are in lowercase.
text = "When I first met her she was very quiet. She remained quiet during the entire two hours long journey from Stony Brook to New York."
print(text.lower())
print(text.split())

when i first met her she was very quiet. she remained quiet during the entire two hours long journey from stony brook to new york.
['When', 'I', 'first', 'met', 'her', 'she', 'was', 'very', 'quiet.', 'She', 'remained', 'quiet', 'during', 'the', 'entire', 'two', 'hours', 'long', 'journey', 'from', 'Stony', 'Brook', 'to', 'New', 'York.']


In [None]:
# Let us remove stop words from a text.
text = "When I first met her she was very quiet. She remained quiet during the entire two hour long journey from Stony Brook to New York."
words = [word for word in text.split() if word.lower() not in sw]
new_text = " ".join(words)
print(new_text)
print("Old length: ", len(text))
print("New length: ", len(new_text))

first met quiet. remained quiet entire two hour long journey Stony Brook New York.
Old length:  129
New length:  82


 **Can I add my own stop words to the list?**
**yes, we can also add custom stop words to the list of stop words**

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
sw = stopwords.words('english')
print(len(sw))

sw.extend(['first', 'second', 'third', 'me'])
print(len(sw))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
179
183


 **Can I remove stop words from the list?**
**Yes**

In [None]:
sw.remove('not')
print(len(sw))

182


 **create your custom stop words list**

In [5]:
text = "When I first met her she was very quiet. She remained quiet during the entire two hour long journey from Stony Brook to New York."
my_stop_words = ['her','me','i','she','it']
words = [word for word in text.split() if word.lower() not in my_stop_words]
new_text = " ".join(words)
print(new_text)
print("Old length: ", len(text))
print("New length: ", len(new_text))

When first met was very quiet. remained quiet during the entire two hour long journey from Stony Brook to New York.
Old length:  129
New length:  115


# **Stop words using spacy**

In [16]:
import spacy
nlp= spacy.load(("en_core_web_sm"))
stopword=nlp.Defaults.stop_words
print(stopword)
print(len(stopword),'\n########################################################','\n########################################################')

text='When I first met her she was very quiet. She remained quiet during the entire two hour long journey from Stony Brook to New York.'
doc=nlp(text)
for i in doc:
  if i not in stopword:
    print(i)

{'me', 'two', 'whence', 'therein', 'rather', 'are', 'full', 'further', 'toward', 'is', 'became', 'ten', '‘s', 'am', "'ve", 'yourselves', 'sometime', 'most', 'that', 'however', 'until', 'hence', 'towards', 'ours', 'hereby', 'nevertheless', 'side', 'herein', 'much', 'see', 'thus', 'been', 'a', 'former', 'for', 'done', 'but', 'using', 'by', 'everyone', 'without', 'somewhere', 'third', 'your', 'quite', 'be', 'were', 'often', 'else', 'though', 'being', 'beforehand', 'his', 'within', 'whom', 'well', 'get', 'our', 'becoming', 'must', 'how', 'therefore', 'no', 'even', 'six', 'whole', 'should', 'many', 'never', 'back', 'any', 'becomes', '’ve', '’re', 'what', 'ourselves', 'off', 'give', 'very', 'still', 'behind', 'mine', 'if', 'other', 'next', 'since', 'this', 'nowhere', 'besides', 'could', 'was', 'may', 'go', 'indeed', 'themselves', 'nine', 'seemed', 'us', 'three', 'would', 'one', 'keep', 'might', 'did', 'hereupon', 'after', 'then', 'will', 'about', 'mostly', 'the', "'s", 'can', 'empty', 'beyon

**Adding and removing a stop word to the default spaCy**

In [17]:
nlp.Defaults.stop_words.add('btw')
nlp.Defaults.stop_words.remove('hers')