In [None]:
# The process of converting data to something a computer can understand is referred to as pre-processing. 
# One of the major forms of pre-processing is to filter out useless data. In natural language processing, 
# useless words (data), are referred to as stop words

# Stop Words: A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed
# to ignore, both when indexing entries for searching and when retrieving them as the result of a search query. 

![image.png](attachment:image.png)

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aicyb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
# Removing stop words with NLTK

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
example_sent = """This is a sample sentence,
                  showing off the stop words filtration."""
 
stop_words = set(stopwords.words('english'))
 
word_tokens = word_tokenize(example_sent)
 
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
 
filtered_sentence = []
 
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
 
print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [9]:
# Performing the Stopwords operations in a file

import io
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
# word_tokenize accepts
# a string as an input, not a file.
stop_words = set(stopwords.words('english'))
file1 = open("d:/data/text.txt")
 
# Use this to read file content as a stream:
line = file1.read()
words = line.split()
for r in words:
    if not r in stop_words:
        appendFile = open('d:/data/text.txt','a')
        appendFile.write(" "+r)
        appendFile.close()

In [11]:
line

'A file with .TXT extension represents a text document that contains plain text in the form of lines. Paragraphs in a text document are recognized by carriage returns and are used for better arrangement of file contents. A standard text document can be opened in any text editor or word processing application on different operating systems. All the text contained in such a file is in human-readable format and represented by sequence of characters.\n\nText files can store large amount of data as there is no limitation on the size of contents. However, text editors opening such large files need to be smart for loading and displaying these. Almost all operating systems come with text editors that allow you to create and edit text files. For example, Windows OS comes with Notepad and Wordpad for this purpose. Similarly, MacOS comes with TextEdit for creating and editing Text Documents. There are, however, other free text editors available as well over the internet that provide you the capab

In [12]:
words

['A',
 'file',
 'with',
 '.TXT',
 'extension',
 'represents',
 'a',
 'text',
 'document',
 'that',
 'contains',
 'plain',
 'text',
 'in',
 'the',
 'form',
 'of',
 'lines.',
 'Paragraphs',
 'in',
 'a',
 'text',
 'document',
 'are',
 'recognized',
 'by',
 'carriage',
 'returns',
 'and',
 'are',
 'used',
 'for',
 'better',
 'arrangement',
 'of',
 'file',
 'contents.',
 'A',
 'standard',
 'text',
 'document',
 'can',
 'be',
 'opened',
 'in',
 'any',
 'text',
 'editor',
 'or',
 'word',
 'processing',
 'application',
 'on',
 'different',
 'operating',
 'systems.',
 'All',
 'the',
 'text',
 'contained',
 'in',
 'such',
 'a',
 'file',
 'is',
 'in',
 'human-readable',
 'format',
 'and',
 'represented',
 'by',
 'sequence',
 'of',
 'characters.',
 'Text',
 'files',
 'can',
 'store',
 'large',
 'amount',
 'of',
 'data',
 'as',
 'there',
 'is',
 'no',
 'limitation',
 'on',
 'the',
 'size',
 'of',
 'contents.',
 'However,',
 'text',
 'editors',
 'opening',
 'such',
 'large',
 'files',
 'need',
 'to'