<a href="https://colab.research.google.com/github/MapariPrajwal/NLP/blob/main/NLP_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer, LineTokenizer, SpaceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from bs4 import BeautifulSoup
import requests

In [2]:
# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer
# models and the WordNet corpus.
for resource_name in ('punkt', 'wordnet'):
    nltk.download(resource_name)
# The stop-word corpus is requested last, as a bare expression, so that the
# cell's displayed result is the return value of this final download call.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Function to load text from different sources
def load_text_from_file(file_path, encoding="utf-8"):
    """Return the full text content of a plain-text file.

    Args:
        file_path: Path (string) of the file to read. Paths ending in
            ``.pdf`` or ``.docx`` (in any letter case) are rejected because
            no extractor is implemented for those formats.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The file's content as a single string.

    Raises:
        NotImplementedError: If the path has a ``.pdf`` or ``.docx`` extension.
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid in ``encoding``.
    """
    # Compare against a lowercased copy so that e.g. "REPORT.PDF" is rejected
    # instead of being read as if it were plain text.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        raise NotImplementedError("PDF extraction not implemented.")
    if lowered_path.endswith(".docx"):
        raise NotImplementedError("DOCX extraction not implemented.")

    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

In [4]:
def load_text_from_website(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block the notebook indefinitely.

    Returns:
        The text of every ``<p>`` element found in the page, joined with
        single spaces. An empty string if the page has no ``<p>`` elements.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors rather than feeding an error page's
    # paragraphs into the corpus.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(paragraph.get_text() for paragraph in soup.find_all('p'))

In [13]:
# You can replace these paths/URLs with your own data sources
file_paths = ["/content/NLP.txt", "/content/sample3.txt"]
website_url = "https://filesamples.com/formats/txt"

# Gather each source as its own piece and join them with a newline at the end.
# Plain string concatenation would glue the last word of one source onto the
# first word of the next, producing a bogus merged token during tokenization.
corpus_parts = []
for file_path in file_paths:
    if os.path.exists(file_path):
        corpus_parts.append(load_text_from_file(file_path))
    else:
        # Missing files are reported and skipped so the remaining sources
        # can still be loaded.
        print(f"File {file_path} not found.")

corpus_parts.append(load_text_from_website(website_url))

corpus = "\n".join(corpus_parts)

In [14]:
# Tokenization: split the same corpus four different ways for comparison.
line_tokenizer = LineTokenizer()
space_tokenizer = SpaceTokenizer()
tweet_tokenizer = TweetTokenizer()

line_tokens = line_tokenizer.tokenize(corpus)    # LineTokenizer output
space_tokens = space_tokenizer.tokenize(corpus)  # SpaceTokenizer output
word_tokens = word_tokenize(corpus)              # NLTK word_tokenize output
tweet_tokens = tweet_tokenizer.tokenize(corpus)  # TweetTokenizer output

In [15]:
# Stemming: apply the Porter stemmer to every word token, keeping the
# original token order.
stemmer = PorterStemmer()
stemmed_corpus = list(map(stemmer.stem, word_tokens))

In [16]:
# Unique words: the distinct tokens produced by word_tokenize, compared
# exactly as they appear (no case folding is applied here).
unique_words = {token for token in word_tokens}
num_unique_words = len(unique_words)

In [17]:
# Type-Token Ratio (TTR): number of distinct tokens divided by the total
# number of tokens.
# An empty corpus yields no tokens; report 0.0 in that case instead of
# raising ZeroDivisionError.
ttr = num_unique_words / len(word_tokens) if word_tokens else 0.0

In [18]:
# Lemmatization with WordNet.
# NOTE: every token is lemmatized with the VERB part of speech, regardless of
# the token's actual role in the sentence.
lemmatizer = WordNetLemmatizer()
lemmatized_corpus = [
    lemmatizer.lemmatize(word, pos=wordnet.VERB)
    for word in word_tokens
]

In [11]:
# Stop-word removal: drop every token whose lowercase form is in NLTK's
# English stop-word list; the surviving tokens keep their original casing.
stop_words = set(stopwords.words('english'))
filtered_corpus = list(
    filter(lambda word: word.lower() not in stop_words, word_tokens)
)

In [19]:
# Print results: one "label value" line per computed artefact, in the order
# the artefacts are listed below.
report_rows = [
    ("Line tokens:", line_tokens),
    ("Space tokens:", space_tokens),
    ("Word tokens:", word_tokens),
    ("Tweet tokens:", tweet_tokens),
    ("Stemmed corpus:", stemmed_corpus),
    ("Number of unique words:", num_unique_words),
    ("Type-Token Ratio (TTR):", ttr),
    ("Lemmatized corpus:", lemmatized_corpus),
    ("Filtered corpus (stop-words removed):", filtered_corpus),
]
for label, value in report_rows:
    print(label, value)

Line tokens: ['NLP Task', '1 named entity recognition model', '2. Remove words which have no significant meaning but are repeated unnecessarily', '3. find the most common words in the text excluding regularly used words such as [is,am, are, was were ,', 'he , she it]', '4. Extract all the verbs in a text?', '5. If you are given 2 words, distance and path, how do you know that these words are similar? How can', 'we find this similarity as a number between 0 to 1.', '6 How to replace all the pronouns in a text with their respective object names', '7. how to find keywords (words that have significant meanings in the text)', '8. How to classify a text as positive/negative sentiment', '9. How to use the Word2Vec model for representing words?', '10. How to identify words that always occur together for eg. "smart phone" these 2 words from a single', 'word for humans but they are 2 different words for machines, how to track such words?', 'Use following Paragraphs for completing task :', "Harry