In [1]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import nltk

In [2]:
# Module-level NLP helpers, built once here and used by preprocess_text:
# an English Snowball stemmer and a WordNet lemmatizer.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [3]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own (as a one-element list) with ``pos_tag``.
    Only the first character of the resulting tag, upper-cased, decides the
    WordNet category.

    Args:
        word: The token to tag.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``
        for tags starting with J, N, V or R respectively; ``wordnet.NOUN``
        for any other tag.
    """
    _, tag_name = pos_tag([word])[0]
    initial = tag_name[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both map to the noun category.
    return wordnet.NOUN

In [4]:
def preprocess_text(text):
    """Tokenize ``text`` and normalize every surviving token.

    Steps, in order:
      1. ``simple_preprocess`` tokenizes the text with ``deacc=True``.
      2. Tokens present in ``STOPWORDS`` are dropped.
      3. Each remaining token is stemmed with the module-level ``stemmer``.
      4. The stem is lemmatized with the module-level ``lemmatizer``, using
         the part of speech that ``get_wordnet_pos`` reports for the stem.

    Args:
        text: Raw text to process.

    Returns:
        list: The processed tokens, in the order they appeared in ``text``.
    """
    # NOTE(review): the lemmatizer receives stems, not the original words;
    # confirm that stem-then-lemmatize is the intended order.
    processed = []
    for word in simple_preprocess(text, deacc=True):
        if word in STOPWORDS:
            continue
        stem = stemmer.stem(word)
        processed.append(lemmatizer.lemmatize(stem, get_wordnet_pos(stem)))
    return processed

In [5]:
def process_file(file_path, encoding="utf-8"):
    """Read a text file and return its preprocessed tokens.

    The file is decoded with an explicit encoding instead of the
    platform/locale default, so the same file yields the same tokens on
    every machine.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to
            ``"utf-8"``.

    Returns:
        list: The result of ``preprocess_text`` on the file contents, or an
        empty list if the file does not exist (a message is printed in that
        case).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            text = file.read()
        return preprocess_text(text)
    except FileNotFoundError:
        # Best-effort behaviour: report the missing file and return no tokens.
        print(f"File not found: {file_path}")
        return []

In [10]:
if __name__ == "__main__":
    # Run the pipeline on the sample document. Both names are assigned at
    # module level, so they remain available after this cell runs.
    file_path = "sample.txt"
    preprocessed_tokens = process_file(file_path)

    # Header on the first line, the token list on the next.
    print("Preprocessed Tokens:", preprocessed_tokens, sep="\n")

Preprocessed Tokens:
['gensim', 'popular', 'python', 'librari', 'topic', 'model', 'document', 'similar', 'analysi', 'natur', 'languag', 'process', 'provid', 'robust', 'tool', 'text', 'preprocess', 'like', 'token', 'remov', 'stopword', 'strip', 'punctuat', 'stem', 'util', 'text', 'data', 'readi', 'machin', 'learn', 'nlp', 'task']


In [11]:
# Show the preprocessed tokens as a single space-separated string.
print(" ".join(preprocessed_tokens))

gensim popular python librari topic model document similar analysi natur languag process provid robust tool text preprocess like token remov stopword strip punctuat stem util text data readi machin learn nlp task
