In [1]:
# lemmatization.ipynb

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import pandas as pd

# Download necessary NLTK data.
# Recent NLTK releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead of the older pickled 'punkt' and
# 'averaged_perceptron_tagger' packages. Both generations are requested so
# word_tokenize() and nltk.pos_tag() find their data on old and new versions.
for resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Initialize the lemmatizer (shared by lemmatize_text below)
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate a POS tag into the WordNet POS constant lemmatize() accepts.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    # Checked in order; the first matching prefix wins.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

def lemmatize_text(text):
    """Lemmatize every token of *text* using its part-of-speech tag.

    The text is tokenized with ``word_tokenize``, tagged with
    ``nltk.pos_tag``, and each token is lemmatized with the WordNet POS
    returned by ``get_wordnet_pos``.

    Args:
        text: The string to lemmatize. Non-string values (for example
            ``None`` or the ``NaN`` pandas uses for missing CSV cells) are
            treated as missing text.

    Returns:
        The lemmatized tokens joined by single spaces, or ``''`` when *text*
        is missing, empty, or whitespace-only.
    """
    # A missing cell reaches this function as a float NaN, which
    # word_tokenize() rejects; without this guard a single blank tweet would
    # abort the whole DataFrame.apply(). Blank strings tokenize to nothing,
    # so returning '' for them matches the normal path without the NLTK calls.
    if not isinstance(text, str) or not text.strip():
        return ''

    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    return ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    )

# Locations of the cleaned input and the lemmatized output
cleaned_path = '../data/processed/cleaned_tweets.csv'
lemmatized_path = '../data/processed/lemmatized_tweets.csv'

# Read the cleaned tweets
df = pd.read_csv(cleaned_path)

# Run lemmatize_text over every value of the 'tweet' column and store the
# result in a 'lemmatized_tweet' column
df = df.assign(lemmatized_tweet=df['tweet'].apply(lemmatize_text))

# Write the result without the index column
df.to_csv(lemmatized_path, index=False)

# Show the first rows as the cell output
df.head()


ModuleNotFoundError: No module named 'nltk'