In [None]:
!pip install contractions

In [None]:
import pandas as pd
# Path of the processed training CSV to inspect.
data = '../Desktop/train_processed.csv'
train_df = pd.read_csv(filepath_or_buffer=data)

# Sanity check: show the available columns, then the first five rows of
# the processed-text column.
print(train_df.columns)
print(train_df["Text_Processed"].head(n=5))

In [None]:
import re
import string
from functools import lru_cache

import contractions
import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary resources.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names
# that newer NLTK releases look up for word_tokenize / pos_tag; the older
# 'punkt' and 'averaged_perceptron_tagger' names are kept so the cell works
# on older releases too. An identifier unknown to the installed NLTK version
# is reported by nltk.download and skipped.
for _resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
):
    nltk.download(_resource)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

@lru_cache(maxsize=65536)
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own (as a one-token sequence), so the result
    depends only on the word itself. That makes the lookup safe to memoise:
    repeated words in a corpus are tagged once instead of on every
    occurrence.

    Args:
        word (str): A single token.

    Returns:
        One of wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV.
        Tags that map to none of these fall back to wordnet.NOUN.
    """
    # pos_tag returns [(word, tag)]; only the first letter of the tag is
    # needed to choose the WordNet category.
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocess_text(df, text_column='Text', output_path='train_processed.csv'):
    """
    Clean a text column and store the result in a new 'Text_Processed' column.

    Steps, in the order they are applied:
    - Replacing missing values with an empty string
    - Converting to lowercase
    - Removing HTML anchor tags and bare URLs
    - Expanding contractions
    - Removing digits
    - Removing punctuation
    - Removing stopwords
    - Tokenizing
    - Lemmatizing

    Markup/URL removal and contraction expansion run *before* punctuation is
    stripped: both rely on characters (< > " / : ') that punctuation removal
    deletes, so they cannot match anything afterwards.

    Args:
        df (pd.DataFrame): DataFrame containing text data. It is not modified;
            the function works on a copy.
        text_column (str): Name of the column containing text.
        output_path (str or None): CSV file the processed DataFrame is written
            to. Pass None to skip writing.

    Returns:
        pd.DataFrame: Copy of ``df`` with a new column 'Text_Processed'.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
    """
    df = df.copy()

    # Missing values become '' instead of the literal token "nan" that a
    # plain astype(str) would produce.
    raw = df[text_column]
    text = raw.astype(object).where(raw.notna(), '').astype(str).str.lower()

    # Remove HTML anchor tags (keeping the link text) and bare URLs. A space
    # is substituted so neighbouring words are not glued together; surplus
    # whitespace disappears when the text is split into words below.
    text = text.replace(r'<a\s+href="http[^"]*">|</a>', ' ', regex=True)
    text = text.replace(r'https?://\S+|www\.\S+', ' ', regex=True)

    # Expand contractions while apostrophes are still present; lowercase
    # again in case an expansion reintroduces capital letters.
    text = text.apply(lambda raw_text: contractions.fix(raw_text).lower())

    # Remove digits
    text = text.replace(r'\d+', '', regex=True)

    # Remove punctuation
    text = text.str.translate(str.maketrans('', '', string.punctuation))

    stop_words = set(stopwords.words('english'))

    def remove_stopwords_and_lemmatize(cleaned_text):
        kept = ' '.join(word for word in cleaned_text.split() if word not in stop_words)
        tokens = word_tokenize(kept)
        return ' '.join(lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens)

    df['Text_Processed'] = text.apply(remove_stopwords_and_lemmatize)

    # Save to new CSV file (skipped when output_path is None)
    if output_path is not None:
        df.to_csv(output_path, index=False)
    return df

# Run the preprocessing on the loaded training data. This uses the default
# text column 'Text' and, as a side effect, writes 'train_processed.csv' to
# the current working directory.
train_df = preprocess_text(train_df)