In [None]:
!pip install contractions

In [127]:
import pandas as pd
data = '../Downloads/train.csv'
train_df = pd.read_csv(data)
print(train_df.columns)
print(train_df["Text"].head())

Index(['Score', 'Text'], dtype='object')
0    I received this product early from the seller!...
1    *****<br />Numi's Collection Assortment Melang...
2    I was very careful not to overcook this pasta,...
3    Buying this multi-pack I was misled by the pic...
4    These bars are so good! I loved them warmed up...
Name: Text, dtype: object


In [69]:
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import contractions

# Download necessary resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Converts POS tags to a format recognized by WordNet lemmatizer."""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocess_text(df, text_column='Text'):
    """
    Preprocesses text data by:
    - Converting to lowercase
    - Removing punctuation
    - Removing HTML and URLs
    - Removing digits
    - Expanding contractions
    - Removing stopwords
    - Tokenizing
    - Lemmatizing
    
    Args:
        df (pd.DataFrame): DataFrame containing text data.
        text_column (str): Name of the column containing text.
    
    Returns:
        pd.DataFrame: Processed DataFrame with a new column 'Text_Processed'.
    """
    df = df.copy()
    
    df['Text_Processed'] = df[text_column].replace(r'<a\s+href="http[^"]*">|</a>', '', regex=True)
    
    df['Text_Processed'] = df['Text_Processed'].astype(str).str.lower()
    
    df['Text_Processed'] = df['Text_Processed'].str.translate(str.maketrans('', '', string.punctuation))
    
    # Remove digits
    df['Text_Processed'] = df['Text_Processed'].replace(r'\d+', '', regex=True)
    
    # Expand contractions and remove stopwords
    stop_words = set(stopwords.words('english'))
    def expand_and_remove_stopwords(text):
        expanded_text = contractions.fix(text)
        words = expanded_text.split()
        return ' '.join([word for word in words if word.lower() not in stop_words])
    
    df['Text_Processed'] = df['Text_Processed'].apply(expand_and_remove_stopwords)
    
    # Tokenization and Lemmatization
    def tokenize_and_lemmatize(text):
        tokens = word_tokenize(text)
        return ' '.join([lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens])
    
    df['Text_Processed'] = df['Text_Processed'].apply(tokenize_and_lemmatize)
    
    # Save to new CSV file
    df.to_csv('train_processed.csv', index=False)
    return df

train_df = preprocess_text(train_df)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/geraint/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/geraint/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/geraint/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/geraint/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /Users/geraint/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [133]:
data = '../Downloads/test.csv'
test_df = pd.read_csv(data)
print(test_df.columns)
print(test_df["Text"].head())

Index(['Id', 'Text'], dtype='object')
0    Fog chaser was the best both in flavor and bod...
1    We found this at PF Changs, and it tastes just...
2    Not quite a chocolate bar substitute but delic...
3    This is not as tasty as Pamela's Almond Biscot...
4    <a href="http://www.amazon.com/gp/product/B000...
Name: Text, dtype: object


In [134]:
import html

problematic_text = test_df.loc[98060, "Text"]

try:
    decoded_text = html.unescape(problematic_text)
    print(f"\nDecoded Text: {decoded_text}")
    train_df.loc[98060, "Text"] = decoded_text 
except Exception as e:
    print(f"\nDecoding failed: {e}")
    decoded_text = problematic_text  

print(train_df.iloc[98055:98065])


Decoded Text: Решила заказать эти желатинки, т.к. дочка пошла в школу и требует сладости. Очень удобная упаковка. Как раз столько конфет, как и требуется, чтобы сьесть их за перемену.
       Score                                               Text
98055      5  I had great feeling hair after using this prod...
98056      5  I've recently brought the Skinnygirl Pina Cola...
98057      2  I love the product.  But, the bottle design is...
98058      5  It's so nice to have a vegetarian Jello.  I ad...
98059      3  I get this olive oil through Subscribe and Sav...
98060      3  Решила заказать эти желатинки, т.к. дочка пошл...
98061      1  This WAS a great everyday coffee until Green M...
98062      1  I bought these because of the good reviews and...
98063      5  I'm so thankful that Trader Joe's is selling s...
98064      5  Excellent crunch. Very fresh. Big size bags. T...


In [137]:
# Manual English translation
english_translation = "I decided to order these gelatins because my daughter went to school and wants sweets. Very convenient packaging. Just the right amount of candy to eat during the break."

test_df.loc[98060, 'Text'] = english_translation
print(test_df.iloc[98055:98065])

          Id                                               Text
98055  98055  There are some people in this world who really...
98056  98056  Very aromatic; equally good, if not better, th...
98057  98057  These are seriously great. I just bought a sec...
98058  98058  Fresh and tasty corn.  I just got back into ma...
98059  98059  this is the most delicious flavor of coffee an...
98060  98060  I decided to order these gelatins because my d...
98061  98061  It is the same as the spaghetti. It tastes goo...
98062  98062  My whole family loves this product, I find tha...
98063  98063  I'm a big fan of the higher end blancos, serve...
98064  98064  Sugar is toxic. Why wouldn't anyone use Stevia...


In [None]:
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Converts POS tags to a format recognized by WordNet lemmatizer."""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def preprocess_text(df, text_column='Text'):
    """
    Preprocesses text data by:
    - Converting to lowercase
    - Removing punctuation
    - Removing HTML and URLs
    - Removing digits
    - Expanding contractions
    - Removing stopwords
    - Tokenizing
    - Lemmatizing
    
    Args:
        df (pd.DataFrame): DataFrame containing text data.
        text_column (str): Name of the column containing text.
    
    Returns:
        pd.DataFrame: Processed DataFrame with a new column 'Text_Processed'.
    """
    df = df.copy()
    
    df['Text_Processed'] = df[text_column].replace(r'<a\s+href="http[^"]*">|</a>', '', regex=True)
    
    df['Text_Processed'] = df['Text_Processed'].astype(str).str.lower()
    
    df['Text_Processed'] = df['Text_Processed'].str.translate(str.maketrans('', '', string.punctuation))
    
    # Remove digits
    df['Text_Processed'] = df['Text_Processed'].replace(r'\d+', '', regex=True)
    
    # Expand contractions and remove stopwords
    stop_words = set(stopwords.words('english'))
    def expand_and_remove_stopwords(text):
        expanded_text = contractions.fix(text)
        words = expanded_text.split()
        return ' '.join([word for word in words if word.lower() not in stop_words])
    
    df['Text_Processed'] = df['Text_Processed'].apply(expand_and_remove_stopwords)
    
    # Tokenization and Lemmatization
    def tokenize_and_lemmatize(text):
        tokens = word_tokenize(text)
        return ' '.join([lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens])
    
    df['Text_Processed'] = df['Text_Processed'].apply(tokenize_and_lemmatize)
    
    # Save to new CSV file
    df.to_csv('test_processed2.csv', index=False)
    return df

test_df = preprocess_text(test_df)