In [42]:
import functools
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this script relies on: the tokenizer model,
# the stop-word lists and the WordNet data (only needs to run once).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Read the three emoji sources (two CSV files and one Parquet file)
df_emojipedia = pd.read_csv('../data/emojipedia.csv')
df_openmoji = pd.read_csv('../data/openmoji.csv')
df_llemoji = pd.read_parquet('../data/llmemoji.parquet')

# Emojipedia: build the 'unicode' key from 'Codepoints Hex' with the "U+"
# prefix stripped, stored as strings
df_emojipedia['unicode'] = (
    df_emojipedia['Codepoints Hex']
    .str.replace('U+', '', regex=False)
    .astype(str)
)

# LLM emoji: strip the "U+" prefix from the existing 'unicode' column,
# stored as strings
df_llemoji['unicode'] = (
    df_llemoji['unicode']
    .str.replace('U+', '', regex=False)
    .astype(str)
)

# OpenMoji: discard its own 'unicode' column and use 'hexcode' as the
# 'unicode' key instead, stored as strings
df_openmoji = (
    df_openmoji
    .drop(columns=['unicode'])
    .rename(columns={'hexcode': 'unicode'})
)
df_openmoji['unicode'] = df_openmoji['unicode'].astype(str)

# Function to clean text
def clean_text(text):
    """Return *text* lowercased with everything but letters and whitespace removed.

    Digits, punctuation and any other non-ASCII-letter characters are
    deleted (not replaced by a space). Any non-string input, such as
    ``None`` or a float NaN, yields an empty string.
    """
    if not isinstance(text, str):
        return ''
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, built on first use."""
    # stopwords.words() hands back a list; building the set once gives
    # constant-time membership tests and avoids repeating the call per token.
    return frozenset(stopwords.words('english'))


# Function to preprocess text
def preprocess_text(text):
    """Clean, tokenize, stop-word-filter and lemmatize *text*.

    Steps, in order:
      1. ``clean_text`` normalises the raw value (non-strings become '').
      2. ``word_tokenize`` splits the cleaned text into tokens.
      3. Tokens found in the English stop-word list are dropped.
      4. The remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Returns the surviving lemmas joined by single spaces (an empty string
    when nothing survives).
    """
    tokens = word_tokenize(clean_text(text))
    # Looked up once per call instead of once per token.
    stop_words = _english_stopwords()
    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(
        lemmatize(word) for word in tokens if word not in stop_words
    )

# Apply the preprocessing function to the 'Description' column in df_emojipedia
df_emojipedia['cleaned_description'] = df_emojipedia['Description'].apply(preprocess_text)

# Keep only the OpenMoji rows that have tags. The explicit .copy() makes the
# filtered frame independent of the original, so adding a column below is not
# an assignment on a slice (which raises pandas' SettingWithCopyWarning).
df_openmoji = df_openmoji[df_openmoji['openmoji_tags'].notna()].copy()

# Apply the preprocessing function to the 'openmoji_tags' column in df_openmoji
df_openmoji['cleaned_openmoji_tags'] = df_openmoji['openmoji_tags'].apply(preprocess_text)

# Apply the preprocessing function to the 'LLM description' column in df_llemoji
df_llemoji['cleaned_llm_description'] = df_llemoji['LLM description'].apply(preprocess_text)

# Outer-join the three sources on 'unicode'. Only the key and the cleaned
# column of each source take part in the merge: the other columns are not
# written out, and leaving them out prevents same-named columns in different
# sources from being renamed with _x/_y suffixes.
final_df = pd.merge(
    df_emojipedia[['unicode', 'cleaned_description']],
    df_llemoji[['unicode', 'cleaned_llm_description']],
    on='unicode',
    how='outer',
)
final_df = pd.merge(
    final_df,
    df_openmoji[['unicode', 'cleaned_openmoji_tags']],
    on='unicode',
    how='outer',
)

# Save the final DataFrame to a csv file with a fixed column order
final_df = final_df[['unicode', 'cleaned_openmoji_tags', 'cleaned_description', 'cleaned_llm_description']]
# NOTE(review): the file name is spelled 'prerprocessedtext'; it is kept as-is
# because other code may read this exact path - confirm before renaming.
final_df.to_csv('../data/prerprocessedtext_emoji.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rajprasadshrestha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rajprasadshrestha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rajprasadshrestha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
