In [9]:
import pandas as pd
# Load the processed conversations workbook.
# The path is a raw string so every backslash is literal: the previous
# non-raw literal relied on the invalid escape "\p", which newer Python
# versions report as a SyntaxWarning. The resulting path is unchanged.
sample_data = pd.read_excel(r"..\Data\processed\processed_conversations_10k.xlsx")

In [10]:
# Step 1: find all subset conversations that are present in the parent conversation and drop them
def find_subsets(df):
    """Find conversations that are contained in another conversation of the same group.

    Rows are grouped by ``(user_id, company_name)``. Within each group every
    ordered pair of distinct rows is compared, and a record is produced whenever
    the first conversation's text occurs as a substring of the second's.
    Identical texts on two different rows therefore report each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id``, ``company_name`` and
        ``conversations``.

    Returns
    -------
    pandas.DataFrame
        One row per (subset, parent) pair with the columns ``user_id``,
        ``company_name``, ``subset_conversation`` and ``parent_conversation``.
        Empty (but with those columns) when nothing matches.
    """
    subset_records = []

    for (user_id, company_name), group in df.groupby(['user_id', 'company_name']):
        # Missing cells are not strings (e.g. NaN is a float) and would make the
        # substring test below raise TypeError, so only real strings are compared.
        conversations = [
            convo for convo in group['conversations'].tolist()
            if isinstance(convo, str)
        ]
        for i, convo1 in enumerate(conversations):
            for j, convo2 in enumerate(conversations):
                # i != j compares positions, not text, so duplicated texts on
                # different rows are still reported (in both directions).
                if i != j and convo1 in convo2:
                    subset_records.append((user_id, company_name, convo1, convo2))

    return pd.DataFrame(
        subset_records,
        columns=['user_id', 'company_name', 'subset_conversation', 'parent_conversation'],
    )

# Identify subset conversations (one row per subset/parent pair).
# NOTE: drop_subset_conversations() calls find_subsets() itself, so this
# module-level subset_df is not read by any later code visible in this notebook.
subset_df = find_subsets(sample_data)

# Function to drop subset conversations
def drop_subset_conversations(df):
    """Return a copy of ``df`` without conversations contained in a longer one.

    Subset relationships come from ``find_subsets`` and are applied per
    ``(user_id, company_name)`` group:

    * a conversation that is a substring of a *different* conversation in the
      same group is removed (all rows carrying that text in that group);
    * conversations that are exact duplicates of each other within a group are
      collapsed to their first occurrence instead of all being removed;
    * rows in other groups that merely share the same text are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``company_name`` and ``conversations``.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index labels preserved) with the rows removed.
    """
    subset_df = find_subsets(df)

    # find_subsets reports identical texts as subsets of each other; separate
    # those from genuine "shorter inside longer" matches.
    is_duplicate = subset_df['subset_conversation'] == subset_df['parent_conversation']

    def _keys(records):
        # (user_id, company_name, text) triples identify a text within its group.
        return set(zip(records['user_id'],
                       records['company_name'],
                       records['subset_conversation']))

    proper_subset_keys = _keys(subset_df[~is_duplicate])
    duplicate_keys = _keys(subset_df[is_duplicate])

    keep = []
    seen_duplicates = set()
    for key in zip(df['user_id'], df['company_name'], df['conversations']):
        if key in proper_subset_keys:
            keep.append(False)
        elif key in duplicate_keys:
            # Keep only the first row of a set of identical conversations.
            keep.append(key not in seen_duplicates)
            seen_duplicates.add(key)
        else:
            keep.append(True)

    # .loc with a boolean list filters rows by position; .copy() hands back an
    # independent frame so adding columns later cannot trigger
    # SettingWithCopyWarning against the input frame.
    return df.loc[keep].copy()

# Drop subset conversations from the original dataset.
# NOTE: this rebinds sample_data to the filtered frame, so the unfiltered data
# is only recoverable by re-running the cell that loads the workbook.
sample_data = drop_subset_conversations(sample_data)

In [11]:
# Step 2: Correct spelling
import re
from spellchecker import SpellChecker

# Function to clean the text
def clean_text(text):
    """Strip mentions, URLs and punctuation from ``text`` and tidy its spacing.

    The substitutions run in a fixed order: ``@mentions`` first, then anything
    starting with ``http`` or ``www``, then every character that is neither a
    word character nor whitespace. Finally runs of spaces/tabs collapse to one
    space (newlines are left alone) and the outer whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw conversation text.

    Returns
    -------
    str
        The cleaned text.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'@\w+', ''),             # mentions
        (r'http\S+|www\S+', ''),   # URLs
        (r'[^\w\s\n]', ''),        # special characters (newlines survive)
        (r'[ \t]+', ' '),          # repeated spaces/tabs, not newlines
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Apply the cleaning function to the conversations column
sample_data['cleaned_conversations'] = sample_data['conversations'].apply(clean_text)
# Save the cleaned data.
# The path is a raw string so every backslash is literal: the previous
# non-raw literal relied on the invalid escapes "\p" and "\c", which newer
# Python versions report as a SyntaxWarning. The resulting path is unchanged.
sample_data.to_excel(r"..\Data\processed\cleaned_conversations_10k_cleaned.xlsx", index=False)

In [None]:
# Initialize the spell checker once; correct_mistypo() below reads this
# module-level object on every call.
# NOTE(review): constructed with no arguments, so the library defaults apply
# (presumably the English dictionary — confirm).
spell = SpellChecker()

# Function to correct misspelled words
# Per-word memo of corrections. Conversations repeat the same words many times
# and each spell.correction() lookup is expensive, so every distinct word is
# looked up only once per kernel session.
_correction_cache = {}


def _correct_word(word):
    """Return the cached correction for ``word``, or ``word`` itself if none exists."""
    if word not in _correction_cache:
        suggestion = spell.correction(word)  # Most likely correction, may be None/empty
        _correction_cache[word] = suggestion if suggestion else word
    return _correction_cache[word]


def correct_mistypo(text):
    """Spell-correct every word of ``text`` while keeping its line structure.

    Each line is split on whitespace, every word is replaced by the spell
    checker's suggestion (the original word is kept when there is none), and
    the words are re-joined with single spaces. Lines are re-joined with
    ``'\\n'`` so the newlines preserved by ``clean_text`` are not flattened.
    For text without newlines the result is the same as joining all corrected
    words with single spaces.

    Parameters
    ----------
    text : str
        Cleaned conversation text.

    Returns
    -------
    str
        The corrected text.
    """
    corrected_lines = []
    for line in text.split('\n'):
        corrected_lines.append(' '.join(_correct_word(word) for word in line.split()))
    return '\n'.join(corrected_lines)

# Apply the correction function to the cleaned conversations
sample_data['corrected_conversations'] = sample_data['cleaned_conversations'].apply(correct_mistypo)

# Save the cleaned data. This writes to the same path as the save in the
# cleaning cell, so that file is overwritten with the extra column included.
# The path is a raw string so every backslash is literal: the previous
# non-raw literal relied on the invalid escapes "\p" and "\c", which newer
# Python versions report as a SyntaxWarning. The resulting path is unchanged.
sample_data.to_excel(r"..\Data\processed\cleaned_conversations_10k_cleaned.xlsx", index=False)