In [None]:
import pandas as pd

# Load the dataset (replace the path below with your own file location).
# Forward slashes are used so the relative path resolves on Windows and POSIX alike.
data = pd.read_csv('../Data/raw/twcs/twcs.csv')

# Number of distinct customer accounts to keep for the extraction below.
N_USERS = 10000

# Select the first N_USERS unique users who are customers (inbound == True)
unique_users = data[data["inbound"] == True]["author_id"].unique()[:N_USERS]

In [7]:
import pandas as pd
from tqdm import tqdm

# Preprocess the data: make "no linked tweet" explicit in both reply-link columns.
# The parent id becomes the integer -1; an explicit 64-bit dtype is requested because
# plain `int` maps to a platform-dependent width (32-bit on some Windows/NumPy builds).
data["in_response_to_tweet_id"] = data["in_response_to_tweet_id"].fillna(-1).astype("int64")
# The response column is filled with the string "-1" (not the integer) — it is parsed
# later by extract_responses().
data["response_tweet_id"] = data["response_tweet_id"].fillna("-1")

# Helper function to extract responses
def extract_responses(response_ids):
    """Parse a response-id field into a list of integer tweet ids.

    Args:
        response_ids: Either a single value accepted by ``int()`` (e.g. ``"5"``
            or ``-1``) or a comma-separated string of ids (e.g. ``"3,1"``).

    Returns:
        list[int]: The parsed ids in their original order. Empty fragments
        (such as the one produced by a trailing comma, or an empty string)
        are skipped, so the result may be an empty list.

    Raises:
        ValueError: If a non-empty fragment is not an integer literal.
    """
    try:
        return [int(response_ids)]
    except ValueError:
        # Only a failed integer parse means "this is a comma-separated list";
        # anything else (KeyboardInterrupt, wrong types, ...) must propagate.
        return [int(part) for part in response_ids.split(",") if part.strip()]

# Recursive function to extract the conversation
def extract_conversation(conv, response_num, data, comp_name):
    """Append the tweet ``response_num`` and everything replying to it to ``conv``.

    Args:
        conv: Conversation text accumulated so far.
        response_num: Tweet id to append next; ``-1`` means there is nothing to add.
        data: DataFrame providing the ``tweet_id``, ``inbound``, ``author_id``,
            ``text`` and ``response_tweet_id`` columns.
        comp_name: Company author id found so far, or ``None`` if none was seen yet.

    Returns:
        tuple: ``(conv, comp_name)`` — the extended conversation text and the
        author id of the first non-inbound tweet encountered (unchanged if one
        was already supplied, ``None`` if no such tweet was met).
    """
    # Guard: sentinel id, nothing further to follow.
    if response_num == -1:
        return conv, comp_name

    matches = data[data["tweet_id"] == response_num]
    # Guard: the referenced tweet is not present in the data.
    if len(matches) == 0:
        return conv, comp_name

    if matches["inbound"].values[0]:
        speaker = "Customer: "
    else:
        speaker = "Company: "
        # Remember only the first company author met along the walk.
        if comp_name is None:
            comp_name = matches["author_id"].values[0]
    conv = conv + "\n" + speaker + matches["text"].values[0]

    # Follow every reply in turn; all branches are appended to the same string.
    for child_id in extract_responses(matches["response_tweet_id"].values[0]):
        conv, comp_name = extract_conversation(conv, child_id, data, comp_name)
    return conv, comp_name

# Initialize lists for storing results
all_conversations = []
all_company_names = []
user_based_convs = []
# One user id per extracted conversation, kept aligned with all_conversations.
user_id_list = []

# Conversation starters are tweets that reply to nothing (-1). This filter does not
# depend on the user, so it is computed once instead of on every loop iteration.
root_tweets = data[data["in_response_to_tweet_id"] == -1]

# Extract user conversations
for userid in tqdm(unique_users):
    user_conversations = []
    user_requests = root_tweets[root_tweets["author_id"] == userid]
    for request_text, response_ids in zip(user_requests["text"], user_requests["response_tweet_id"]):
        conv = "Customer: " + request_text
        # Every direct reply to the opening tweet starts its own conversation.
        for response in extract_responses(response_ids):
            convers, comp_name = extract_conversation(conv, response, data, comp_name=None)
            all_conversations.append(convers)
            all_company_names.append(comp_name)
            user_conversations.append(convers)
            user_id_list.append(userid)
    user_based_convs.append(user_conversations)

# Create a DataFrame for conversations (one row per extracted conversation)
df = pd.DataFrame({"user_id": user_id_list, "conversations": all_conversations, "company_name": all_company_names})

# Function to print conversation history for a specific user
def print_conv_History(user_id):
    """Print every conversation stored in the global ``df`` for ``user_id``.

    Each conversation is printed in full and followed by a ``print("\\n")``
    separator. Nothing is printed when the user has no rows.

    Args:
        user_id: Value compared for equality against the ``user_id`` column.
    """
    user_rows = df[df["user_id"] == user_id]
    for conversation in user_rows["conversations"].values:
        print(conversation)
        print("\n")

# Example: print the conversation history stored for one user.
# The id is passed as a string, so it is compared as a string against df["user_id"].
print_conv_History("115767")

100%|██████████| 100/100 [00:09<00:00, 10.17it/s]


Customer: Any help here @AdobeCare? https://t.co/x50e57UG4E
Company: @115767 Hi Jason, please let us know if there is anything we can do to help. Thanks! ^AJ https://t.co/iuwZCjz4Or
Customer: @AdobeCare Pretty much explained my issue in the quoted tweet... dragging an image onto a canvas no longer center snaps it...
Company: @115767 Hi Jason, could you please try resetting the preferences of Photoshop &amp; let us know if it helps with the issue https://t.co/j3Dj2HDknM   ^SC
Customer: @AdobeCare the ctrl+alt+shift on load method didn't work. (2/2)
Company: @115767 Sorry that this isn't more simple... let us know if the steps in the video work out or if you need further help ^Madison
Customer: @AdobeCare Didn't work. Tried it on another computer, fresh install, same thing...
Company: @115767 Would you please DM the Adobe Product &amp; your purchase details that you have so that we can ask our experts to follow up. ^Raj https://t.co/iuwZCjz4Or
Customer: @AdobeCare sure thing
Customer: @A

In [None]:
# Persist the extracted conversations without the index column.
# Forward slashes only: the previous mix of "\\" and "/" resolved on Windows alone.
df.to_excel("../Data/processed_conversations_10k.xlsx", index=False)

---

In [None]:
# Work on an independent copy so that later column additions or in-place edits on
# sample_data can never leak back into df (plain assignment would only alias it).
sample_data = df.copy()

In [None]:
# Step 1: find all subset conversations that are present in the parent conversation and drop them
def find_subsets(df):
    """Find conversations strictly contained in another conversation of the same group.

    Rows are grouped by ``(user_id, company_name)``; within each group every
    ordered pair of conversations is compared with a substring test.

    Identical conversations are NOT reported as subsets of one another: a pair
    is recorded only when the texts differ. Otherwise exact duplicates would
    flag each other and every copy would be discarded downstream.

    Args:
        df: DataFrame with ``user_id``, ``company_name`` and ``conversations``
            columns.

    Returns:
        pd.DataFrame: One row per (subset, parent) pair with the columns
        ``user_id``, ``company_name``, ``subset_conversation`` and
        ``parent_conversation``. Empty (but with those columns) when nothing
        is contained in anything else.
    """
    subset_records = []
    # NOTE(review): groupby() is called with its defaults, which presumably skip
    # rows whose user_id/company_name is missing — confirm that is intended.
    grouped = df.groupby(['user_id', 'company_name'])

    for (user_id, company_name), group in grouped:
        conversations = group['conversations'].tolist()
        for candidate in conversations:
            for other in conversations:
                # Strict containment: differing texts also rules out comparing a
                # conversation with itself.
                if candidate != other and candidate in other:
                    subset_records.append((user_id, company_name, candidate, other))

    return pd.DataFrame(subset_records, columns=['user_id', 'company_name', 'subset_conversation', 'parent_conversation'])

# Identify subset conversations.
# NOTE(review): drop_subset_conversations() below calls find_subsets() on its own, so
# this result is not consumed within this cell — presumably kept for inspection; confirm.
subset_df = find_subsets(sample_data)

# Function to drop subset conversations
def drop_subset_conversations(df):
    """Return ``df`` without the conversations reported by ``find_subsets``.

    A row is removed only when its own ``(user_id, company_name, conversation)``
    triple was reported as a subset. Matching on the text alone would also
    remove rows of other users or companies that merely share the same text.

    Args:
        df: DataFrame with ``user_id``, ``company_name`` and ``conversations``
            columns.

    Returns:
        pd.DataFrame: An independent copy of the remaining rows with the
        original index labels, safe to add columns to.
    """
    subset_df = find_subsets(df)
    subset_keys = set(zip(
        subset_df['user_id'],
        subset_df['company_name'],
        subset_df['subset_conversation'],
    ))
    row_keys = zip(df['user_id'], df['company_name'], df['conversations'])
    is_subset = pd.Series(
        [key in subset_keys for key in row_keys],
        index=df.index,
        dtype=bool,
    )
    # copy() detaches the result from df so later column assignment does not
    # raise SettingWithCopyWarning.
    cleaned_df = df[~is_subset].copy()
    return cleaned_df

# Drop subset conversations; sample_data is rebound to the filtered result, so
# from here on it no longer contains the rows reported by find_subsets().
sample_data = drop_subset_conversations(sample_data)

In [None]:
# Step 2: Correct spelling
import re
from spellchecker import SpellChecker

# Function to clean the text
def clean_text(text):
    """Strip mentions, URLs and punctuation from ``text`` while keeping its line breaks.

    HTML entities are decoded first. Otherwise an entity such as ``&amp;``
    would lose its ``&`` and ``;`` in the punctuation step and leave the stray
    word ``amp`` behind.

    Args:
        text: Raw conversation text.

    Returns:
        str: The cleaned text. Runs of spaces/tabs are collapsed to one space
        and leading/trailing whitespace of the whole string is removed;
        newline characters inside the text are kept.
    """
    import html  # local import: keeps this cell self-contained

    text = html.unescape(text)  # Decode entities such as &amp; before stripping punctuation
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s\n]', '', text)  # Remove special characters but keep newline characters
    text = re.sub(r'[ \t]+', ' ', text).strip()  # Remove extra spaces but preserve newlines
    return text

# Apply the cleaning function to the conversations column.
# assign() returns a new frame instead of writing into sample_data, which may be a
# filtered slice of another frame (column assignment on such a slice triggers
# pandas' SettingWithCopyWarning).
sample_data = sample_data.assign(
    cleaned_conversations=sample_data['conversations'].apply(clean_text)
)

# Initialize the spell checker
spell = SpellChecker()

# Function to correct misspelled words
def correct_mistypo(text):
    """Spell-correct ``text`` word by word while keeping its line structure.

    Each line (split on ``"\\n"``) is corrected separately and the lines are
    joined with ``"\\n"`` again, so the newlines separating conversation turns
    are preserved. Splitting the whole text on whitespace would flatten every
    conversation onto a single line.

    Args:
        text: Cleaned conversation text.

    Returns:
        str: The text with every word replaced by the global spell checker's
        suggestion. Words within a line are separated by single spaces; a word
        is kept as-is when no suggestion is returned.
    """
    corrections = {}  # per-call memo so a repeated word is corrected only once

    def _fix(word):
        if word not in corrections:
            # Fall back to the original word when the checker returns nothing.
            corrections[word] = spell.correction(word) or word
        return corrections[word]

    return '\n'.join(
        ' '.join(_fix(word) for word in line.split())
        for line in text.split('\n')
    )

# Apply the correction function to the cleaned conversations.
# assign() returns a new frame rather than writing into a possibly-sliced one,
# which avoids pandas' SettingWithCopyWarning.
sample_data = sample_data.assign(
    corrected_conversations=sample_data['cleaned_conversations'].apply(correct_mistypo)
)

sample_data.head()
