In [1]:
import pandas as pd
from tqdm import tqdm
import time
import re
import random

# Load the raw customer-support tweets. Forward slashes are used so the relative
# path resolves on Windows, Linux and macOS alike (backslashes only work on Windows).
data = pd.read_csv('../../Data/raw/twcs/twcs.csv')

# Number of unique inbound users used to build conversations. This is a count of
# users, not of rows. Set it to -1 to use every inbound user.
unique_user_count = 20

In [2]:
# Encode missing reply links as -1 so later steps can treat -1 as "no tweet"
data["in_response_to_tweet_id"] = data["in_response_to_tweet_id"].fillna(-1).astype(int)
data["response_tweet_id"] = data["response_tweet_id"].fillna(-1)

# Distinct authors of inbound tweets
inbound_authors = data.loc[data["inbound"] == True, "author_id"].unique()

# Seed for the user sample. None keeps the draw different on every run;
# set an integer to make Restart & Run All pick the same users.
SAMPLE_SEED = None

if unique_user_count == -1:
    unique_users = inbound_authors
else:
    # Clamp the request: random.sample raises ValueError when asked for more
    # items than the population contains.
    sample_size = min(unique_user_count, len(inbound_authors))
    unique_users = random.Random(SAMPLE_SEED).sample(list(inbound_authors), sample_size)

In [3]:
def extract_responses(response_ids):
    """
    Extract response IDs as a list of integers.

    Accepts a single numeric value (e.g. ``-1`` or ``12.0``), a string holding
    one ID (``"12"``) or a comma-separated string of IDs (``"12,13"``).
    Missing input (``None`` or NaN) yields an empty list, and empty tokens such
    as the one produced by a trailing comma are skipped.

    Raises ValueError if a non-empty token is not an integer literal.
    """
    if response_ids is None:
        return []
    # NaN is the only float that is not equal to itself
    if isinstance(response_ids, float) and response_ids != response_ids:
        return []

    try:
        return [int(response_ids)]
    except (TypeError, ValueError):
        # Not a single number: treat it as a comma-separated list of IDs
        return [int(token) for token in str(response_ids).split(",") if token.strip()]

def extract_conversation(conv, response_num, data, comp_name=None):
    """
    Build the transcript of a reply thread starting at a given tweet ID.

    The thread is walked depth-first (first listed reply first) using an
    explicit stack. Every tweet found in ``data`` appends one line to the
    transcript, prefixed with "Customer: " when its ``inbound`` flag is truthy
    and "Company: " otherwise. An ID of -1, or an ID with no matching row,
    contributes nothing. The author of the first company tweet encountered is
    kept as the company name unless one was passed in.

    Returns a ``(transcript, company_name)`` tuple; the name may be None.
    """
    transcript = conv
    company = comp_name
    pending = [response_num]

    while pending:
        tweet_id = pending.pop()
        if tweet_id == -1:
            continue

        match = data[data["tweet_id"] == tweet_id]
        if match.empty:
            continue

        if match["inbound"].values[0]:
            speaker = "Customer: "
        else:
            speaker = "Company: "
            if company is None:
                company = match["author_id"].values[0]

        transcript += "\n"
        transcript += speaker
        transcript += match["text"].values[0]

        replies = extract_responses(match["response_tweet_id"].values[0])
        # Push in reverse so the first reply is popped (and fully explored) first
        pending.extend(reversed(replies))

    return transcript, company


In [4]:
def process_conversations(data, unique_users):
    """
    Process all conversations and create a DataFrame of user IDs, conversations, and company names.

    For every user in ``unique_users`` the tweets they authored whose
    ``in_response_to_tweet_id`` is -1 are taken as conversation openers. Each
    reply ID listed on an opener starts its own conversation, expanded with
    ``extract_conversation``.

    ``unique_users`` is iterated exactly once (with a tqdm progress bar), so
    any iterable works, including a one-shot generator.

    Returns a DataFrame with one row per conversation and the columns
    ``user_id``, ``conversations`` and ``company_name``.
    """
    # The opener mask does not depend on the user, so evaluate it once
    # instead of rescanning the whole frame on every iteration.
    thread_starts = data[data["in_response_to_tweet_id"] == -1]

    user_ids = []
    conversations = []
    company_names = []

    for user_id in tqdm(unique_users):
        user_requests = thread_starts[thread_starts["author_id"] == user_id]

        for _, user_request in user_requests.iterrows():
            opening = f"Customer: {user_request['text']}"

            for response in extract_responses(user_request["response_tweet_id"]):
                conversation, company = extract_conversation(opening, response, data)
                # Record the owner with each conversation so the three columns
                # stay aligned without a second pass over unique_users.
                user_ids.append(user_id)
                conversations.append(conversation)
                company_names.append(company)

    return pd.DataFrame({
        "user_id": user_ids,
        "conversations": conversations,
        "company_name": company_names
    })


In [5]:
# Build one row per (user, conversation) from the sampled users' threads
sample_data = process_conversations(data=data, unique_users=unique_users)

100%|██████████| 20/20 [00:01<00:00, 11.49it/s]


In [6]:
def find_subsets(df):
    """
    Find conversations that are strictly contained in another conversation
    of the same user and company group.

    Rows are grouped by ``user_id`` and ``company_name``. Within each group
    every ordered pair of conversations is compared, and a pair is reported when
    the first text is a substring of the second and the two texts differ.
    Identical conversations are deliberately not reported as subsets of each
    other: doing so would make a text-based drop remove every copy.

    Returns a DataFrame with the columns ``user_id``, ``company_name``,
    ``subset_conversation`` and ``parent_conversation``.
    """
    subset_records = []

    for (user_id, company_name), group in df.groupby(['user_id', 'company_name']):
        conversations = group['conversations'].tolist()
        for candidate in conversations:
            for other in conversations:
                # Differing texts also rules out comparing a row with itself
                if candidate != other and candidate in other:
                    subset_records.append((user_id, company_name, candidate, other))

    return pd.DataFrame(subset_records, columns=['user_id', 'company_name', 'subset_conversation', 'parent_conversation'])

def drop_subset_conversations(df):
    """
    Drop subset conversations from the DataFrame.

    A row is removed only when its own (user_id, company_name, conversation)
    combination was reported by ``find_subsets``. Matching on the full key,
    rather than on the conversation text alone, keeps an identical
    conversation that belongs to a different user or company.

    Returns an independent copy with the original index labels preserved, so
    callers can add columns to the result without a SettingWithCopyWarning.
    """
    subset_df = find_subsets(df)
    redundant = set(zip(
        subset_df['user_id'],
        subset_df['company_name'],
        subset_df['subset_conversation'],
    ))

    keep = [
        key not in redundant
        for key in zip(df['user_id'], df['company_name'], df['conversations'])
    ]
    # An explicit boolean Series keeps this a row filter even for an empty frame,
    # where a bare empty list would be read as a list of column labels.
    keep_mask = pd.Series(keep, index=df.index, dtype=bool)
    return df[keep_mask].copy()


In [7]:
# Remove conversations that are contained in a longer one
sample_data = drop_subset_conversations(df=sample_data)

In [8]:
def clean_text(text):
    """
    Clean text by removing mentions, URLs, and special characters.

    Steps, in order:
      1. remove ``@mention`` tokens;
      2. remove URLs: anything starting with ``http://`` or ``https://``, and
         ``www.`` hosts that start a word. Ordinary words that merely contain
         "http" or "www" (e.g. "httpd", "awwww") are left alone;
      3. remove every character that is neither a word character nor
         whitespace (this also removes the colon after role labels);
      4. collapse runs of spaces/tabs into one space and strip both ends.

    Newlines inside the text are preserved.
    """
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters (\s already covers newlines)
    text = re.sub(r'[ \t]+', ' ', text).strip()  # Collapse spaces/tabs but keep newlines
    return text

# Add a cleaned version of every conversation next to the raw text
cleaned_column = sample_data['conversations'].apply(clean_text)
sample_data['cleaned_conversations'] = cleaned_column


In [9]:
def valid_conversation(df):
    """
    Validate conversations by ensuring 'Customer' and 'Company' roles exist
    and the company name is not missing.

    A row is kept when its ``cleaned_conversations`` text contains both
    "Customer" and "Company" and its ``company_name`` is neither None nor NaN.
    Rows are selected with a boolean mask aligned to the frame's own index, so
    the result is correct whatever index labels the input carries.

    The input frame is not modified. Returns a new DataFrame with a fresh
    0..n-1 index.
    """
    keep = [
        "Customer" in txt and "Company" in txt and not pd.isna(comp_name)
        for txt, comp_name in zip(df["cleaned_conversations"], df["company_name"])
    ]
    keep_mask = pd.Series(keep, index=df.index, dtype=bool)
    return df[keep_mask].reset_index(drop=True)

# Keep only conversations that contain both roles and have a company name
sample_data = valid_conversation(df=sample_data)

In [10]:
def to_structured(txt, comp_name):
    """
    Convert conversation text into a structured format with role and message pairs.

    A turn starts wherever a line begins with the word "Customer" or
    "Company"; its message runs up to the next such marker (or the end of the
    text). Turns are emitted in the order they occur, so consecutive turns by
    the same role and a trailing customer turn are kept. The role words are
    not treated as markers when they appear in the middle of a line.
    Newlines inside a message are removed and the message is stripped.

    Limitation: a continuation line of a multi-line message that itself begins
    with "Customer" or "Company" is read as a new turn.

    Returns ``[{"Company_name": comp_name}, {"conversation": [...]}]`` where
    each conversation item is ``{"role": ..., "message": ...}``.
    """
    markers = list(re.finditer(r"^(Customer|Company)\b", txt, flags=re.MULTILINE))

    turns = []
    for position, marker in enumerate(markers):
        is_last = position + 1 == len(markers)
        message_end = len(txt) if is_last else markers[position + 1].start()
        message = txt[marker.end():message_end].replace("\n", "")
        turns.append({"role": marker.group(1), "message": message.strip()})

    return [{"Company_name": comp_name}, {"conversation": turns}]

# Apply the structured conversion. The column is built from the two source columns
# directly instead of a row-wise DataFrame.apply, so a frame with no rows still gets an
# (empty) object column rather than failing on assignment.
sample_data['structured_conversations'] = pd.Series(
    [
        to_structured(cleaned, company)
        for cleaned, company in zip(sample_data['cleaned_conversations'], sample_data['company_name'])
    ],
    index=sample_data.index,
    dtype=object,
)

In [11]:
# Display the structured sample for a visual check
sample_data

Unnamed: 0,user_id,conversations,company_name,cleaned_conversations,structured_conversations
0,208663,Customer: .@115940 is Adding a Dynamic Guide t...,hulu_support,Customer is Adding a Dynamic Guide to Their Li...,"[{'Company_name': 'hulu_support'}, {'conversat..."
1,208663,Customer: .@115940 Adds 9 New FOX Affiliates N...,hulu_support,Customer Adds 9 New FOX Affiliates Now Has Mor...,"[{'Company_name': 'hulu_support'}, {'conversat..."
2,161287,Customer: Getting McDonald’s for breakfast 😋 @...,McDonalds,Customer Getting McDonalds for breakfast \nCom...,"[{'Company_name': 'McDonalds'}, {'conversation..."
3,237701,Customer: @116062 my daughter birthday is Sund...,AskTarget,Customer my daughter birthday is Sunday do you...,"[{'Company_name': 'AskTarget'}, {'conversation..."
4,237701,Customer: @115858 fix this Damn ios11 or take ...,AppleSupport,Customer fix this Damn ios11 or take this shit...,"[{'Company_name': 'AppleSupport'}, {'conversat..."
5,343422,Customer: @122172 @ATVIAssist y I'm I no longe...,ATVIAssist,Customer y Im I no longer receiving keys The p...,"[{'Company_name': 'ATVIAssist'}, {'conversatio..."
6,380403,Customer: @118625 why stop serving breakfast @...,TacoBellTeam,Customer why stop serving breakfast Milltown N...,"[{'Company_name': 'TacoBellTeam'}, {'conversat..."
7,277202,"Customer: @116062 in Bristow VA, I expect bett...",AskTarget,Customer in Bristow VA I expect better of you ...,"[{'Company_name': 'AskTarget'}, {'conversation..."
8,226343,Customer: @comcastcares my DVR recording someh...,comcastcares,Customer my DVR recording somehow of tonights ...,"[{'Company_name': 'comcastcares'}, {'conversat..."
9,619562,Customer: Had to write out my @ChipotleTweets ...,ChipotleTweets,Customer Had to write out my order for my sist...,"[{'Company_name': 'ChipotleTweets'}, {'convers..."


In [12]:
# Save the structured data. The file name carries the user count and a minute-resolution
# timestamp. Forward slashes keep the relative path valid on Windows, Linux and macOS.
output_name = f'twcs_structured_UniqueCount-{unique_user_count}_time-{time.strftime("%Y%m%d-%H%M")}.xlsx'
sample_data.to_excel(f'../../Data/processed/sample/{output_name}', index=False)