In [None]:
'''
This script reads a CSV of Twitter customer service interactions,
             filters relevant columns, separates customer and agent messages,
             merges them into question-answer pairs, cleans the text, and saves
             the result as a CSV for FAQ purposes.
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
'''

import pandas as pd
import re

df = pd.read_csv("twcs.csv", low_memory=False)

# Keep relevant columns
df = df[["tweet_id", "author_id", "inbound", "response_tweet_id", "text"]]

# Convert IDs to string for merging
df["tweet_id"] = df["tweet_id"].astype(str)
df["response_tweet_id"] = df["response_tweet_id"].astype(str)

# Separate customer and agent messages
customer_msgs = df[df["inbound"] == True]
agent_msgs = df[df["inbound"] == False]

# Merge customer queries with corresponding agent responses
faq_pairs = customer_msgs.merge(
    agent_msgs,
    how="inner",
    left_on="response_tweet_id",
    right_on="tweet_id",
    suffixes=("_customer", "_agent")
)

# Keep only text columns
faq_pairs = faq_pairs[["text_customer", "text_agent"]].dropna()

# ---- CLEAN TEXT ----
def clean_text(text):
    text = re.sub(r"@\w+", "", text)         # remove mentions
    text = re.sub(r"http\S+", "", text)      # remove URLs
    text = re.sub(r"\s+", " ", text)         # collapse multiple spaces
    return text.strip()

#Apply clean_text to customer and agent messages
faq_pairs["question"] = faq_pairs["text_customer"].apply(clean_text)
faq_pairs["answer"] = faq_pairs["text_agent"].apply(clean_text)

#keep only cleaned question-answer pairs
faq_pairs = faq_pairs[["question", "answer"]]

#save the cleaned FAQ dataset 
faq_pairs.to_csv("faq_auto.csv", index=False)
print("Clean FAQ dataset saved:", len(faq_pairs), "pairs")
print(faq_pairs.sample(5))


Clean FAQ dataset saved: 1016105 pairs
                                                 question  \
696839  The only number that appears is a fax in Chile...   
639630  I need some immediate help, please, with docum...   
95263   BA0455 man on crutches in exit aisle, 9F. Safe...   
69234   yo website ain’t working can you please take a...   
556414      when we gonna fix this I️ I️ I️ issue? 🤷🏽‍♀️🙄   

                                                   answer  
696839  This is the telephone number for enquires that...  
639630  Hey Jaime, to further investigate please write...  
95263   Hi Paul, did you discuss this with the crew on...  
69234   We're sorry that you're having an issue! Pleas...  
556414  Here’s what you can do to work around the issu...  


# Download sentence-transformers library

In [2]:
!pip install sentence-transformers




[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# AUTO FAQ Embedding Generation

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

#Load pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

answers = faq_pairs["answer"].tolist()
embeddings = []

batch_size = 128  # Batch processing to avoid memory issues
for i in range(0, len(answers), batch_size): 
    batch = answers[i:i+batch_size]
    emb = model.encode(batch, convert_to_numpy=True)
    embeddings.append(emb)

embeddings = np.vstack(embeddings) #combine all batches into a single Numpy array
np.save("answer_embeddings.npy", embeddings) #save it