In [None]:
# Import necessary libraries

import ast
import os
import re
import sys

import numpy as np
import pandas as pd

In [None]:
# Load the dataset from the specified directory

# Data directories, anchored at the current working directory.
DATA_RAW_PATH = os.path.join(os.getcwd(), '../data/raw')
DATA_INTERIM_PATH = os.path.join(os.getcwd(), '../data/interim')


def _read_raw_tsv(file_name, column_names):
    """Load a header-less, tab-separated file from DATA_RAW_PATH and name its columns.

    Lines that pandas cannot parse are skipped (``on_bad_lines='skip'``), so
    the returned frame may hold fewer rows than the file has lines.
    """
    tsv_path = os.path.join(os.getcwd(), DATA_RAW_PATH, file_name)
    frame = pd.read_csv(tsv_path, sep='\t', header=None, on_bad_lines='skip')
    frame.columns = column_names
    return frame


# One row per conversation; `frase_ids` holds the stringified list of line ids.
conversations_df = _read_raw_tsv(
    'movie_conversations.tsv',
    ["char_id1", "char_id2", "movie_id", "frase_ids"],
)

# One row per spoken line; `line_id` is the key referenced by `frase_ids`.
lines_df = _read_raw_tsv(
    'movie_lines.tsv',
    ["line_id", "char_id", "movie_id", "char_name", "text"],
)

In [None]:
# Function to split any row into overlapping pairs of frase_ids

def split_frase_ids(df):
    """Expand each conversation row into one row per consecutive pair of line ids.

    The ``frase_ids`` column is expected to hold a stringified list of line
    ids, either whitespace-separated (``"['L1' 'L2' 'L3']"``) or
    comma-separated (``"['L1', 'L2', 'L3']"``). A row with ids ``a, b, c``
    becomes two rows whose ``frase_ids`` are ``"['a', 'b']"`` and
    ``"['b', 'c']"``; every other column is repeated unchanged.

    Rows that cannot yield a pair (fewer than two ids, or a missing /
    non-string value) are passed through untouched.

    Because commas are accepted as separators, the function can be applied to
    its own output without corrupting the ids.

    Args:
        df: DataFrame with a ``frase_ids`` column.

    Returns:
        A new DataFrame with the same columns. Expanded rows keep the index
        label of the row they came from, so index labels may repeat.
    """
    if df.empty:
        # Nothing to expand; keep the column layout instead of returning a
        # column-less frame.
        return df.copy()

    # Drop quotes and brackets, and turn commas into spaces so that both list
    # styles split the same way.
    cleanup = str.maketrans({"'": None, "[": None, "]": None, ",": " "})

    def to_pairs(raw):
        # Always return a list so that explode() emits at least one row.
        if not isinstance(raw, str):
            return [raw]
        ids = raw.translate(cleanup).split()
        if len(ids) < 2:
            return [raw]
        return [str([left, right]) for left, right in zip(ids, ids[1:])]

    pair_lists = df['frase_ids'].map(to_pairs)
    return df.assign(frase_ids=pair_lists).explode('frase_ids')

# Rebind conversations_df to its pair-expanded version; the per-conversation
# frame loaded from the TSV is no longer reachable under this name afterwards.
conversations_df = split_frase_ids(conversations_df)

In [None]:
# Now we can merge the conversations_df with lines_df to get the text for each frase_id

def _first_two_ids(raw):
    """Return ``[id1, id2]`` parsed from a stringified list, padded with None.

    ast.literal_eval only accepts Python literals, so a crafted value coming
    from the data file cannot execute code the way eval() would.
    """
    ids = list(ast.literal_eval(raw)) if isinstance(raw, str) else []
    return (ids + [None, None])[:2]


# Split the stringified pair into two id columns and drop the source column.
# NOTE: this consumes `frase_ids`, so the preceding cells must be re-run
# before this cell can be executed a second time.
id_pairs = [_first_two_ids(raw) for raw in conversations_df['frase_ids']]
conversations_df = conversations_df.drop(columns=['frase_ids']).assign(
    frase_id1=[pair[0] for pair in id_pairs],
    frase_id2=[pair[1] for pair in id_pairs],
)

# Look up the text behind each id. Left joins keep pairs whose line is missing
# from lines_df; their text comes back as NaN and is removed below.
line_texts = lines_df[['line_id', 'text']]
merged_df = conversations_df.merge(
    line_texts.rename(columns={'line_id': 'frase_id1', 'text': 'text1'}),
    on='frase_id1', how='left'
).merge(
    line_texts.rename(columns={'line_id': 'frase_id2', 'text': 'text2'}),
    on='frase_id2', how='left'
)

# Keep only complete (text1, text2) pairs. dropna() returns a new frame, which
# avoids mutating a column selection in place.
merged_df = merged_df[['text1', 'text2']].dropna()

In [None]:
# Function to normalize text by converting to lowercase, removing digits, punctuation, and extra whitespace

def normalize_text(text):
    """Normalize a line of dialogue.

    Steps, in order: lowercase, remove runs of digits, remove punctuation
    (underscores included), collapse whitespace runs into a single space, and
    trim leading/trailing whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. It is empty when ``text`` consisted only of
        digits, punctuation and/or whitespace.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    # `_` is a word character as far as \w is concerned, so it has to be
    # listed explicitly to be stripped with the rest of the punctuation.
    text = re.sub(r'[^\w\s]|_', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs
    return text.strip()

# Normalize both sides of every pair.
# NOTE(review): normalize_text can return an empty string (e.g. for a line made
# only of punctuation); such rows are kept here - confirm that is intended.
for text_column in ("text1", "text2"):
    merged_df[text_column] = merged_df[text_column].apply(normalize_text)

In [None]:
# Save the processed DataFrame to a CSV file

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (a fresh checkout may not have data/interim yet).
os.makedirs(DATA_INTERIM_PATH, exist_ok=True)

# DATA_INTERIM_PATH is already anchored at the working directory, so it is
# used directly instead of being joined with os.getcwd() a second time.
merged_df.to_csv(os.path.join(DATA_INTERIM_PATH, 'conversations_prepared.csv'), index=False)