# Data Preprocessing 

In [None]:
from collections import Counter
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

import torch.utils.data
import math
import torch.nn.functional as F
from tqdm import tqdm

#### Steps for Data Cleaning:
1. Load the raw text of movie conversations and lines.
2. Create a dictionary to map each line's ID to its text.
3. Remove punctuation and convert the text to lowercase.
4. Create question-answer pairs.
5. Count word frequencies and build a vocabulary.
6. Encode the questions and answers using the vocabulary.

#### Functions for preprocessing the data:

In [None]:
# Input files, resolved relative to the current working directory
movie_conversations_path = 'movie_conversations.txt'
movie_lines_path = 'movie_lines.txt'

# Upper bound on the number of words kept per question / answer
max_sequence_length = 25

In [None]:
# Load a text corpus from a file and return as a list of lines
def load_corpus(file_path, encoding='utf-8', errors='replace'):
    """Read a text file and return its lines.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's locale default.
        errors: Decoding error policy handed to ``open``. The default
            ``'replace'`` substitutes undecodable bytes instead of raising
            ``UnicodeDecodeError`` part-way through the corpus.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(file_path, 'r', encoding=encoding, errors=errors) as f:
        return f.readlines()

# Create a dictionary mapping line IDs to their corresponding text
def create_line_dict(lines):
    """Index raw corpus lines by their ID.

    Each line is split on the `` +++$+++ `` delimiter; the first field becomes
    the key and the last field becomes the value (left untouched, so any
    trailing newline is preserved). If an ID occurs more than once, the last
    occurrence wins.
    """
    delimiter = " +++$+++ "
    split_rows = (raw.split(delimiter) for raw in lines)
    return {fields[0]: fields[-1] for fields in split_rows}

# Remove punctuation and convert text to lowercase
# Deletion table built once; the backslash is escaped explicitly because
# a bare "\," is an invalid escape sequence in a normal string literal.
_PUNCTUATION_TABLE = str.maketrans('', '', '''!()-[]{};:'"\\,<>./?@#$%^&*_~''')


def clean_text(text):
    """Strip punctuation characters from ``text`` and lowercase the rest.

    Characters are deleted outright (not replaced by spaces), so
    ``"don't"`` becomes ``"dont"`` and ``"well-known"`` becomes
    ``"wellknown"``. Whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Lowercasing is applied per character, after the punctuation is removed.
    return ''.join(map(str.lower, text.translate(_PUNCTUATION_TABLE)))

# Create question-answer pairs from conversations
def create_qa_pairs(conversations, line_dict):
    """Build (question, answer) token pairs from consecutive utterances.

    The last `` +++$+++ ``-separated field of every conversation record is
    parsed as a Python list literal of line IDs. Each adjacent pair of IDs
    yields one training example: the earlier line is the question and the
    following line is the answer.

    Args:
        conversations: Iterable of raw conversation records (strings).
        line_dict: Mapping from line ID to the raw text of that line.

    Returns:
        A list of ``[question_words, answer_words]`` pairs, where each side is
        a list of cleaned, lowercased words truncated to
        ``max_sequence_length`` entries.

    Raises:
        KeyError: If a conversation references an ID missing from ``line_dict``.
        ValueError, SyntaxError: If the ID field is not a valid literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    def tokenize(line_id):
        cleaned = clean_text(line_dict[line_id].strip())
        return cleaned.split()[:max_sequence_length]

    qa_pairs = []
    for conversation in conversations:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not conversation.strip():
            continue
        # literal_eval only accepts literals, so file contents can never
        # execute arbitrary code the way eval() would.
        ids = ast.literal_eval(conversation.split(" +++$+++ ")[-1])
        for question_id, answer_id in zip(ids, ids[1:]):
            qa_pairs.append([tokenize(question_id), tokenize(answer_id)])
    return qa_pairs

# Encode reply text to integer values
def encode_reply(words, word_map, max_length=None):
    """Encode a reply as ``<start>`` + word ids + ``<end>`` + padding.

    Args:
        words: Sequence of word strings making up the reply.
        word_map: Mapping from word to integer id. Must contain the special
            keys ``'<start>'``, ``'<end>'``, ``'<unk>'`` and ``'<pad>'``.
        max_length: Maximum number of words (special tokens excluded). When
            omitted, the module-level ``max_sequence_length`` is used.

    Returns:
        A list of exactly ``max_length + 2`` integer ids. Words missing from
        ``word_map`` are encoded as ``'<unk>'``.
    """
    # Resolve the default at call time; the previous default referred to a
    # name that is never defined, which made the `def` itself fail.
    if max_length is None:
        max_length = max_sequence_length

    # Truncate so the output length is fixed regardless of input length.
    words = list(words)[:max_length]

    encoded = [word_map['<start>']]
    encoded += [word_map.get(word, word_map['<unk>']) for word in words]
    encoded.append(word_map['<end>'])

    # Pad relative to the word count, not the already-extended list, so that
    # every reply comes out the same length (max_length + 2). Padding against
    # len(encoded) produced shorter rows for short replies than for long ones.
    padding_needed = max_length - len(words)
    encoded.extend([word_map['<pad>']] * padding_needed)
    return encoded

# Encode question text to integer values
def encode_question(words, word_map, max_length=None):
    """Encode a question as word ids padded to a fixed length.

    Args:
        words: Sequence of word strings making up the question.
        word_map: Mapping from word to integer id. Must contain the special
            keys ``'<unk>'`` and ``'<pad>'``.
        max_length: Length of the returned list. When omitted, the
            module-level ``max_sequence_length`` is used.

    Returns:
        A list of exactly ``max_length`` integer ids. Words missing from
        ``word_map`` are encoded as ``'<unk>'``; unused trailing positions
        are filled with ``'<pad>'``.
    """
    # Resolve the default at call time; the previous default referred to a
    # name that is never defined, which made the `def` itself fail.
    if max_length is None:
        max_length = max_sequence_length

    # Truncate so the output length is fixed regardless of input length.
    words = list(words)[:max_length]

    encoded = [word_map.get(word, word_map['<unk>']) for word in words]
    padding_needed = max_length - len(encoded)
    encoded.extend([word_map['<pad>']] * padding_needed)
    return encoded


#### Data Cleaning Execution

In [None]:
# Read both raw corpus files
conversations = load_corpus(movie_conversations_path)
lines = load_corpus(movie_lines_path)

# Index the lines by ID, then pair up consecutive utterances
line_dict = create_line_dict(lines)
qa_pairs = create_qa_pairs(conversations, line_dict)

# Tally how often every word occurs across all questions and answers
word_frequency = Counter()
for question_words, answer_words in qa_pairs:
    word_frequency.update(question_words)
    word_frequency.update(answer_words)

# Keep only words seen strictly more than `min_frequency` times
min_frequency = 5
vocab = [word for word in word_frequency if word_frequency[word] > min_frequency]

# Regular words get ids 1..len(vocab); the special tokens follow, and
# '<pad>' is pinned to id 0
word_map = dict(zip(vocab, range(1, len(vocab) + 1)))
for token_id, token in enumerate(('<unk>', '<start>', '<end>'), start=len(vocab) + 1):
    word_map[token] = token_id
word_map['<pad>'] = 0

# Persist the vocabulary so it can be reloaded later
with open('WORDMAP_corpus.json', 'w') as wordmap_file:
    json.dump(word_map, wordmap_file)

# Encode every question-answer pair in `qa_pairs` with the vocabulary:
# questions via `encode_question`, answers via `encode_reply`
pairs_encoded = [
    [encode_question(question_words, word_map), encode_reply(answer_words, word_map)]
    for question_words, answer_words in qa_pairs
]

# Save the encoded pairs to a JSON file for future use
with open('pairs_encoded.json', 'w') as pairs_file:
    json.dump(pairs_encoded, pairs_file)
