# Important Libraries

In [56]:
import ast
import os

import pandas as pd

# **Step 1: Load and Parse movie_lines.txt**

In [57]:
# Build a lookup table from each line ID to the dialogue text of that line.
movie_lines_path = "movie_lines.txt"

# lineID -> text
id2line = {}

# errors='ignore' drops any bytes that cannot be decoded as UTF-8 instead of raising.
with open(movie_lines_path, encoding='utf-8', errors='ignore') as lines_file:
    for raw_line in lines_file:
        fields = raw_line.strip().split(" +++$+++ ")
        # Records that do not split into exactly five fields are skipped.
        if len(fields) != 5:
            continue
        # The first field is the line ID and the last field is the text.
        id2line[fields[0]] = fields[4]

# Step 2: Load and Parse movie_conversations.txt

In [58]:
# Extract conversations: each entry is the ordered list of line IDs that
# make up one conversation.
movie_conversations_path = "movie_conversations.txt"

conversations = []

# errors='ignore' drops any bytes that cannot be decoded as UTF-8 instead of raising.
with open(movie_conversations_path, encoding='utf-8', errors='ignore') as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        # Records that do not split into exactly four fields are skipped.
        if len(parts) == 4:
            line_ids_str = parts[3]
            # The last field holds the line IDs written as a list literal.
            # ast.literal_eval only parses literals, so (unlike eval) it
            # cannot execute code that happens to be embedded in the data file.
            line_ids = ast.literal_eval(line_ids_str)
            conversations.append(line_ids)

# Step 3: Reconstruct Conversations from Line IDs

In [59]:
# Step 3: Translate every conversation from line IDs into dialogue text.
# An ID that is missing from id2line becomes an empty string, so each
# reconstructed conversation keeps the same number of turns as its ID list.
reconstructed_conversations = [
    [id2line.get(turn_id, "") for turn_id in conv_ids]
    for conv_ids in conversations
]


# Step 4: View Sample Conversations

In [60]:
# Show the first three conversations, one dialogue turn per line.
for number, turns in enumerate(reconstructed_conversations[:3], start=1):
    print(f"\nConversation {number}")
    for utterance in turns:
        print(utterance)



Conversation 1
Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well, I thought we'd start with pronunciation, if that's okay with you.
Not the hacking and gagging and spitting part.  Please.
Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

Conversation 2
You're asking me out.  That's so cute. What's your name again?
Forget it.

Conversation 3
No, no, it's my fault -- we didn't have a proper introduction ---
Cameron.
The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.
Seems like she could get a date easy enough...


# **Final: Previewing the Reconstructed Conversations**

In [61]:
# Preview: print the first three reconstructed conversations.
preview = reconstructed_conversations[:3]
for position in range(len(preview)):
    print(f"\nConversation {position + 1}")
    for spoken_line in preview[position]:
        print(spoken_line)



Conversation 1
Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well, I thought we'd start with pronunciation, if that's okay with you.
Not the hacking and gagging and spitting part.  Please.
Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

Conversation 2
You're asking me out.  That's so cute. What's your name again?
Forget it.

Conversation 3
No, no, it's my fault -- we didn't have a proper introduction ---
Cameron.
The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.
Seems like she could get a date easy enough...


## **Summary**

- id2line maps line IDs to actual dialogue text.

- conversations holds the sequences of line IDs per conversation.

- reconstructed_conversations gives you the full text of the conversations.

# Setup: Parse & Reconstruct (from earlier)

In [62]:
# Parse movie_lines.txt into a lineID -> text lookup.
# Records that do not split into exactly five fields are skipped.
id2line = {}
with open("movie_lines.txt", encoding='utf-8', errors='ignore') as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 5:
            line_id, _, _, _, text = parts
            id2line[line_id] = text

# Parse movie_conversations.txt into a list of line-ID lists.
# Records that do not split into exactly four fields are skipped.
conversations = []
with open("movie_conversations.txt", encoding='utf-8', errors='ignore') as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 4:
            # ast.literal_eval only parses literals, so (unlike eval) it
            # cannot execute code that happens to be embedded in the data file.
            line_ids = ast.literal_eval(parts[3])
            conversations.append(line_ids)


# **Option A: One Row per Conversation (block of text)**

In [None]:
# Flatten every conversation into a single newline-separated string.
conversation_texts = []
for conv in conversations:
    turns = [id2line.get(line_id, "") for line_id in conv]
    conversation_texts.append("\n".join(turns))

# Two-column frame: a running integer ID alongside the joined text.
conversation_ids = list(range(len(conversation_texts)))
df_conversations = pd.DataFrame({
    "conversation_id": conversation_ids,
    "conversation_text": conversation_texts,
})

# Write the frame to disk without the pandas index column.
df_conversations.to_csv("conversations_block.csv", index=False)

# **Option B: One Row per Line (dialogue turn)**

In [63]:
# Long format: one record per dialogue turn, tagged with the conversation it
# belongs to and its position inside that conversation.
rows = [
    {
        "conversation_id": conv_id,
        "line_order": order,
        "line_text": id2line.get(line_id, ""),
    }
    for conv_id, conv in enumerate(conversations)
    for order, line_id in enumerate(conv)
]

df_lines = pd.DataFrame(rows)
df_lines.to_csv("conversations_lines.csv", index=False)


**Done!**

We now have:

- conversations_block.csv: full conversations as text blocks.

- conversations_lines.csv: every line of dialogue in its own row.

# Saving to `brian-output-repositories` Folder

In [65]:
# Self-contained pipeline: parse both corpus files, build the two DataFrames,
# and write them as CSV files inside the output folder.
import ast
import os

import pandas as pd

# Step 1: Setup — Create Output Folder
# exist_ok=True makes this cell safe to re-run when the folder already exists.
output_folder = "brian-output-repositories"
os.makedirs(output_folder, exist_ok=True)

# Step 2: Load and Parse the Dataset Files
# Parse movie_lines.txt into a lineID -> text lookup.
# Records that do not split into exactly five fields are skipped.
id2line = {}
with open("movie_lines.txt", encoding='utf-8', errors='ignore') as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 5:
            line_id, _, _, _, text = parts
            id2line[line_id] = text

# Parse movie_conversations.txt into a list of line-ID lists.
# Records that do not split into exactly four fields are skipped.
conversations = []
with open("movie_conversations.txt", encoding='utf-8', errors='ignore') as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 4:
            # ast.literal_eval only parses literals, so (unlike eval) it
            # cannot execute code that happens to be embedded in the data file.
            line_ids = ast.literal_eval(parts[3])
            conversations.append(line_ids)

# Step 3A: Create df_conversations (One row per conversation)
# Line IDs missing from id2line contribute an empty string to the joined text.
conversation_texts = [
    "\n".join(id2line.get(line_id, "") for line_id in conv)
    for conv in conversations
]

df_conversations = pd.DataFrame({
    "conversation_id": list(range(len(conversation_texts))),
    "conversation_text": conversation_texts
})

# Step 3B: Create df_lines (One row per line of dialogue)
rows = []

for conv_id, conv in enumerate(conversations):
    for order, line_id in enumerate(conv):
        line_text = id2line.get(line_id, "")
        rows.append({
            "conversation_id": conv_id,
            "line_order": order,
            "line_text": line_text
        })

df_lines = pd.DataFrame(rows)

# Step 4: Save Both Files into Folder
block_csv_path = os.path.join(output_folder, "conversations_block.csv")
df_conversations.to_csv(block_csv_path, index=False)

lines_csv_path = os.path.join(output_folder, "conversations_lines.csv")
df_lines.to_csv(lines_csv_path, index=False)


In [66]:
# List the contents of the current working directory via a shell escape.
!ls

Capstone-Project---Codebyte-
README.md
README.txt
Youtube Data scrapping.ipynb
brian-analysis-file.ipynb
brian-output-repositories
chameleons.pdf
conversations_lines.csv
data-from-youtube.csv
index.ipynb
movie_characters_metadata.txt
movie_conversations.txt
movie_lines.txt
movie_titles_metadata.txt
raw_script_urls.txt
youtube_comments.csv
