In [6]:
# 1) Import necessary libraries
import pandas as pd
import re

# 2) Define File Paths
# It's good practice to store file names in variables for easy modification.
file_path_text = 'NoisyText.txt'            # raw noisy text file
file_path_csv  = 'vdoLinks.csv'           # CSV containing at least: youtubeId, title
output_path    = 'Cleaned_Comments_Output.txt'

# 3) Load Data
# We load the movie links from the CSV file into a pandas DataFrame.
movie_list = pd.read_csv(file_path_csv)

# We load the raw, noisy text file. The 'latin-1' encoding helps avoid codec errors
# if the text has odd characters.
with open(file_path_text, 'r', encoding='latin-1') as f:
    raw_data = f.read()

# 4) Calculate Initial Noise Metrics
# We calculate the total number of letters in the original file.
# The regex [A-Za-z] matches all uppercase and lowercase letters.
letters_before = len(re.findall(r'[A-Za-z]', raw_data))

# 5) Define Slang and Replacement Logic
# The lab requires replacing slang words with "(**)" A dictionary.
# is a perfect way to store these word replacement pairs
slang_dict = {
    "lol":  "(**)", "omg":  "(**)", "wtf": "(**)",
    "idk": "(**)", "btw": "(**)", "lmao": "(**)"
}

#This function uses a regular expression to perform case-insensitive
#replacement of slang words. The '\b' ensures we only match whole words
# and not parts of other words (e.g., 'lol' in lollopop').
def replace_slang(text):
    for slang, replacement in slang_dict.items():
        # dynamic, safe pattern for each slang word
        text = re.sub(rf"\b{slang})\b", replacement, text, flags=re.IGNORECASE)
    return text

# 6) Core Parsing and Cleaning Function
def parse_data(data):
    # IMPORTANT: Replace this delimiter if your file uses a different one.
    sections = data.split('NewMovieDrPQRd')
    parsed = []

    # Loop through each section to extract the movie ID and comments.
    for section in sections:
        # Split each section into individual lines.
        lines = section.strip().split('\n')

        # Skip empty sections
        if not lines or len(lines[0].strip()) == 0:
            continue

        # The first line of each valid section is the YouTube ID.
        movie_id = lines[0].strip()
        comments = []

        # Iterate through the rest of the lines to extract comments.
        for line in lines[1:]:
            line = line.strip()

            # Skip lines that are either empty or a repeat of the movie ID.
            if not line or line == movie_id:
                continue

            # Skip lines that are identified as noisy text (URLs, system messages, etc.)
            if line.startswith('<') or line.startswith("'"):
                continue
            if any(skip in line.lower() for skip in [
                'http error', 'charmap', 'pt1m', '128238',
                'video has disabled comments'
            ]):
                continue

            # Apply slang replacement to each valid comment line.
            line = replace_slang(line)

            # Keep the cleaned line
            comments.append(line)

        # If no valid comments were found for a movie, add a specific message.
        if not comments:
            comments = ['No comments were found']

        # Create a dictionary for each movie and add it to our parsed list.
        parsed.append({'youtubeId': movie_id, 'comments': comments})

    # Convert list of dicts into a DataFrame for easy merging and outputting.
    return pd.DataFrame(parsed)

# 7) Execute the Cleaning
parsed_df = parse_data(raw_data)

# 8) Merge the cleaned comments DataFrame with our movie list DataFrame
#    to get the movie titles for each YouTube ID.
#    Ensure movie_list has the columns: 'youtubeId' and 'title'
merged_df = pd.merge(parsed_df, movie_list, on='youtubeId', how='left')

# 9) Write the Output
# We write the cleaned output, formatted as requested.
with open(output_path, 'w', encoding='utf-8') as f:
    for _, row in merged_df.iterrows():
        # Movie title (falls back to the ID if title is missing)
        #movie_title = row['title'] if ('title' in row and pd.notna(row['title'])) else row['youtubeId']
        f.write(f"Movie Name: {row['title']}\n")
        if row['Comments'] == ['No comments were found']:
            f.write("No comments were found\n")
        else:
            f.write("The comments are:\n")
            for comment in row['comments']:
                f.write(comment + '\n')
            f.write('\n')  # blank line between movie sections

# 10) Calculate and Display Cleaning Efficiency
# After writing the cleaned text, we read it back to count the letters.
with open(output_path, 'r', encoding='utf-8') as f:
    cleaned_text = f.read()

# Count letters in the cleaned text.
letters_after = len(re.findall(r'[A-Za-z]', cleaned_text))

# We calculate the ratio of letters after cleaning to letters before cleaning.
# A ratio less than one indicates that noisy text was successfully removed.
ratio = (letters_after / letters_before)
print('Letters Before:', letters_before)
print('Letters After :', letters_after)
print('Cleaning Ratio:', round(ratio, 3))

error: unbalanced parenthesis at position 5