In [1]:

import os
import re
import pandas as pd  # Optional (for CSV), safe to keep

# ---------- Input / output locations ----------
file_path_text = 'NoisyText.txt'        # raw noisy text to clean; must match the on-disk name
file_path_csv  = 'vdolinks.csv'         # optional companion CSV, only read if it exists
output_path    = 'Cleaned_Comments_Output.txt'

# ---------- Fail fast when the input file is missing ----------
input_is_present = os.path.exists(file_path_text)
if not input_is_present:
    raise FileNotFoundError(f"Could not find {file_path_text}. "
                            "Make sure the name matches exactly (case-sensitive).")

# ---------- Read the whole file as UTF-8 ----------
# errors='ignore' drops bytes that cannot be decoded instead of raising,
# so stray emoji / mis-encoded characters never abort the run.
with open(file_path_text, 'r', encoding='utf-8', errors='ignore') as source_file:
    raw = source_file.read()

# ---------- Noise signatures ----------
# One regex per kind of non-comment line. Every pattern is anchored at the
# start of the line; the list order is the order of the alternation below.
noise_patterns = [
    # -- scraper / API artefacts --
    r'^NewMovieDrPQRd\s*$',                         # section marker
    # NOTE(review): combined with IGNORECASE this also matches any single
    # 6-12 character token (e.g. a one-word comment) -- confirm that is intended.
    r'^[A-Za-z0-9_-]{6,12}\s*$',                    # bare YouTube-style IDs such as K26_sDKnvMU
    r'^<HttpError.*$',                              # API error dumps
    r"^'charmap' codec.*$",                         # encoding error messages captured as text
    r"^'likeCount'\s*$",                            # lone 'likeCount' literal
    # -- video statistics --
    r'^(sd|hd)\s*$',                                # definition tokens
    r'^PT\d+M\d+S\s*$',                             # ISO 8601 durations, e.g. PT1M52S
    r'^\d{1,9}( \d{1,9}){0,4}\s*$',                 # one to five space-separated counters
    # -- description boilerplate --
    r'^Trailer.*$',
    r'^Director:.*$',
    r'^Cast:.*$',
    r'^Plot:.*$',
    r'^Rating:.*$',
    r'^Studio:.*$',
    r'^Official Content From.*$',
    r'^Subscribe.*$',
    r'^Follow .*$',                                 # social call-to-action
    r'^Like us on .*$',                             # social call-to-action
    r'^Visit .*$',                                  # promo / link teaser
    # -- links and legal --
    r'^http[s]?://\S+\s*$',                         # line that is only a URL
    r'^/watch\?v=\S+\s*$',                          # line that is only a /watch?v= fragment
    r'^\(c\) \d{4} .*$',                            # copyright notice
    # -- blanks --
    r'^\s*$',                                       # empty or whitespace-only line
]

# Wrap each pattern in a non-capturing group and OR them into a single regex,
# so one match() call tests a line against every signature.
combined_pattern = '|'.join('(?:' + pattern + ')' for pattern in noise_patterns)
noise_re = re.compile(combined_pattern, flags=re.MULTILINE | re.IGNORECASE)

# ---------- Filter the raw text one line at a time ----------
# Each line is stripped, then rejected if any of the checks below fires.
# Whatever survives is treated as a genuine comment.
clean_lines = []
for raw_line in raw.splitlines():
    candidate = raw_line.strip()

    is_noise = (
        # any of the known noise signatures
        noise_re.match(candidate)
        # a recurring description line seen in the input
        or candidate.startswith("A preview for this 90s disney movie")
        # a bare timestamp such as "0:54" or "1:43"
        or re.fullmatch(r'\d{1,2}:\d{2}', candidate)
        # nothing but punctuation / symbols / underscores
        or re.fullmatch(r'[\W_]{1,}$', candidate)
        # a leftover 'sd' / 'hd' token
        or candidate.lower() in {'sd', 'hd'}
    )

    if not is_noise and candidate:
        clean_lines.append(candidate)

# ---------- Collapse runs of identical neighbouring lines ----------
# Only adjacent repeats are dropped; a line may still reappear later on.
deduped = []
for entry in clean_lines:
    if not deduped or deduped[-1] != entry:
        deduped.append(entry)

import re

# Terms to mask, mapped to the text that replaces them.
slang_dict = {
    "N-words": "(**)",
    "dam-words": "(**)",
    "f-words": "(**)",
}


def replace_slang(text):
    """Return *text* with every term from ``slang_dict`` masked.

    Matching is case-insensitive and bounded by ``\\b`` on both sides, so a
    term embedded inside a longer word is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with each matched term replaced by its mapped value.
    """
    for slang, replacement in slang_dict.items():
        # Escape the term so characters such as '-' or '.' are matched
        # literally instead of being read as regex syntax.
        pattern = rf"\b{re.escape(slang)}\b"
        # A callable replacement inserts the mask verbatim; a plain string
        # would be scanned for backslash escapes / group references.
        text = re.sub(pattern, lambda _match: replacement, text,
                      flags=re.IGNORECASE)
    return text

def parse_data(data):
    """Split raw text into movie sections and extract the ID and comments.

    The text is cut at every ``NewMovieDrPQRd`` marker. Sections that are
    empty after stripping are skipped. In each remaining section the first
    ``MovieID : <digits>`` occurrence supplies the ID, and everything after
    the first ``Comments :`` (across newlines) supplies the comments.

    NOTE(review): the original body was truncated before any result was
    stored or returned; the record shape below is an assumption -- confirm
    against the intended caller.

    Args:
        data: The raw text containing the section markers.

    Returns:
        A list with one dict per non-empty section, each holding
        ``"movie_id"`` (digit string, or None when absent) and
        ``"comments"`` (stripped string, or None when absent).
    """
    parsed = []

    # Split the raw text into movie sections
    for section in data.split("NewMovieDrPQRd"):
        section = section.strip()
        if not section:
            continue

        movie_id_match = re.search(r"MovieID\s*:\s*(\d+)", section)
        movie_id = movie_id_match.group(1) if movie_id_match else None

        # DOTALL lets '.' cross newlines so multi-line comments are captured.
        comments_match = re.search(r"Comments\s*:(.*)", section, flags=re.DOTALL)
        comments = comments_match.group(1).strip() if comments_match else None

        parsed.append({"movie_id": movie_id, "comments": comments})

    return parsed
    
# One kept line per output line, with no leading or trailing whitespace overall
clean_text = '\n'.join(deduped)
clean_text = clean_text.strip()

# ---------- Optional companion CSV ----------
# Loaded only when the file is present. Nothing below depends on it yet;
# the column names are printed so the contents are easy to inspect.
csv_is_present = os.path.exists(file_path_csv)
if csv_is_present:
    try:
        links_df = pd.read_csv(file_path_csv)
        print("Loaded vdolinks.csv with columns:", list(links_df.columns))
    except Exception as read_error:
        # Best effort: a broken CSV is reported but must not stop the cleaning run.
        print("Could not read vdolinks.csv:", read_error)

# ---------- Write the cleaned text ----------
# errors='ignore' drops any character that cannot be encoded rather than raising.
with open(output_path, 'w', encoding='utf-8', errors='ignore') as out_file:
    out_file.write(clean_text)

# Summary: how many lines were kept and where they were written.
print(f"Done. Wrote {len(deduped):,} cleaned lines to -> {output_path}")


Loaded vdolinks.csv with columns: ['youtubeId', 'movieId', 'title']
Done. Wrote 32,708 cleaned lines to -> Cleaned_Comments_Output.txt
