# Script to compare files inside folders from the Wayback Machine

This code compares each "type" of original document (files starting with capture_, external_links_, and others) downloaded from the Wayback Machine to look for duplicates, similarities, and differences.
When it finds duplicates, it keeps only one document. When documents are not duplicates, it keeps all of them and shows where the differences lie.

## Workflow
1. Group files by type (capture_, external_links_, etc.) in each folder.
2. Read each file's content into memory.
3. Compute pairwise similarities within each group.
    - Initial scan: identify exact duplicates (hashing).
    - Second pass: run near-duplicate comparisons (SequenceMatcher) to identify cases not found by hashing.
4. Review differences for near-duplicates to decide relevance.
5. Keep only one representative file for sets of exact (or near-exact) duplicates.

In [None]:
pip! install langdetect

In [1]:
import os
import re
import hashlib
import shutil
import numpy as np
import pandas as pd
from collections import defaultdict, OrderedDict
from bs4 import BeautifulSoup
from difflib import SequenceMatcher


### 1. Organize File Paths
- Use os.walk() to iterate over the folder structure.
- Collect the paths of files grouped by their prefixes:
    `capture_*.html`
    `external_links_*.txt`, etc
- Ignore files prefixed with `diff_`.

In [2]:
# Folder holding the raw downloaded archives (one sub-folder per capture date, e.g. "20110127").
orig_data_path = 'privacy-surveillance-tech/analysis/data/rawData/community_standards_archives'
# Folder that receives everything this notebook derives (CSV logs, de-duplicated HTML, cleaned TXT).
# Only ever combined with os.path.join in this notebook, so the trailing slash is harmless.
derived_data_path = 'privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/'

In [None]:
# Uncomment this code (remove the enclosing triple quotes) to build folder_groups, which the duplicate-detection cells below require.
"""
'''
Create the folder_groups dictionary where each key (folder) maps to a dict with two lists.
'''
folder_groups = {}  # Dictionary to hold folder names and their respective capture and external_links files
other_filenames = []  # Collect any files that are NOT diff_, capture_, or external_links_

# Walk through the directory tree
for root, dirs, files in os.walk(orig_data_path):
    # Get the current folder name from the root (e.g., "20110127")
    folder_name = os.path.basename(root)
    
    # Check if folder name starts with '2'
    if folder_name.startswith('2'):
        # Extract just the first 4 characters if needed
        current_folder = folder_name[:4]
    else:
        current_folder = folder_name

    # If there are files in the current folder, initialize an entry in folder_groups
    if files and current_folder not in folder_groups:
        folder_groups[current_folder] = {"capture": [], "external_links": []}
    
    # Process each file in the current folder
    for filename in files:
        full_path = os.path.join(root, filename)  # Define full_path to the file
        
        # Add to the capture list if it starts with capture_
        if filename.startswith('capture_'):
            # Store the full path (NOT just the filename)
            folder_groups[current_folder]["capture"].append(full_path)
        # Add to the external_links list if it starts with 'external_links_'
        elif filename.startswith('external_links_'):
            folder_groups[current_folder]["external_links"].append(full_path)
        # Otherwise, if it's not diff_, capture_, or external_links_, add to other_filenames
        elif not (filename.startswith('diff_') or 
                  filename.startswith('capture_') or 
                  filename.startswith('external_links_')):
            other_filenames.append(full_path)

folder_groups = OrderedDict(sorted(folder_groups.items(), key=lambda item: item[0])) # Sort the folder_groups by folder names

# Sort the file lists inside the "capture" and "external_links" folders
for folder, data in folder_groups.items():
    data["capture"].sort()         # This sorts the list in ascending order (by file name)
    data["external_links"].sort()  # Similarly, sorts the external links list

"""

In [4]:
'''
# Debug printing the groups and other filenames to verify the structure
print("Folder Groups:")
for folder in folder_groups.keys():
#for folder in sorted(folder_groups.keys()):
    print(f"{folder}:")
    print("  capture:")
    for f in folder_groups[folder]["capture"]:
        print(f"    {f}")
    print("  external_links:")
    for f in folder_groups[folder]["external_links"]:
        print(f"    {f}")
'''
# print(folder_groups)

'\n# Debug printing the groups and other filenames to verify the structure\nprint("Folder Groups:")\nfor folder in folder_groups.keys():\n#for folder in sorted(folder_groups.keys()):\n    print(f"{folder}:")\n    print("  capture:")\n    for f in folder_groups[folder]["capture"]:\n        print(f"    {f}")\n    print("  external_links:")\n    for f in folder_groups[folder]["external_links"]:\n        print(f"    {f}")\n'

### 2. Read and Preprocess Files
- Read each file's content.
    - For HTML files (`capture_`):
        - Parse with BeautifulSoup to remove irrelevant HTML elements like IDs, scripts, and other dynamic content.
    - For TXT files (`external_links_`):
        - Directly read the text and standardize it (e.g., strip whitespace).

In [3]:
# Read & Preprocess HTML files
def clean_html(html_content):
    """
    Return the text content of an HTML document.

    The markup is parsed with BeautifulSoup's 'html.parser'; the extracted
    text fragments are stripped of surrounding whitespace and joined with
    newlines.

    :param html_content: The HTML source as a string.
    :return: The extracted text.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    return parsed.get_text(separator='\n', strip=True)
        

def read_and_clean_html(path):
    """
    Read an HTML file from disk and return its text content via clean_html().

    The file is decoded as UTF-8 first. If that fails, it is re-read as
    latin-1, the same fallback read_txt() uses, so that a single capture in a
    legacy encoding does not abort a whole de-duplication pass.

    :param path: Path of the HTML file to read.
    :return: The cleaned text produced by clean_html().
    """
    try:
        with open(path, 'r', encoding='utf-8') as html_file:
            html = html_file.read()
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this second read cannot
        # raise a decode error (non-Latin text may come out garbled, though).
        with open(path, 'r', encoding='latin-1') as html_file:
            html = html_file.read()
    return clean_html(html)

In [4]:
# Read & Preprocess TXT files:
def read_txt(path):
    """
    Return the contents of a text file with leading/trailing whitespace removed.

    The file is decoded as UTF-8; when that raises UnicodeDecodeError it is
    read again as latin-1.

    :param path: Path of the text file to read.
    :return: The stripped file contents.
    """
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            raw = handle.read()
    except UnicodeDecodeError:
        # Alternate encoding for files that are not valid UTF-8
        with open(path, 'r', encoding='latin-1') as handle:
            raw = handle.read()
    return raw.strip()


### 3. Calculate Similarity
- Exact duplicate identification: hashing (hashlib) 
- Similarity metric: difflib.SequenceMatcher (fast, easy, built-in).


***Similarity Threshold***

- Exact duplicates: Use hashing (100% identical).
- Near-duplicates (minor HTML changes): 
    - Above 0.97 similarity --> Trivial HTML differences
    - Below 0.97 --> Potentially substantial differences. Check manually.


[Note to self: Consider using other libraries like textdistance for richer comparisons]

In [7]:
# Find exact duplicates with `hashing`
def file_hash(content):
    """
    Return the SHA-256 hex digest of a string.

    :param content: Text to hash; it is encoded as UTF-8 before hashing.
    :return: The digest as a hexadecimal string.
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode('utf-8'))
    return hasher.hexdigest()

In [8]:
# Find near-duplicates
def similarity(a, b, autojunk=True):
    """
    Return the difflib similarity ratio of two sequences, in the range [0, 1].

    :param a: First sequence (here: the cleaned text of a file).
    :param b: Second sequence.
    :param autojunk: Passed through to SequenceMatcher. With the default
        (True), difflib's automatic junk heuristic applies when the second
        sequence has 200 or more items: items that make up more than 1% of it
        are ignored when looking for matches. For long texts compared
        character by character this can lower the ratio considerably; pass
        False to compare every character.
    :return: 1.0 for identical sequences, 0.0 when nothing matches.
    """
    return SequenceMatcher(None, a, b, autojunk=autojunk).ratio()

### 4. Identify and Filter Duplicate Files

1.  a. Use hashes for exact duplicates (if identical, hashes match exactly).

    b. Use the 0.97 similarity threshold for near-duplicates.

2. Keep a log of near-duplicate pairs of files and unique files.

3. Create and save a dataframe of near_duplicate pairs and their similarity scores for record keeping

4. Create and save a dataframe of unique file pairs and their similarity scores for further inspection

In [9]:
seen_hashes = {}    # content hash -> path of the first capture seen with that hash
non_dup_files = []  # captures whose cleaned text was not seen before, in visit order

# Visit every folder in folder_groups and hash the cleaned text of each capture
for folder, file_dict in folder_groups.items():
    for file_path in file_dict["capture"]:
        digest = file_hash(read_and_clean_html(file_path))

        if digest in seen_hashes:
            # Same cleaned text as an earlier capture: report it and move on
            print(f"Duplicate found: {file_path}, original: {seen_hashes[digest]}")
            continue

        seen_hashes[digest] = file_path
        non_dup_files.append(file_path)

print(f"\nTotal unique capture files: {len(non_dup_files)}")


Duplicate found: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224048.html, original: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html
Duplicate found: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110128/capture_20110128_000817.html, original: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html
Duplicate found: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110209/capture_20110209_000415.html, original: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html
Duplicate found: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110209/capture_20110209_000418.html, original: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/201101

In [47]:
unique_files = []          # representatives kept after near-duplicate filtering
duplicate_files = set()    # files already matched to an earlier representative
near_duplicate_pairs = []  # (kept file, dropped file, similarity ratio)

# NOTE(review): the notebook text describes a 0.97 cut-off but this cell uses
# 0.95 -- confirm which one is intended.
threshold = 0.95

# Parse every candidate once up front; the pairwise loop below would otherwise
# re-read and re-parse the same files for each comparison.
cleaned_text = {path: read_and_clean_html(path) for path in non_dup_files}

for i, file_a in enumerate(non_dup_files):
    if file_a in duplicate_files:
        continue  # Already matched to an earlier file

    text_a = cleaned_text[file_a]

    # Only compare against files that come later in the list
    for file_b in non_dup_files[i + 1:]:
        if file_b in duplicate_files:
            continue

        sim = similarity(text_a, cleaned_text[file_b])
        if sim >= threshold:
            # near-duplicate: keep file_a as the representative, drop file_b
            near_duplicate_pairs.append((file_a, file_b, sim))
            duplicate_files.add(file_b)
            print(f"Near-duplicate found: {file_a} and {file_b} sim={sim:.2f}")
        # Pairs below the threshold are left alone

    # file_a is kept unless it was flagged as a near-duplicate
    if file_a not in duplicate_files:
        unique_files.append(file_a)

print("Done. Unique:", len(unique_files), " Duplicates:", len(duplicate_files))


Near-duplicate found: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html and privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110604/capture_20110604_055251.html sim=1.00
Near-duplicate found: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html and privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20120103/capture_20120103_044502.html sim=1.00
Near-duplicate found: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html and privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20120107/capture_20120107_081328.html sim=1.00
Near-duplicate found: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20120307/capture_20120307_061623.html and privacy-surveillance-tech/analysis/data/rawData/community_standa

In [None]:
# Create a DataFrame to store the near-duplicate pairs and similarity scores

# Keep only the file names (not the full paths) so the table stays readable
near_duplicate_pairs_clean = [
    (os.path.basename(file_a), os.path.basename(file_b), sim)
    for file_a, file_b, sim in near_duplicate_pairs
]
near_duplicate_df = pd.DataFrame(
    near_duplicate_pairs_clean,
    columns=['File A', 'File B', 'Similarity'],
)

# Save near-duplicate pairs df to CSV
'''csv_path = os.path.join(derived_data_path, "near_duplicate_pairs.csv")
near_duplicate_df.to_csv(csv_path, index=False)'''

In [None]:
similaritycheck_ls = []  # (first unique file, other unique file, similarity ratio)

# Nothing to compare when no unique files were found
if unique_files:
    # Take the first file in unique_files as the reference
    file_a = unique_files[0]
    # Parse the reference once instead of once per comparison
    text_a = read_and_clean_html(file_a)

    # Compare file_a with all other files in unique_files
    for file_b in unique_files[1:]:
        sim = similarity(text_a, read_and_clean_html(file_b))
        similaritycheck_ls.append((file_a, file_b, sim))
        print(f"Similarity check: {file_a} and {file_b} sim={sim:.2f}")

Similarity check: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html and privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20120307/capture_20120307_061623.html sim=0.08
Similarity check: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html and privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20120830/capture_20120830_210605.html sim=0.20
Similarity check: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html and privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20121120/capture_20121120_162304.html sim=0.19
Similarity check: privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/20110127/capture_20110127_224041.html and privacy-surveillance-tech/analysis/data/rawData/community_standards_archives/201

In [None]:
# Create and save a dataframe of unique file pairs and their similarity scores for further inspection

# Reduce each path to its file name before tabulating
simcheck_clean = [
    (os.path.basename(file_a), os.path.basename(file_b), sim)
    for file_a, file_b, sim in similaritycheck_ls
]

simcheck_df = pd.DataFrame(
    simcheck_clean,
    columns=['File A', 'File B', 'Similarity'],
)
simcheck_df

# Save similarity check df to CSV. DE-COMMENT THE FOLLOWING LINES TO SAVE
'''
csv_path2 = os.path.join(derived_data_path, "similaritycheck_uniquefiles.csv")
simcheck_df.to_csv(csv_path2, index=False)
'''

In [58]:
# Create the final output path
OUTPUT_FOLDER = os.path.join(derived_data_path, "UNIQUE_HTML_FILES")

# Ensure the output directory exists. exist_ok=True makes this a no-op when
# the folder is already there and avoids the gap between checking and creating.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

def save_unique_files(unique_files, output_folder=None):
    """
    Copy each file in unique_files into an output folder, keeping its file name.

    A file that cannot be copied is reported with a "Failed to save" message
    and skipped, so one bad path does not stop the remaining copies.

    :param unique_files: A list of file paths to be saved.
    :param output_folder: Destination directory. Defaults to the module-level
        OUTPUT_FOLDER, looked up when the function is called. The directory
        is created if it does not exist yet.
    :raises OSError: If the destination directory cannot be created.
    """
    if output_folder is None:
        output_folder = OUTPUT_FOLDER

    # Do not rely on another cell having created the folder beforehand
    os.makedirs(output_folder, exist_ok=True)

    for file_path in unique_files:
        try:
            # Extract the original file name from the path
            file_name = os.path.basename(file_path)

            # Create a new file path in the output folder
            destination_path = os.path.join(output_folder, file_name)

            # Copy the file to the new location
            shutil.copy2(file_path, destination_path)

            print(f"Saved: {file_name} -> {destination_path}")

        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"Failed to save {file_path}: {e}")

In [None]:
# Save the unique files. DE-COMMENT the following line to run the function and save.
'''save_unique_files(unique_files)'''

Saved: capture_20110127_224041.html -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_HTML_FILES/capture_20110127_224041.html
Saved: capture_20120307_061623.html -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_HTML_FILES/capture_20120307_061623.html
Saved: capture_20120830_210605.html -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_HTML_FILES/capture_20120830_210605.html
Saved: capture_20121120_162304.html -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_HTML_FILES/capture_20121120_162304.html
Saved: capture_20121215_025058.html -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_HTML_FILES/capture_20121215_025058.html
Saved: capture_20130825_113648.html -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_HTML_FILES/capture_20130825_113648.html
Saved: capture_20131120_0750

### 5. Convert HTML files into TXT files

In [5]:
def clean_text(text):
    """
    Strip page chrome from the text extracted from a capture.

    Steps, in order:
      1. Drop everything before the first "Facebook Community Standards";
         text without that marker is left untouched by this step.
      2. Remove login/sign-up phrases (case-insensitive), then strip leading
         and trailing whitespace.
      3. Collapse runs of two or more spaces into one space.
      4. Collapse runs of three or more newlines into a single newline
         (a run of exactly two newlines is left as is).

    :param text: Text extracted from an HTML capture.
    :return: The cleaned text.
    """
    # A plain substring search replaces the former '.*?(marker.*)' regex: the
    # result is the same, but it stays linear-time when the marker is absent
    # (the regex retried the lazy scan from every position in that case).
    marker = 'Facebook Community Standards'
    start = text.find(marker)
    body = text[start:] if start != -1 else text

    # Remove unwanted header/login text
    login_noise = r'Email\s+Password|Keep me logged in|Forgot your password\?|Sign Up'
    body = re.sub(login_noise, '', body, flags=re.IGNORECASE).strip()

    # Replace multiple spaces with a single space
    body = re.sub(r' {2,}', ' ', body)

    # Replace runs of three or more newlines with a single newline
    return re.sub(r'\n{3,}', '\n', body)


In [None]:
# UNCOMMENT THE FOLLOWING CODE (remove the enclosing triple quotes) TO RUN THE FINAL CLEANING AND SAVE AS .txt
"""
SOURCE_FOLDER = 'privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_HTML_FILES'
OUTPUT_FOLDER = os.path.join(derived_data_path, "UNIQUE_FILES_TXT")

# Ensure the output directory exists
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)
    
try:
    # Get the list of files in the SOURCE_FOLDER
    files = sorted(os.listdir(SOURCE_FOLDER))
    
    if len(files) == 0:
        print("No files found in the SOURCE_FOLDER.")
  
    for file in files:  
        print(f"Processing file: {file}")
        
        file_path = os.path.join(SOURCE_FOLDER, file)

        # Only process .html files
        if not file.endswith('.html'):
            print(f"Skipping non-HTML file: {file}")
            continue

        try:
            # 1. Extract text from HTML
            html_cleaned = read_and_clean_html(file_path)
            
            # 2. Further clean that text with clean_text()
            fully_cleaned = clean_text(html_cleaned)
            
            # 3. Convert extension to .txt
            file_name = os.path.basename(file_path).replace('.html', '.txt')
            destination_path = os.path.join(OUTPUT_FOLDER, file_name)
            
            # 4. Save the final cleaned content
            with open(destination_path, 'w', encoding='utf-8') as output_file: 
                output_file.write(fully_cleaned)
            
            print(f"Saved: {file_name} -> {destination_path}")
        
        except Exception as e:
            print(f"Failed to save {file_path}: {e}")

        # Debug: print preview
        #print(fully_cleaned[:500]) # Show the first 500 characters of the cleaned file

except Exception as e:
    print(f"An error occurred: {e}")

"""

Processing file: capture_20110127_224041.html
Saved: capture_20110127_224041.txt -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_FILES_TXT/capture_20110127_224041.txt
Processing file: capture_20120307_061623.html
Saved: capture_20120307_061623.txt -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_FILES_TXT/capture_20120307_061623.txt
Processing file: capture_20120830_210605.html
Saved: capture_20120830_210605.txt -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_FILES_TXT/capture_20120830_210605.txt
Processing file: capture_20121120_162304.html
Saved: capture_20121120_162304.txt -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_FILES_TXT/capture_20121120_162304.txt
Processing file: capture_20121215_025058.html
Saved: capture_20121215_025058.txt -> privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_FILES_TXT/cap

In [None]:
#DEBUGGING CELL
"""
SOURCE_FOLDER = 'privacy-surveillance-tech/analysis/data/derivedData/meta_comm_stndrds_sites/UNIQUE_FILES'
file = "capture_20110209_000418.html"

    
try:
    # Get the list of files in the SOURCE_FOLDER
    files = sorted(os.listdir(SOURCE_FOLDER))
          
    file_path = os.path.join(SOURCE_FOLDER, file)


    try:
        with open(file_path, 'r', encoding='utf-8') as html_file:
            html_content = html_file.read()
            soup = BeautifulSoup(html_content, 'html.parser')
            #print(soup.prettify())
            clean_content = soup.get_text(separator='\n', strip=True)
            #print(clean_content)
            #return clean_content
    
    except Exception as e:
        print(f"Error: {e}")

        # Print cleaned content (debugging)
        #print(content[:500])  # Show the first 500 characters of the cleaned file

except Exception as e:
    print(f"An error occurred: {e}")


"""

In [None]:
# Print the href of every <a> tag in `soup` (None is printed for tags without
# an href). `soup` must already exist from an earlier cell.
for anchor in soup.find_all('a'):
    href = anchor.get('href')
    print(href)

def get_links(timestamp, progress_data):
    """
    Crawl a single capture (and sub-links within the same domain/path).

    Saves the capture's HTML under OUTPUT_DIR/<date>/, writes a diff against
    the previously recorded capture when that file exists and compare_texts()
    reports differences, records progress, writes the external links to a
    text file, and follows each internal link with crawl_capture_link().

    :param timestamp: Capture timestamp string; the first 8 characters are
        used as the date part and the rest as the time part.
    :param progress_data: Dict holding crawl progress; "last_timestamp" is
        read and then overwritten with `timestamp`.

    NOTE(review): `html` and `visited` are used below but are neither
    parameters nor assigned in this function -- presumably module-level
    globals in the script this was taken from; confirm before running.
    OUTPUT_DIR, compare_texts, save_progress and extract_internal_links are
    likewise expected to be defined elsewhere.
    """

    date_str = timestamp[:8]
    time_str = timestamp[8:]
    out_dir = os.path.join(OUTPUT_DIR, date_str)
    os.makedirs(out_dir, exist_ok=True)

    # Save the capture itself
    filename = f"capture_{date_str}_{time_str}.html"
    file_path = os.path.join(out_dir, filename)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html)

    # Diff against the previously recorded capture, if its file is on disk
    last_timestamp = progress_data.get("last_timestamp")
    if last_timestamp and last_timestamp != timestamp:
        prev_date_str = last_timestamp[:8]
        prev_time_str = last_timestamp[8:]
        prev_file_path = os.path.join(OUTPUT_DIR, prev_date_str, f"capture_{prev_date_str}_{prev_time_str}.html")
        if os.path.exists(prev_file_path):
            with open(prev_file_path, "r", encoding="utf-8") as f:
                old_html = f.read()
            diff_result = compare_texts(old_html, html)
            if diff_result:
                diff_file = os.path.join(out_dir, f"diff_{date_str}_{time_str}_vs_{last_timestamp}.txt")
                with open(diff_file, "w", encoding="utf-8") as df:
                    df.write("\n".join(diff_result))

    progress_data["last_timestamp"] = timestamp
    save_progress(progress_data)

    soup = BeautifulSoup(html, "html.parser")
    internal_links = extract_internal_links(soup, timestamp)

    # Every link whose href does not mention the community-standards page
    # counts as external
    external_links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "facebook.com/communitystandards" not in href:
            external_links.append((a.text.strip(), href))

    ext_links_file = os.path.join(out_dir, f"external_links_{date_str}_{time_str}.txt")
    with open(ext_links_file, "w", encoding="utf-8") as ef:
        for text_val, link_val in external_links:
            ef.write(f"{text_val} -> {link_val}\n")

    for link in internal_links:
        if link not in visited:
            crawl_capture_link(link, progress_data, visited)

def crawl_capture_link(wayback_url, progress_data, visited):
    """
    Variation of crawl_capture that accepts a fully formed Wayback URL.
    Extract the <timestamp> from the URL to maintain consistent naming.

    Fetches the URL after a random delay, saves the HTML and the list of
    external links under OUTPUT_DIR/<date>/, records progress, and recurses
    into the internal links that have not been visited yet.

    :param wayback_url: URL containing "/web/<timestamp>..."; URLs without a
        "/web/" segment are skipped.
    :param progress_data: Dict holding crawl progress; "last_timestamp" is
        overwritten with the timestamp taken from the URL.
    :param visited: Set of URLs already handled; updated in place.

    NOTE(review): random, time, requests, MIN_DELAY, MAX_DELAY, OUTPUT_DIR,
    save_progress and extract_internal_links are expected to be provided
    elsewhere -- confirm they are available before running.
    """
    if wayback_url in visited:
        return
    visited.add(wayback_url)

    parts = wayback_url.split("/web/")
    if len(parts) < 2:
        return
    after_web = parts[1]
    ts_part = after_web.split("/")[0]  # e.g. "20200202020202id_"
    raw_ts = ts_part[:14]

    # Random pause before each request
    delay = random.uniform(MIN_DELAY, MAX_DELAY)
    time.sleep(delay)

    # One retry after a read timeout. Previously the "Retrying" message was
    # printed but the function returned without trying again, and because the
    # URL is already in `visited` it was never fetched at all.
    for attempt in (1, 2):
        try:
            resp = requests.get(wayback_url, timeout=30)
            resp.raise_for_status()
            break
        except requests.exceptions.ReadTimeout:
            if attempt == 2:
                print(f"Timeout while fetching {wayback_url}. Giving up.")
                return
            print(f"Timeout while fetching {wayback_url}. Retrying in 5 seconds...")
            time.sleep(5)
        except Exception as e:
            print(f"Error fetching {wayback_url}: {e}")
            return

    html = resp.text

    date_str = raw_ts[:8]
    time_str = raw_ts[8:]
    out_dir = os.path.join(OUTPUT_DIR, date_str)
    os.makedirs(out_dir, exist_ok=True)

    filename = f"capture_{date_str}_{time_str}.html"
    file_path = os.path.join(out_dir, filename)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html)

    progress_data["last_timestamp"] = raw_ts
    save_progress(progress_data)

    soup = BeautifulSoup(html, "html.parser")
    # NOTE (original author): apparently the extract_internal_links()
    # function doesn't work.
    internal_links = extract_internal_links(soup, raw_ts)

    # Every link whose href does not mention the community-standards page
    # counts as external
    external_links = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "facebook.com/communitystandards" not in href:
            external_links.append((a.text.strip(), href))

    ext_links_file = os.path.join(out_dir, f"external_links_{date_str}_{time_str}.txt")
    with open(ext_links_file, "w", encoding="utf-8") as ef:
        for text_val, link_val in external_links:
            ef.write(f"{text_val} -> {link_val}\n")

    for link in internal_links:
        if link not in visited:
            crawl_capture_link(link, progress_data, visited)