In [11]:
import csv

def compare_csv_words(file1, file2, common_out="common.csv", unique1_out="unique_file1.csv", unique2_out="unique_file2.csv",
                      file1_col=1, file2_col=0, encoding="latin-1"):
    """
    Compare the words found in one column of two CSV files, save the results
    into three new CSVs, and return statistics.

    Each input file is indexed by the whitespace-stripped text of its word
    column. Rows too short to contain that column, and rows whose word cell
    is blank, are ignored. When a word occurs on several rows, the last row
    is the one kept. Matching is exact (case-sensitive).

    In the output files every source row is written into a single cell;
    csv.writer stringifies the row list, so the cell holds its ``str()`` form
    (e.g. ``['2', 'banana']``).

    Args:
        file1 (str): Path to the first CSV file.
        file2 (str): Path to the second CSV file.
        common_out (str): Path to save words present in both files, with the
            matching row from each file.
        unique1_out (str): Path to save words found only in file1.
        unique2_out (str): Path to save words found only in file2.
        file1_col (int): Zero-based index of the word column in file1
            (default 1, i.e. the 2nd column).
        file2_col (int): Zero-based index of the word column in file2
            (default 0, i.e. the 1st column).
        encoding (str): Text encoding used to read both inputs and to write
            all three outputs (default "latin-1").

    Returns:
        dict: ``common_count`` / ``common_words``, ``unique_file1_count`` /
        ``unique_file1_words``, ``unique_file2_count`` / ``unique_file2_words``
        (the ``*_words`` values are sets), plus ``total_file1`` and
        ``total_file2``, the number of distinct words read from each file.

    Raises:
        ValueError: If ``file1_col`` or ``file2_col`` is negative.
        OSError: If an input cannot be opened or an output cannot be written.
    """
    words1 = _read_rows_by_word(file1, file1_col, encoding)
    words2 = _read_rows_by_word(file2, file2_col, encoding)

    set1, set2 = set(words1), set(words2)

    # --- Find common and unique words ---
    common_words = set1 & set2
    unique_file1 = set1 - set2
    unique_file2 = set2 - set1

    # --- Write the three result files (same order as before: common, file1, file2) ---
    _write_word_rows(common_out, ["Word", "File1_Row", "File2_Row"], common_words, (words1, words2), encoding)
    _write_word_rows(unique1_out, ["Word", "File1_Row"], unique_file1, (words1,), encoding)
    _write_word_rows(unique2_out, ["Word", "File2_Row"], unique_file2, (words2,), encoding)

    # --- Return stats ---
    return {
        "common_count": len(common_words),
        "common_words": common_words,
        "unique_file1_count": len(unique_file1),
        "unique_file1_words": unique_file1,
        "unique_file2_count": len(unique_file2),
        "unique_file2_words": unique_file2,
        "total_file1": len(set1),
        "total_file2": len(set2)
    }


def _read_rows_by_word(path, column, encoding):
    """Return {stripped word: full row} for the given column of a CSV file."""
    if column < 0:
        # A negative index would silently count from the end of each row and
        # would raise IndexError on empty rows, so reject it up front.
        raise ValueError("column index must be non-negative, got %r" % (column,))

    rows_by_word = {}
    with open(path, newline='', encoding=encoding) as csvfile:
        for row in csv.reader(csvfile):
            if len(row) <= column:
                continue  # empty row, or row too short to hold the word column
            word = row[column].strip()
            if not word:
                continue  # a blank cell is not a word; do not count or match it
            rows_by_word[word] = row  # duplicates: the last row seen wins
    return rows_by_word


def _write_word_rows(path, header, words, row_sources, encoding):
    """Write a header, then one line per word (sorted) with its row from each source."""
    with open(path, "w", newline='', encoding=encoding) as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        for word in sorted(words):
            # Each row list goes into ONE cell; csv.writer applies str() to it.
            writer.writerow([word] + [rows[word] for rows in row_sources])


In [10]:
# Compare the NGSL-GR ranked list (word in the 2nd column) against the combined
# NGSL list (word in the 1st column). The call also writes common.csv,
# unique_file1.csv and unique_file2.csv to the working directory.
# A forward slash is used in the path because open() accepts it on Windows as
# well as on POSIX systems, whereas a backslash only works as a separator on Windows.
result = compare_csv_words("NGSL_lists/NGSL-GR_rank.csv", "4_NGSL_lists_combined.csv")

print("Common words:", result["common_count"])
print("Unique to file1:", result["unique_file1_count"], "out of", result["total_file1"])
print("Unique to file2:", result["unique_file2_count"], "out of", result["total_file2"])

Common words: 4031
Unique to file1: 1020 out of 5051
Unique to file2: 801 out of 4832
