In [10]:
!pip install PyMuPDF

Defaulting to user installation because normal site-packages is not writeable
Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl (16.6 MB)
   ---------------------------------------- 0.0/16.6 MB ? eta -:--:--
   ---------- ----------------------------- 4.2/16.6 MB 25.2 MB/s eta 0:00:01
   ------------------------- -------------- 10.5/16.6 MB 27.3 MB/s eta 0:00:01
   -------------------------------------- - 16.0/16.6 MB 27.2 MB/s eta 0:00:01
   ---------------------------------------- 16.6/16.6 MB 24.9 MB/s eta 0:00:00
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.5




In [11]:
import fitz  # PyMuPDF
import difflib

# Function to load and extract text from a PDF
def load_pdf_text(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` result, joined with
        no separator. A document with no pages yields ``""``.
    """
    # Using the document as a context manager closes it on exit, including
    # when text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once rather than growing a string page by page.
        return "".join(page.get_text() for page in doc)

# Load both PDF files. Forward slashes are used because they resolve on
# Windows as well as on POSIX systems, whereas "data\..." only works on Windows.
text_us = load_pdf_text("data/PlayStation_Policy_US.pdf")
text_eu = load_pdf_text("data/PlayStation_Policy_EU.pdf")

# Compare the two documents line by line. splitlines() drops the line
# endings, so lineterm='' keeps the diff's header lines free of them too.
diff = difflib.unified_diff(
    text_us.splitlines(),
    text_eu.splitlines(),
    fromfile='US Policy',
    tofile='EU Policy',
    lineterm=''
)

# Print the differences (unified_diff is lazy; this loop consumes it)
for line in diff:
    print(line)

--- US Policy
+++ EU Policy
@@ -1,20 +1,21 @@
 About Us and this Policy 
+Who we are and how to get in touch 
 This Privacy Policy explains when we collect information about you, including Personal 
 Information (“PI”), what we collect, why we collect it, how we use it, who we share it with, where 
 it is processed, how we handle it and your choices and legal rights associated with this information. 
-Your use of our websites, products, services, or other online activities (“Services”) constitutes your 
-consent to these practices. 
 Scope of this Policy 
-Sony Interactive Entertainment LLC, Naughty Dog LLC, Sucker Punch Productions LLC, 
-Insomniac Games Inc., Bluepoint Games Inc., Valkyrie Entertainment LLC, Haven Interactive 
-Studios ULC, Repeat Technologies Inc., Firewalk Studios, LLC, PlayStation Publishing LLC, and 
-all Americas subsidiaries using the brand name PlayStation ("SIE", “we,” “our,” and “us”) 
-controls the information collected when you interact with PlayStation th

In [12]:
# Treat every non-blank extracted line as one "clause", trimmed of
# surrounding whitespace (splitting on '\n\n' would give paragraphs instead).
clauses_us = [line for line in map(str.strip, text_us.split('\n')) if line]
clauses_eu = [line for line in map(str.strip, text_eu.split('\n')) if line]

In [13]:
# Align the two clause lists with SequenceMatcher
matcher = difflib.SequenceMatcher(None, clauses_us, clauses_eu)

# Report every non-matching region; 'equal' opcodes are left out.
# Each opcode is (tag, i1, i2, j1, j2): clauses_us[i1:i2] vs clauses_eu[j1:j2].
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
    if tag != 'equal':
        us_side = clauses_us[i1:i2]
        eu_side = clauses_eu[j1:j2]

        print(f"\n--- Difference: {tag.upper()} ---")

        print(f"US Policy Clause(s) [{i1}:{i2}]:")
        for clause in us_side:
            print(f"  • {clause}")

        print(f"EU Policy Clause(s) [{j1}:{j2}]:")
        for clause in eu_side:
            print(f"  • {clause}")


--- Difference: INSERT ---
US Policy Clause(s) [1:1]:
EU Policy Clause(s) [1:2]:
  • Who we are and how to get in touch

--- Difference: DELETE ---
US Policy Clause(s) [4:6]:
  • Your use of our websites, products, services, or other online activities (“Services”) constitutes your
  • consent to these practices.
EU Policy Clause(s) [5:5]:

--- Difference: REPLACE ---
US Policy Clause(s) [7:16]:
  • Sony Interactive Entertainment LLC, Naughty Dog LLC, Sucker Punch Productions LLC,
  • Insomniac Games Inc., Bluepoint Games Inc., Valkyrie Entertainment LLC, Haven Interactive
  • Studios ULC, Repeat Technologies Inc., Firewalk Studios, LLC, PlayStation Publishing LLC, and
  • all Americas subsidiaries using the brand name PlayStation ("SIE", “we,” “our,” and “us”)
  • controls the information collected when you interact with PlayStation through our Services.
  • Contact Us
  • Please contact us with any privacy questions by phone at 1-800-345-7669 or online
  • at http://www.playstation.c

In [14]:
# Threshold for similarity (0.8 = 80%)
SIMILARITY_THRESHOLD = 0.8

print("\n--- SIMILAR CLAUSES (US vs EU) ---\n")

# For each US clause, scan the EU clauses in order and report the first one
# whose similarity ratio reaches the threshold.
for us_clause in clauses_us:
    for eu_clause in clauses_eu:
        pair = difflib.SequenceMatcher(None, us_clause, eu_clause)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); a pair whose bound is already below the threshold cannot
        # match, so the expensive ratio() call is skipped for it.
        if (pair.real_quick_ratio() < SIMILARITY_THRESHOLD
                or pair.quick_ratio() < SIMILARITY_THRESHOLD):
            continue
        ratio = pair.ratio()
        if ratio >= SIMILARITY_THRESHOLD:
            print(f"US: {us_clause}")
            print(f"EU: {eu_clause}")
            print(f"Similarity: {round(ratio*100, 2)}%\n")
            break  # Only show first match per US clause (optional)


--- SIMILAR CLAUSES (US vs EU) ---

US: About Us and this Policy
EU: About Us and this Policy
Similarity: 100.0%

US: This Privacy Policy explains when we collect information about you, including Personal
EU: This Privacy Policy explains when we collect information about you, including Personal
Similarity: 100.0%

US: Information (“PI”), what we collect, why we collect it, how we use it, who we share it with, where
EU: Information (“PI”), what we collect, why we collect it, how we use it, who we share it with, where
Similarity: 100.0%

US: it is processed, how we handle it and your choices and legal rights associated with this information.
EU: it is processed, how we handle it and your choices and legal rights associated with this information.
Similarity: 100.0%

US: Scope of this Policy
EU: Scope of this Policy
Similarity: 100.0%

US: controls the information collected when you interact with PlayStation through our Services.
EU: controls the information collected when you interact wi