This code extracts sentence pairs that are likely to be from after 2020 from the OPUS 2024 Korean <-> English subtitles corpus.

In [None]:
import re

# Parallel-corpus inputs: the English side and the Korean side, which the
# script later reads line by line in lockstep.
en_path, ko_path = "OpenSubtitles.en-ko.en", "OpenSubtitles.en-ko.ko"

# Destinations for the filtered English / Korean lines.
out_en_path, out_ko_path = (
    "opus_data_post_2020_en.txt",
    "opus_data_post_2020_kr.txt",
)

# Define keywords and year pattern
#
# Every entry is lower-case because it is compared against a lower-cased copy
# of each English line using plain substring containment.
# NOTE(review): substring matching means short entries such as "meta", "bard",
# "llm", "nft" or "sora" also hit inside longer words (e.g. "metal",
# "bombard"); tighten to word-boundary matching if precision matters.
post2020_keywords = [
    # Pandemic
    "covid-19", "covid", "coronavirus", "pandemic", "vaccine passport", "mask mandate",
    "pcr test", "rapid antigen test", "social distancing", "quarantine center",
    "vaccination drive", "double vaccination", "omicron", "delta variant", "hybrid immunity",
    "booster shot", "long covid", "antiviral pill",
    # AI
    "chatgpt", "gpt-3", "gpt-4", "dall-e", "copilot", "openai", "stable diffusion",
    "large language model", "generative ai", "diffusion model", "llm", "midjourney",
    "bard", "sora", "mistral", "ai alignment", "anthropic", "ai safety", "ai hallucination",
    # Remote work and learning
    "remote work", "work from home", "wfh", "zoom meeting", "zoom call", "teams call",
    "google meet", "hybrid work", "digital nomad visa", "online class", "online learning",
    "remote learning", "lockdown", "virtual event",
    # Internet culture and crypto
    "tiktok trend", "tiktok challenge", "reels", "shorts", "cancel culture",
    "doomscrolling", "quiet quitting", "great resignation", "metaverse", "nft", "bored ape",
    "web3", "crypto winter", "solana", "meme stock", "gamestop", "wallstreetbets",
    # World events and economy
    "ukraine invasion", "russia ukraine war", "kyiv", "zelensky", "mariupol", "donbas",
    "belgorod", "nord stream", "covid wave", "supply chain disruption", "inflation spike",
    "silicon valley bank", "svb collapse", "evergrande crisis", "red sea crisis",
    # Tech products
    "meta", "apple vision pro", "iphone 14", "iphone 15", "galaxy s23", "pixel 7",
    "tesla autopilot", "starlink", "neuralink", "spacex starship",
    # Entertainment and sport
    "squid game", "barbenheimer", "oppenheimer", "taylor swift eras tour", "bts hiatus",
    "world cup 2022", "olympics 2021", "olympics 2024", "euro 2020", "fifa world cup qatar",
    "messi miami",
    # Politics and climate
    "roe v wade overturn", "blm protests 2021", "climate strike 2022", "cop26", "cop28",
    "net zero 2050", "heat dome", "wildfires 2021", "wildfires 2023", "el niño 2023",
    # Bare years. A missing comma previously fused "work from home" and "2021"
    # into the single string "work from home2021", silently dropping "2021".
    "2021", "2022", "2023", "2024",
]

# Matches a standalone year 2021-2024. Because the bare year strings above are
# matched as substrings, they already cover every line this pattern matches.
year_pattern = re.compile(r'\b(202[1-4])\b')

# Find indices of English lines likely post-2020
# A line qualifies when it contains any keyword (case-insensitive substring
# match) or a standalone year matched by year_pattern. Indices are 0-based
# line numbers in the English file.
post2020_indices = []
with open(en_path, "r", encoding="utf-8") as en_file:
    for idx, line in enumerate(en_file):
        # Keywords are stored lower-case, so compare against a lower-cased copy.
        line_lower = line.lower()
        # The list is named post2020_keywords; the previous reference to an
        # undefined `keywords` raised NameError on the first line read.
        if any(kw in line_lower for kw in post2020_keywords) or year_pattern.search(line):
            post2020_indices.append(idx)

# Write filtered data to new files
# Set membership keeps the per-line check constant-time; looking each line
# number up in the list rescans every collected index for every corpus line.
selected_indices = set(post2020_indices)
written_pairs = 0
with open(en_path, "r", encoding="utf-8") as en_file, \
     open(ko_path, "r", encoding="utf-8") as ko_file, \
     open(out_en_path, "w", encoding="utf-8") as out_en, \
     open(out_ko_path, "w", encoding="utf-8") as out_ko:
    # zip stops at the shorter file, so only complete pairs are ever written.
    for idx, (en_line, ko_line) in enumerate(zip(en_file, ko_file)):
        if idx in selected_indices:
            out_en.write(en_line)
            out_ko.write(ko_line)
            written_pairs += 1

# Report the pairs actually written; this can be lower than the number of
# matched English lines if the Korean file has fewer lines.
print(f"Found and wrote {written_pairs} post-2020 sentence pairs.")


Done! Found and wrote 20778 post-2020 sentence pairs.
