Detect English

In [None]:
import pandas as pd
import langid
from tqdm import tqdm

# ===============================================
# 📂 File Paths (Edit as needed)
# ===============================================
INPUT_CSV = r"C:\Users\User\Desktop\Datathon\spam_output(A).csv"
OUTPUT_CSV = r"C:\Users\User\Desktop\Datathon\lang_detect.csv"

# ===============================================
# 📥 Load CSV
# ===============================================
# Read INPUT_CSV into `df`. A missing file is reported and aborts the run.
try:
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")
except FileNotFoundError:
    print(f"❌ File not found: {INPUT_CSV}")
    # A bare `raise SystemExit` carries no code and therefore exits with
    # status 0 (success); pass 1 so callers can tell the load failed.
    raise SystemExit(1)
else:
    # Kept outside the `try` so only the read itself is guarded.
    print(f"✅ Loaded {len(df)} rows from: {INPUT_CSV}")

# ===============================================
# 🧠 Language Detection Logic
# ===============================================
def is_english(text):
    """Return 1 if langid labels *text* as English ("en"), otherwise 0.

    Values that are not strings, and strings that are empty or contain only
    whitespace, yield 0 without invoking the classifier.
    """
    if isinstance(text, str) and text.strip() != "":
        # langid.classify returns a pair; only the language code is used.
        detected, _ = langid.classify(text)
        if detected == "en":
            return 1
    return 0

# ===============================================
# 🚀 Detect English with Clean tqdm Bar
# ===============================================
columns_to_check = ["textOriginal", "title"]

# For each requested column present in `df`, add an `is_en_<column>` column
# holding 1 where is_english() accepts the cell and 0 otherwise.
for col in columns_to_check:
    if col not in df.columns:
        print(f"⚠️ Column '{col}' not found. Skipping.")
        continue

    print(f"🧠 Detecting English in column: {col}")

    # Classification is by far the slowest step, so remember the verdict for
    # each distinct string and reuse it when the same text appears again.
    # The resulting flags are the same as classifying every row separately.
    verdict_by_text = {}
    flags = []
    for text in tqdm(
        df[col],
        total=len(df),
        desc=f"Detecting English: {col}",
        unit="rows",
    ):
        if not isinstance(text, str):
            # Non-strings (e.g. NaN for empty cells) are not cached: NaN is
            # not a dependable dict key and is_english() rejects them cheaply.
            flags.append(is_english(text))
            continue
        if text not in verdict_by_text:
            verdict_by_text[text] = is_english(text)
        flags.append(verdict_by_text[text])

    df[f"is_en_{col}"] = flags

# ===============================================
# 💾 Save Result
# ===============================================
# Write the frame, with whatever is_en_* columns were added, to OUTPUT_CSV
# as UTF-8 and without the index column.
df.to_csv(path_or_buf=OUTPUT_CSV, encoding="utf-8", index=False)
print(f"🎉 English detection complete. Output saved to:\n{OUTPUT_CSV}")


✅ Loaded 889782 rows from: C:\Users\User\Desktop\Datathon\spam_output(A).csv
🧠 Detecting English in column: textOriginal


Detecting English: textOriginal: 100%|██████████████████████████████████████| 889782/889782 [53:01<00:00, 279.68rows/s]


🧠 Detecting English in column: title


Detecting English: title: 100%|█████████████████████████████████████████████| 889782/889782 [54:37<00:00, 271.50rows/s]


🎉 English detection complete. Output saved to:
C:\Users\User\Desktop\Datathon\lang_detect.csv


Remove Non-English

In [None]:
import pandas as pd

# ==== CONFIG ====
INPUT_CSV  = r"C:\Users\User\Desktop\Datathon\lang_detect.csv"
OUTPUT_CSV = r"C:\Users\User\Desktop\Datathon\lang_remove.csv"

# ==== 1) Read the CSV that carries the is_en_* flag columns ====
df = pd.read_csv(INPUT_CSV)

# ==== 2) Keep only rows flagged 1 in is_en_textOriginal ====
english_mask = df["is_en_textOriginal"].eq(1)
df_en = df.loc[english_mask]

# ==== 3) Write the kept rows (utf-8-sig prepends a BOM) ====
df_en.to_csv(OUTPUT_CSV, encoding="utf-8-sig", index=False)

# Report how many rows survived the filter and where they were written.
print(f"✅ Done! From {len(df)} rows → kept {len(df_en)} English rows.")
print(f"File saved to: {OUTPUT_CSV}")


✅ Done! From 889782 rows → kept 623223 English rows.
File saved to: C:\Users\User\Desktop\Datathon\lang_remove.csv
