**Model Source:** https://huggingface.co/papluca/xlm-roberta-base-language-detection

**Purpose:** Perform language detection on large files that cannot be processed via Google Sheets or other limited free APIs. The performance and accuracy are identical to those of the Google Sheets `DETECTLANGUAGE` function.

In [None]:
!pip install transformers

In [None]:
import pandas as pd
from transformers import pipeline
from joblib import Parallel, delayed

# Build the text-classification pipeline once; the same checkpoint provides
# both the model weights and the tokenizer.
language_detection_pipe = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
    tokenizer="papluca/xlm-roberta-base-language-detection",
)

input_csv_file = "/content/input.csv"
output_csv_file = "Lang detect output.csv"

# The input CSV must contain a "Text" column holding the strings to classify.
df = pd.read_csv(input_csv_file)

# Rough sizing heuristic: a RAM budget divided by an assumed per-text
# footprint gives the maximum number of rows handled per batch.
target_ram_usage_gb = 10
text_memory_usage_gb = 0.001
max_batch_size = int(target_ram_usage_gb / text_memory_usage_gb)
# Clamp to at least 1: an empty CSV would otherwise give a batch size of 0,
# which is not a valid step for range() when the batches are generated.
batch_size = max(1, min(max_batch_size, len(df)))

detected_languages = []

# pandas reads empty cells as NaN (a float), on which len() raises TypeError.
# Count missing cells as empty strings and measure everything else as text.
text_lengths = df["Text"].fillna("").astype(str).str.len()
average_text_length = text_lengths.mean()
if pd.isna(average_text_length):
    # mean() of an empty column is NaN, which int() cannot convert.
    average_text_length = 0.0
# Character cap applied to each text before inference: 1.2x the average
# length, never more than 512 and never less than 1 character.
max_text_length = max(1, min(512, int(average_text_length * 1.2)))

def process_batch(batch_start, batch_end):
    """Detect the language of rows ``batch_start:batch_end`` of ``df["Text"]``.

    Reads the module-level ``df``, ``max_text_length`` and
    ``language_detection_pipe``.

    Args:
        batch_start: Position of the first row of the slice (inclusive).
        batch_end: Position one past the last row of the slice (exclusive).

    Returns:
        A list with one predicted label per row of the slice, in row order.
    """
    batch_texts = df["Text"][batch_start:batch_end].tolist()
    if not batch_texts:
        # Nothing to classify; avoid calling the pipeline with an empty list.
        return []

    # Missing cells arrive as NaN (a float) and would break slicing, so they
    # are classified as empty strings; other non-string values are stringified.
    # Slicing past the end of a string is a no-op, so no length check is needed.
    truncated_texts = [
        "" if pd.isna(text) else str(text)[:max_text_length]
        for text in batch_texts
    ]

    # A character cap does not bound the token count (one character can map to
    # one token or more), so also let the tokenizer truncate to the model's
    # maximum sequence length.
    batch_languages = language_detection_pipe(truncated_texts, truncation=True)
    return [result["label"] for result in batch_languages]

# Language code -> English display name. Built once at module level instead
# of on every call, since the lookup runs once per row of the input file.
_LANGUAGE_NAMES = {
    "af": "Afrikaans",
    "am": "Amharic",
    "ar": "Arabic",
    "az": "Azerbaijani",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "bs": "Bosnian",
    "ca": "Catalan",
    "cs": "Czech",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "el": "Greek",
    "en": "English",
    "eo": "Esperanto",
    "es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Persian",
    "fi": "Finnish",
    "fil": "Filipino",
    "fr": "French",
    "ga": "Irish",
    "gl": "Galician",
    "gu": "Gujarati",
    "he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "ht": "Haitian Creole",
    "hu": "Hungarian",
    "hy": "Armenian",
    "id": "Indonesian",
    "is": "Icelandic",
    "it": "Italian",
    "ja": "Japanese",
    "jv": "Javanese",
    "ka": "Georgian",
    "kk": "Kazakh",
    "km": "Khmer",
    "kn": "Kannada",
    "ko": "Korean",
    "ku": "Kurdish",
    "ky": "Kyrgyz",
    "la": "Latin",
    "lb": "Luxembourgish",
    "lo": "Lao",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mg": "Malagasy",
    "mi": "Maori",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mr": "Marathi",
    "ms": "Malay",
    "mt": "Maltese",
    "nb": "Norwegian Bokmål",
    "ne": "Nepali",
    "nl": "Dutch",
    "nn": "Norwegian Nynorsk",
    "no": "Norwegian",
    "oc": "Occitan",
    "or": "Oriya",
    "pa": "Punjabi",
    "pl": "Polish",
    "ps": "Pashto",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "si": "Sinhala",
    "sk": "Slovak",
    "sl": "Slovenian",
    "sq": "Albanian",
    "sr": "Serbian",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "tl": "Tagalog",
    "tr": "Turkish",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "xh": "Xhosa",
    "yi": "Yiddish",
    "yo": "Yoruba",
    "zh": "Chinese",
    "zu": "Zulu",
    # You can add more language mappings as needed
}


def convert_to_full_name(initial):
    """Return the English language name for a language code.

    Args:
        initial: Language code such as ``"en"`` or ``"fil"``. The lookup is
            exact (case-sensitive, no whitespace stripping).

    Returns:
        The mapped language name, or ``initial`` itself unchanged when the
        code is not in the table.
    """
    return _LANGUAGE_NAMES.get(initial, initial)

# Split the rows into consecutive [start, end) slices of at most `batch_size`
# rows. The step is clamped to 1 because range() rejects a zero step, which is
# what an empty DataFrame would otherwise produce.
row_step = max(1, batch_size)
batch_bounds = [
    (batch_start, min(batch_start + row_step, len(df)))
    for batch_start in range(0, len(df), row_step)
]

# joblib's own `batch_size` argument is the number of *tasks* dispatched to a
# worker at once, not a number of rows. Passing the row batch size there
# grouped every task into a single dispatch, so only one worker did any work.
# Leaving it at its default lets the slices be spread over the workers.
# NOTE(review): with a process-based backend each worker presumably holds its
# own copy of the model and DataFrame - lower n_jobs if memory is tight.
results = Parallel(n_jobs=-1)(
    delayed(process_batch)(batch_start, batch_end)
    for batch_start, batch_end in batch_bounds
)

# Results come back in submission order, so concatenating them keeps the
# labels aligned with the DataFrame rows.
for result in results:
    detected_languages.extend(result)

# Replace each predicted code by its full language name where one is known.
df["Detected_Language"] = [convert_to_full_name(lang) for lang in detected_languages]

df.to_csv(output_csv_file, index=False)

print("Language detection complete. Output saved to", output_csv_file)


In [None]:
# Show the DataFrame, including the added "Detected_Language" column, as the cell output.
df