<a href="https://colab.research.google.com/github/Natesd05/URS-Linguistic-Justice/blob/main/fastText_File_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fasttext
!pip install -U spacy
import fasttext
import re
!pip install lingua-language-detector
from lingua import Language, LanguageDetectorBuilder


Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 68.8/68.8 kB 1.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... done
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199773 sha256=a50043e2001ee3abb4f655e4bf6ab14905ad5fb21a1c8192a45499e1c760bd24
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1
Collecting spacy
  Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━

In [None]:
class LanguageIdentification:
    """Measure how much Spanish appears in a newline-delimited text file.

    Two independent analyses are offered:

    * ``lang_printout`` labels each non-empty line as English or Spanish
      with the fastText model loaded in ``__init__``.
    * ``sentence_sorter`` runs the lingua detector on each
      whitespace-separated word of a line and files the line under one of
      four categories (no / little / most / all Spanish).

    The ``print_*`` methods only report the counters and lists those two
    analyses fill in; they perform no analysis themselves.
    """

    # fastText labels kept by lang_printout, mapped to the printed code.
    _LABEL_TO_CODE = {'__label__en': 'en', '__label__es': 'es'}

    # A line is counted and printed only when its best score exceeds this.
    _MIN_CONFIDENCE = 0.45

    # Category names double as attribute names (``<name>`` is the list,
    # ``<name>_count`` the counter) and as output file stems.
    _CATEGORIES = ('no_spanish', 'little_spanish', 'most_spanish', 'all_spanish')

    def __init__(self, file_path, model_path="/lid.176.ftz"):
        """Load the fastText model and read the text to analyse.

        Args:
            file_path: Path of the text file; each line is treated as one
                sentence.
            model_path: Path of the fastText language-identification model.
                Defaults to the location this class has always used.

        Raises:
            ValueError: Raised by ``fasttext.load_model`` when the model
                file cannot be opened.
            OSError: If ``file_path`` cannot be read.
        """
        self.model = fasttext.load_model(model_path)
        self.output = {}
        self.spanish_words = 0
        self.english_words = 0
        # Despite the name, this counts the non-empty lines seen by
        # lang_printout, not individual words.
        self.total_words = 0
        self.word_bank = ""
        self.no_spanish = []
        self.little_spanish = []
        self.most_spanish = []
        self.all_spanish = []
        self.no_spanish_count = 0
        self.little_spanish_count = 0
        self.most_spanish_count = 0
        self.all_spanish_count = 0
        self.total_sentence_count = 0
        # Kept for backward compatibility: sentence_sorter used to leave
        # these two scratch counters on the instance, reset to 0.
        self.sentence_word_count = 0
        self.spanish_count = 0

        # Explicit encoding so accented characters decode the same way
        # regardless of the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            self.word_bank = file.read()

    def predict_lang(self, text):
        """Return the model's top-5 predictions for ``text``.

        Newlines are replaced with spaces and the text is stripped before
        it is passed to the model. The result is returned unchanged;
        ``lang_printout`` reads it as a ``(labels, scores)`` pair.
        """
        text = text.replace('\n', ' ').strip()
        return self.model.predict(text, k=5)

    def lang_printout(self):
        """Print an English/Spanish label for every non-empty line.

        Updates ``total_words``, ``english_words``, ``spanish_words`` and
        ``output`` (line -> list of ``(label, score)`` for the en/es labels
        found in the top-5 predictions). Lines with neither label in the
        top 5 are printed as ``N/A``. Lines whose best en/es score is at or
        below the confidence threshold are counted in ``total_words`` but
        are neither printed nor attributed to a language.
        """
        print("{:<5} {:<75} {:<10} {:<10}".format("Index", "Sentence", "Language", "Probability"))

        index = 0
        for sentence in self.word_bank.split('\n'):
            sentence = sentence.strip()
            if not sentence:
                continue

            index += 1
            self.total_words += 1
            lang = self.predict_lang(sentence)
            candidates = [
                (label, score)
                for label, score in zip(lang[0], lang[1])
                if label in self._LABEL_TO_CODE
            ]
            self.output[sentence] = candidates

            if not candidates:
                print("{:<5} {:<75} {:<10}".format(index, sentence, "N/A"))
                continue

            best_label, best_score = max(candidates, key=lambda pair: pair[1])
            if best_score is not None and best_score > self._MIN_CONFIDENCE:
                language = self._LABEL_TO_CODE[best_label]
                if language == 'en':
                    self.english_words += 1
                else:
                    self.spanish_words += 1
                print("{:<5} {:<75} {:<10} {:<10.4f}".format(index, sentence, language, best_score))

    @staticmethod
    def _categorize(spanish_count, word_count):
        """Return the category name for a sentence's Spanish word count."""
        if spanish_count == 0:
            return 'no_spanish'
        # Checked before the 1-3 range so that short sentences made up
        # entirely of Spanish words are not filed under "little Spanish".
        if spanish_count == word_count:
            return 'all_spanish'
        if spanish_count <= 3:
            return 'little_spanish'
        return 'most_spanish'

    def sentence_sorter(self):
        """Sort every non-empty line into a Spanish-content category.

        Each whitespace-separated word is passed to a lingua detector
        restricted to English, French, German and Spanish; the number of
        words detected as Spanish decides the category:

        * 0 words                       -> ``no_spanish``
        * every word                    -> ``all_spanish``
        * 1-3 words (but not all)       -> ``little_spanish``
        * more than 3 words (not all)   -> ``most_spanish``

        The sentence (with a trailing newline) is appended to the matching
        list, the matching counter and ``total_sentence_count`` are
        incremented, and the sentence is appended to ``/<category>.txt``.
        Files are opened in append mode, so repeated runs accumulate.
        """
        # The detector does not depend on the word being examined, so it is
        # built once per call rather than once per word.
        detector = LanguageDetectorBuilder.from_languages(
            Language.ENGLISH, Language.FRENCH, Language.GERMAN, Language.SPANISH
        ).build()

        new_entries = {category: [] for category in self._CATEGORIES}

        for sentence in self.word_bank.split('\n'):
            sentence = sentence.strip()
            if not sentence:
                continue

            words = sentence.split()
            spanish_count = sum(
                1 for word in words
                if detector.detect_language_of(word) == Language.SPANISH
            )
            category = self._categorize(spanish_count, len(words))

            self.total_sentence_count += 1
            counter_name = category + '_count'
            setattr(self, counter_name, getattr(self, counter_name) + 1)
            getattr(self, category).append(sentence + '\n')
            new_entries[category].append(sentence + '\n')

        # One append per category instead of reopening a file per sentence.
        # A file is only touched when this run produced entries for it.
        for category, lines in new_entries.items():
            if lines:
                with open('/' + category + '.txt', 'a', encoding='utf-8') as file:
                    file.writelines(lines)

    @staticmethod
    def _percent(count, total):
        """Return ``count`` as a percentage of ``total`` (0.0 if total is 0)."""
        if not total:
            return 0.0
        return (count / total) * 100

    def print_dictionary(self):
        """Print the line -> en/es predictions mapping built by lang_printout."""
        print(self.output)

    def print_spanish(self):
        """Print the number of lines lang_printout labelled Spanish."""
        print(self.spanish_words)

    def print_english(self):
        """Print the number of lines lang_printout labelled English."""
        print(self.english_words)

    def percentage_spanish(self):
        """Print the fraction (0-1) of lines lang_printout labelled Spanish.

        Prints ``0.0`` when no lines have been processed yet.
        """
        if self.total_words:
            print(self.spanish_words / self.total_words)
        else:
            print(0.0)

    def print_total_words(self):
        """Print the number of non-empty lines processed by lang_printout."""
        print(self.total_words)

    def print_no_spanish(self):
        """Print every sentence categorised as containing no Spanish."""
        for sentence in self.no_spanish:
            print(sentence)

    def print_little_spanish(self):
        """Print every sentence categorised as containing little Spanish."""
        for sentence in self.little_spanish:
            print(sentence)

    def print_most_spanish(self):
        """Print every sentence categorised as mostly Spanish."""
        for sentence in self.most_spanish:
            print(sentence)

    def print_all_spanish(self):
        """Print every sentence categorised as entirely Spanish."""
        for sentence in self.all_spanish:
            print(sentence)

    def print_categories_counts(self):
        """Print the number of sentences in each of the four categories."""
        print(f"No Spanish: {self.no_spanish_count}")
        print(f"Little Spanish: {self.little_spanish_count}")
        print(f"Most Spanish: {self.most_spanish_count}")
        print(f"All Spanish: {self.all_spanish_count}")

    def print_spanish_sentence_percentage(self):
        """Print the percentage of sentences that are entirely Spanish."""
        percentage = self._percent(self.all_spanish_count, self.total_sentence_count)
        print(f"Percentage of sentences in Spanish: {percentage:.2f}%")

    def print_total_sentence_count(self):
        """Print the number of sentences processed by sentence_sorter."""
        print(f"Total number of sentences: {self.total_sentence_count}")

    def print_percentage_no(self):
        """Print the percentage of sentences with no Spanish words."""
        print(f"Percentage no spanish: {self._percent(self.no_spanish_count, self.total_sentence_count):.2f}%")

    def print_percentage_little(self):
        """Print the percentage of sentences with little Spanish."""
        print(f"Percentage little spanish: {self._percent(self.little_spanish_count, self.total_sentence_count):.2f}%")

    def print_percentage_most(self):
        """Print the percentage of sentences that are mostly Spanish."""
        print(f"Percentage most spanish: {self._percent(self.most_spanish_count, self.total_sentence_count):.2f}%")

    def print_percentage_all(self):
        """Print the percentage of sentences that are entirely Spanish."""
        print(f"Percentage all spanish: {self._percent(self.all_spanish_count, self.total_sentence_count):.2f}%")

# Constructing the object loads the language model and reads the corpus.
LANGUAGE = LanguageIdentification("/clean_sentences.txt")

# The fastText sentence-level reports are currently disabled:
#   lang_printout, print_dictionary, print_spanish, print_english,
#   percentage_spanish, print_total_words

# Word-level sorting first, then the summary reports, in this exact order.
for _report in (
    LANGUAGE.sentence_sorter,
    LANGUAGE.print_most_spanish,
    LANGUAGE.print_categories_counts,
    LANGUAGE.print_spanish_sentence_percentage,
    LANGUAGE.print_total_sentence_count,
    LANGUAGE.print_percentage_no,
    LANGUAGE.print_percentage_little,
    LANGUAGE.print_percentage_most,
    LANGUAGE.print_percentage_all,
):
    _report()





ValueError: /lid.176.ftz cannot be opened for loading!