<a href="https://colab.research.google.com/github/IKeeso/Project9_data/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install stanza
!pip install conllu
!git clone https://github.com/IKeeso/Project9_data.git

In [None]:
import stanza
from stanza.utils.conll import CoNLL
from conllu import parse
from conllu import TokenList
import re

# List the contents of the cloned Project9_data directory.
!ls Project9_data


In [None]:
# Download the Stanza models for language code 'sv' and build the pipeline.
# `nlp` is a module-level global: tag_and_convert() calls it to annotate text.
stanza.download('sv')
nlp = stanza.Pipeline(lang='sv')

In [4]:
def tag_and_convert(filename, data_dir="/content/Project9_data"):
  """Annotate text with the global `nlp` pipeline and write it as CoNLL-U.

  Args:
    filename: Either the path of a UTF-8 text file (resolved after changing
      into `data_dir`) or, when no such file exists, the text to annotate.
    data_dir: Directory to switch into before resolving `filename`. Defaults
      to the location the notebook clones its data to. If the directory does
      not exist the working directory is left untouched.

  Side effects:
    Changes the process working directory to `data_dir` (later cells rely on
    this to find the generated files) and writes `<name>.conllu`, where
    `<name>` is everything in `filename` before its first ".".
    For raw text that means the first sentence becomes the file name.

  Raises:
    OSError / UnicodeDecodeError: if `filename` is an existing file that
      cannot be read as UTF-8. These are no longer silently swallowed.
  """
  import os  # local import so the function does not depend on other cells' imports

  # Plain-Python replacement for the `%cd` magic; best-effort like the magic,
  # which reports a missing directory but does not abort.
  if os.path.isdir(data_dir):
    os.chdir(data_dir)

  # Decide explicitly between "path to a file" and "literal text" instead of
  # relying on a bare except, which also hid permission and encoding errors.
  if os.path.isfile(filename):
    with open(filename, 'r', encoding='utf-8') as file:
      text = file.read()
  else:
    text = filename

  doc = nlp(text)
  # Output name: part before the first "." (the __main__ cell derives the
  # same name, so the two must stay in sync).
  pathname = filename.split(".")[0]
  conllufilename = f"{pathname}.conllu"
  CoNLL.write_doc2conll(doc, conllufilename)

In [None]:
# Tag a text file; the result is written to "expressen.conllu".
tag_and_convert("expressen.txt")
# The argument is not an openable file here, so tag_and_convert tags the string
# itself. The output name is the part before the first ".", i.e.
# "Personer som är på spektrumet.conllu".
tag_and_convert("Personer som är på spektrumet. Personer på spektrumet. Personer som har autism")

In [106]:
class SemanticAnalyzer:
  """Find person-first (PFL) and identity-first (IFL) autism wording in a CoNLL-U file.

  Constructing an instance parses the file and runs both searches. Results are
  dicts mapping a sentence id (string) to that sentence's token rows:

    data                  every parsed sentence
    PFL_sentence_matches  sentences where "autism" follows "har"/"med"
                          (directly, or through an "och" coordination)
    IFL_sentence_matches  sentences containing a form of "autistisk"

  A token row is the list of tab-separated CoNLL-U columns. Only column 0 (ID),
  column 1 (FORM) and column 6 (HEAD) are read by this class.
  """

  # Compiled once; "har"/"med" must be the whole word (see person_first_lang).
  _PFL_MARKER = re.compile(r"har|med", re.IGNORECASE)
  # Substring search, so inflected forms such as "autistiska" also match.
  _IFL_MARKER = re.compile(r"autistisk", re.IGNORECASE)
  # A word starting with "autism", or "spektrumet" anywhere in the sentence.
  _AUTISM_MENTION = re.compile(r"\bautism|spektrumet")

  def __init__(self, file_path):
    """Parse `file_path` and compute the PFL and IFL matches immediately."""
    self.file_path = file_path
    self.data = self.parse_conll_u(self.file_path)
    self.PFL_sentence_matches = self.person_first_lang(self.data)
    self.IFL_sentence_matches = self.identity_first_lang(self.data)

  def parse_conll_u(self, file_path):
    """Read a CoNLL-U file into {sentence id: [token rows]}.

    - Every comment line ("#...") is metadata; only "# sent_id = X" is used.
      Comments may appear in any order and are never stored as tokens.
    - Only rows whose ID is a plain integer are kept, so multiword-token
      ranges ("1-2") and empty nodes ("1.1") are skipped. This keeps
      `row position == ID - 1`, which person_first_lang relies on for HEAD.
    - A sentence without a sent_id is stored under its running index.
    - The final sentence is stored even without a trailing blank line.

    Raises:
      OSError: if the file cannot be opened.
    """
    data = {}
    key = None
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
      for raw_line in file:
        line = raw_line.strip()
        if line == '':
          # Blank line terminates the current sentence.
          self._store_sentence(data, key, sentence)
          key, sentence = None, []
        elif line.startswith('#'):
          if line.startswith('# sent_id'):
            _, separator, value = line.partition('=')
            if separator:
              key = value.strip()
        else:
          columns = line.split('\t')
          if len(columns) > 1 and columns[0].isdecimal():
            sentence.append(columns)
    # Flush a last sentence that was not followed by a blank line.
    self._store_sentence(data, key, sentence)
    return data

  @staticmethod
  def _store_sentence(data, key, sentence):
    """Store a finished sentence in `data`; empty sentences are ignored."""
    if not sentence:
      return
    if key is None:
      key = str(len(data))
    data[key] = sentence

  @staticmethod
  def _head_of(sentence, token):
    """Return the row that `token`'s HEAD column points to, or None.

    None is returned for a missing token, a non-numeric HEAD, HEAD 0 (root)
    or a HEAD outside the sentence. This avoids negative-index wrap-around.
    """
    if token is None or len(token) <= 6 or not token[6].isdecimal():
      return None
    head = int(token[6])
    if head < 1 or head > len(sentence):
      return None
    return sentence[head - 1]

  def person_first_lang(self, data):
    """Return the sentences that use person-first wording around "autism".

    A sentence matches when a token with the exact form "autism" is either
      * directly preceded by the whole word "har" or "med" (any case), or
      * directly preceded by "och" (any case) and the head of its head is
        "har" or "med", e.g. "har ADHD och autism".

    NOTE(review): sentences that only mention "spektrumet" pass has_autism()
    but can never match here, because only the form "autism" is inspected.
    Confirm whether "på spektrumet" should count as person-first.
    """
    PFL_sents = {}

    for sent_id, sentence in data.items():
      if not self.has_autism(sentence):
        continue
      for position, token in enumerate(sentence):
        # A sentence-initial "autism" has no preceding word to inspect.
        if token[1] != "autism" or position == 0:
          continue
        previous_form = sentence[position - 1][1]
        # fullmatch: "medan" or "hare" must not count as "med"/"har".
        if self._PFL_MARKER.fullmatch(previous_form):
          PFL_sents[sent_id] = sentence
          break
        if previous_form.lower() == "och":
          governor = self._head_of(sentence, self._head_of(sentence, token))
          if governor is not None and self._PFL_MARKER.fullmatch(governor[1]):
            PFL_sents[sent_id] = sentence
            break
    return PFL_sents

  def identity_first_lang(self, data):
    """Return the sentences containing "autistisk" in any case or inflection."""
    IFL_sents = {}

    for sent_id, sentence in data.items():
      words = ' '.join(token[1] for token in sentence)
      if self._IFL_MARKER.search(words):
        IFL_sents[sent_id] = sentence

    return IFL_sents

  def has_autism(self, sentence):
    """Return True if the sentence mentions autism or the spectrum.

    True when the space-joined forms contain a word starting with "autism"
    or the string "spektrumet" at any position (case-sensitive).
    """
    words = ' '.join(token[1] for token in sentence)
    return self._AUTISM_MENTION.search(words) is not None

####

def main(file, lang):
  """Print every sentence of a CoNLL-U file that matches the chosen wording.

  Args:
    file: Path of the .conllu file to analyse.
    lang: "PFL" (person-first) or "IFL" (identity-first); case and
      surrounding whitespace are ignored.

  Prints one "Sentence ID: ... | ..." line per match followed by the total.
  For any other `lang` an error message is printed and nothing is analysed.
  """
  import os  # local import so the function does not depend on other cells' imports

  # Validate the cheap input first so a typo does not cost a full parse.
  choice = lang.strip().lower()
  if choice not in ("pfl", "ifl"):
    print("Invalid language choice. Please choose either 'PFL' or 'IFL'.")
    return

  # Use the path exactly as given: lower-casing it breaks mixed-case names on
  # case-sensitive filesystems. The lower-cased form is kept only as a
  # fallback for callers that relied on the old behaviour.
  path = file if os.path.exists(file) else file.lower()
  analyzer = SemanticAnalyzer(file_path=path)
  if choice == "pfl":
    matches = analyzer.PFL_sentence_matches
  else:
    matches = analyzer.IFL_sentence_matches

  for sent_id, sentence in matches.items():
    print(f"Sentence ID: {sent_id} | {' '.join(token[1] for token in sentence)}")
    print("")

  print(f"Total matching sentences: {len(matches)}")

if __name__ == "__main__":
    # Interactive entry point: ask for a file and a wording style, tag the
    # file first if it is not CoNLL-U yet, then print the matching sentences.
    file_path = input("Enter the file path: ").strip()
    # Test the suffix rather than split(".")[1]: that raised IndexError for
    # names without a dot and misjudged paths such as "./data.conllu".
    if not file_path.endswith(".conllu"):
      tag_and_convert(file_path)
      # Mirrors tag_and_convert's output name: everything before the first ".".
      file_path = f"{file_path.split('.')[0]}.conllu"
    lang = input("Enter the language (PFL or IFL): ")
    main(file_path, lang)

Enter the file path: expressen.txt
/content/Project9_data
Enter the language (PFL or IFL): ifl
Sentence ID: 0 | ” Det ser otroligt olika ut för olika autistiska personer ”

Sentence ID: 9 | – Det ser otroligt olika ut för olika autistiska personer och spektrumet kan vara stort i en och samma person , säger hon .

Sentence ID: 14 | – Stödet från Postkodlotteriet gör att vi kan erbjuda läger för autistiska barn och unga med deras föräldrar .

Total matching sentences: 3
