In [None]:
import spacy
from spacy import displacy
import pandas as pd
import numpy as np
from pathlib import Path
import nltk
from nltk.corpus import wordnet as wn
import swifter
import shutil
import os
import re

# Load the spaCy pipeline used by classify_text; the "en_core_web_trf" package
# must be installed in the kernel's environment.
nlp = spacy.load("en_core_web_trf")
# Raise spaCy's per-document length limit so long corpus lines are accepted.
nlp.max_length = 2000000

# Fetch the WordNet data that Phrase.get_word_category queries via `wn.synsets`.
nltk.download('wordnet')
nltk.download('omw-1.4') 

# Algorithm
## Classes

In [3]:
class Phrase:
    """A subject or object phrase collected from a dependency parse.

    A phrase holds its tokens, any nested sub-phrases and, once
    ``do_parsing`` has run, a feature dictionary (``parsing``) plus a
    description of every pronoun among its tokens (``pronouns``).

    The token objects are expected to expose ``text``, ``pos_``, ``lemma_``,
    ``ent_type_``, ``dep_``, ``i`` and ``morph.get(...)`` (presumably spaCy
    tokens -- confirm against the caller).

    Args:
        type: Phrase kind; the rest of the file uses "subj" and "obj".
        dep: Dependency label of the token that opened the phrase.
        tokens: Tokens of the phrase. A fresh list is created when omitted.
        sub_phrases: Nested phrases. A fresh list is created when omitted.
    """

    # Lemmas that mark a phrase as containing a collective noun.
    _COLLECTIVE_NOUNS = frozenset(["people","army","audience","band","board","cast","choir","class","club","coalition","committee","community","company","congregation","corporation","council","crew","crowd","family","firm","gang","group","jury","majority","minority","mob","orchestra","panel","parliament","party","public","school","staff","team","troupe"])

    # Pipe-separated list of nouns treated as generic; it is lower-cased and split once below.
    _NEUTRAL_NOUNS = 'Teenager|Challenger|Entertainer|Leader|Acquaintance|Catcher|Representative|Guide|Skier|Champion|Lifeguard|Biker|Consultant|Driver|Patient|Jogger|Author|Seafarer|Protector|Researcher|Comedian|Navigator|Sailor|Sibling|Master|Elderly|Nurse|Authority|Journalist|Lobbyist|Spectator|Activist|Pitcher|Competitor|Expert|Director|Child|Enthusiast|Counselor|Orator|Novelist|Peacemaker|Snowboarder|Manager|Officer|Trainee|Keeper|Helper|Intern|Negotiator|Classmate|Shortstop|Novice|Student|Supervisor|Doctor|Mariner|Captain|Coach|Artist|Official|Chief|Controller|Learner|Skateboarder|Singer|Dancer|Judge|Scholar|Spouse|Advocate|Trekker|Instructor|Jumper|Backpacker|Fielder|inistrator|Nomad|Boss|Advisor|Acrobat|Stranger|Politician|Batter|Pilot|Pupil|Cyclist|Runner|Apprentice|Correspondent|Fan|Academic|Rival|Umpire|Climber|Diplomat|Moderator|Demonstrator|Colleague|Volunteer|Outfielder|Writer|Ambassador|Diver|Executive|Specialist|Traveler|Speaker|Juggler|Watcher|Mediator|Someone|Mentor|Adventurer|Guardian|Wanderer|Marathoner|Athlete|Rescuer|Spokesperson|Resident|Steward|Professional|Roommate|Scientist|Linesman|Technician|Teacher|Observer|Amateur|Supporter|Campaigner|Citizen|Narrator|Analyst|Therapist|Player|Caregiver|Coordinator|Delegate|Explorer|Thrower|Mime|Adult|Neighbor|Tourist|Racer|Delivery person|Organizer|Baseman|Envoy|Magician|Hobbyist|Expeditionist|Partner|Storyteller|Defender|Clown|Educator|Motorcyclist|Messenger|Swimmer|Hurdler|Relative|Mountaineer|Friend|Reporter|Infielder|Follower|Participant|Actress|Fisherman|Caretaker|Reviewer|Sprinter|Playwright|Host|Executi|Protester|Broadcaster|Surfer|Tutor|Opponent|Custodian|Musician|Referee|Critic|Hiker|Presenter|Courier|Arbitrator|Parent|Administrator|Poet|Actor|Hitter|Trainer|Performer|Debater|Illusionist|Companion|Cousin'
    _GENERIC_NOUNS = frozenset(_NEUTRAL_NOUNS.lower().split("|"))

    # Indefinite pronouns that force a GENERIC / PLURAL classification.
    _INDEFINITE_SING_PRONOUNS = frozenset(["another", "any", "anybody", "anyone", "anything", "each", "everybody", "everyone", "everything", "nobody", "one", "somebody", "someone", "something"])
    _INDEFINITE_PLUR_PRONOUNS = frozenset(["few", "many", "several"])
    # Open question kept from the original notes: special handling of "all", "some" and "none".

    # Lazily filled mapping {normalised word: gender} read from ./gendered_words.json.
    _gendered_cache = None

    def __init__(self, type, dep, tokens=None, sub_phrases=None):
        # A list passed by the caller is stored as-is (not copied); only the
        # defaults are replaced so that instances never share one list object.
        self.tokens = tokens if tokens is not None else []
        self.type = type
        self.sub_phrases = sub_phrases if sub_phrases is not None else []
        self.parsing = None
        self.pronouns = None
        self.index = 0
        self.dep = dep
        self.word_category = []

    def __str__(self):
        """Return a one-line summary followed by the indented sub-phrases, if any."""
        sub_ph_str = ""
        if len(self.sub_phrases) > 0:
            sub_ph_str = "\n  SUBS:"
            for sub in self.sub_phrases:
                sub_ph_str = sub_ph_str + "\n    " + str(sub)
        return "Phrase(" + str(self.type) + " , " + str(self.dep) + " , " + " ".join([token.text for token in self.tokens]) + ")" + sub_ph_str

    def do_parsing(self):
        """Parse the sub-phrases, then compute ``parsing``, ``pronouns`` and ``index``."""
        for sub in self.sub_phrases:
            sub.do_parsing()

        # Start from a clean slate so a repeated call does not duplicate entries.
        self.word_category = []
        all_categories = []
        for token in self.tokens:
            if token.pos_ == "PRON":
                continue
            cats = self.get_word_category(token.text)
            if cats is not None:
                self.word_category.append(cats)
                all_categories.extend(cats)

        self.parsing = {
            "contains_name": self.contains_name(),
            "contains_orga": self.contains_orga(),
            "contains_person": "noun.person" in all_categories,
            # Derived from the WordNet categories, not from contain_group().
            "contain_group": "noun.group" in all_categories,
            "contain_collective_noun": self.contain_collective_noun(),
            "contain_generic_noun": self.contain_generic_noun(),
            "is_plural": self.is_plural(),
            "plural_is_neutral": self.plural_is_neutral(),
            "categories": list(set(all_categories))
        }

        self.parse_pronouns()
        self.order_tokens()

    def order_tokens(self):
        """Sort the tokens by their ``i`` attribute and store the smallest as ``index``."""
        self.tokens = sorted(self.tokens, key=lambda x: x.i)
        if self.tokens:
            self.index = self.tokens[0].i

    @staticmethod
    def _first_morph(token, feature):
        """Return the first value of a morphological feature, or None when absent."""
        values = token.morph.get(feature)
        return values[0] if len(values) > 0 else None

    def parse_pronouns(self):
        """Describe every PRON token of the phrase and store the list in ``pronouns``.

        Each entry is a dict with the keys ``token``, ``person``, ``case``,
        ``plural``, ``type`` ("refl", "poss" or "pers") and ``relates_to``.
        """
        pronouns = []
        for token in self.tokens:
            if token.pos_ != "PRON":
                continue

            case = self._first_morph(token, "Case")
            plural = self._first_morph(token, "Number") == "Plur"
            poss = self._first_morph(token, "Poss") == "Yes"
            reflex = self._first_morph(token, "Reflex") == "Yes"
            person = self._first_morph(token, "Person")

            # Record what each pronoun may refer to, e.g. object pronoun ->
            # subject / previous subject. A possessive pronoun in subject
            # position can refer to the object or the subject of the previous clause.
            relates_to = []
            if reflex or poss:
                relates_to.append("dsubj")
            if poss:
                relates_to.append("psubj")

            pronouns.append(
            {
                "token": token,
                "person": person,
                "case": case,
                "plural": plural,
                "type": "refl" if reflex else ("poss" if poss else "pers"),
                "relates_to": relates_to
            })
        self.pronouns = pronouns

    def get_all_pronouns(self):
        """Return ``[own pronouns, pronouns of sub-phrase 0, pronouns of sub-phrase 1, ...]``.

        Only direct sub-phrases are included; deeper nesting is not traversed.
        """
        return [self.pronouns] + [sub.pronouns for sub in self.sub_phrases]

    # Different parsing functions
    def contains_name(self):
        """Return True when any token's entity type contains "PERSON"."""
        return any("PERSON" in token.ent_type_ for token in self.tokens)

    def contains_orga(self):
        """Return True when any token's entity type contains "ORG"."""
        return any("ORG" in token.ent_type_ for token in self.tokens)

    def get_word_category(self, word):
        """Return the set of WordNet lexnames of ``word``, or None when it has no synsets."""
        synsets = wn.synsets(word)
        if not synsets:
            return None
        return {synset.lexname() for synset in synsets}

    def contain_group(self):
        """Return True when any token's dependency label contains "conj"."""
        return any("conj" in token.dep_ for token in self.tokens)

    def contain_collective_noun(self):
        """Return True when any token lemma is one of the collective nouns."""
        return any(token.lemma_ in self._COLLECTIVE_NOUNS for token in self.tokens)

    def contain_generic_noun(self):
        """Return True when any token lemma is one of the generic nouns."""
        return any(token.lemma_ in self._GENERIC_NOUNS for token in self.tokens)

    def is_plural(self):
        """Return True when any token carries the morphological number "Plur"."""
        return any("Plur" in token.morph.get("Number") for token in self.tokens)

    @classmethod
    def _gendered_lookup(cls):
        """Return {normalised word: gender}, reading ./gendered_words.json on first use.

        Words are normalised like the lookup key: underscores become spaces,
        then strip and lower-case. For duplicate words the first row wins.
        """
        if cls._gendered_cache is None:
            gendered_words = pd.read_json("./gendered_words.json")
            words = gendered_words['word'].str.replace("_", " ").str.strip().str.lower()
            lookup = {}
            for word, gender in zip(words, gendered_words['gender']):
                # Non-string entries (e.g. missing values) can never match a lemma.
                if isinstance(word, str):
                    lookup.setdefault(word, gender)
            cls._gendered_cache = lookup
        return cls._gendered_cache

    def plural_is_neutral(self):
        """Decide, from the first plural token, whether the plural is gender-neutral.

        Returns:
            False when there is no plural token, or when the first plural
            token's lemma is listed with gender "m" or "f"; True otherwise.
        """
        # Loaded before the loop so a missing file is reported even for an empty phrase.
        lookup = self._gendered_lookup()
        for token in self.tokens:
            if "Plur" in token.morph.get("Number"):
                return lookup.get(token.lemma_.strip().lower()) not in ("m", "f")
        return False

    def get_phrase_classification(self):
        """Classify the phrase as "SINGULAR", "GENERIC", "PLURAL" or None.

        Requires ``do_parsing`` to have filled ``parsing``. An indefinite
        pronoun among the phrase's pronouns overrides the feature-based result.
        """
        parsing = self.parsing
        pronouns = self.pronouns if self.pronouns is not None else []
        classification = None

        if parsing['contains_name'] and not parsing['contain_group']:
            classification = "SINGULAR"
        elif parsing['contains_person'] and not parsing['contain_group'] and not parsing['is_plural']:
            classification = "GENERIC"
        elif (parsing['contain_group'] or parsing['contain_collective_noun'] or parsing['is_plural']) and len(pronouns) == 0:
            classification = "PLURAL"

        for pronoun in pronouns:
            text = pronoun['token'].text.strip().lower()
            if text in self._INDEFINITE_SING_PRONOUNS:
                return "GENERIC"
            if text in self._INDEFINITE_PLUR_PRONOUNS:
                return "PLURAL"

        return classification

class Clause:
    """A clause with its subject and object phrases.

    Args:
        type: Clause label (e.g. "main", or a dependency label of the clause head).
        phrases: Stored on ``self.phrases``; a fresh list is created when omitted.
        sub_clauses: Stored unchanged on ``self.sub_clauses``.
    """

    def __init__(self, type = "", phrases=None, sub_clauses=""):
        self.type = type
        self.subj = []
        self.obj = []
        self.sub_clauses = sub_clauses
        # Never share one default list between instances.
        self.phrases = phrases if phrases is not None else []
        self.index = 0

    def __str__(self):
        """Return the clause type followed by its subject and object lists."""
        return "Clause(" + str(self.type) + "\n SUBJ: " + str(self.subj) + "\n OBJ: " + str(self.obj)

    def append_phrases(self, phrases):
        """Sort the given phrases into ``subj`` / ``obj`` by their ``type``; others are ignored."""
        for phrase in phrases:
            if phrase.type == "subj":
                self.subj.append(phrase)
            elif phrase.type == "obj":
                self.obj.append(phrase)

    def do_parsing(self):
        """Parse all phrases, order them by index and set the clause index.

        The clause index is the smallest phrase index, or 0 without phrases.
        """
        for phrase in self.subj:
            phrase.do_parsing()
        for phrase in self.obj:
            phrase.do_parsing()

        self.subj = sorted(self.subj, key=lambda x: x.index)
        self.obj = sorted(self.obj, key=lambda x: x.index)

        phrases = self.subj + self.obj
        self.index = min(phrase.index for phrase in phrases) if phrases else 0

    def _resolve_phrases(self, kind, phrases, context, parsed_subj):
        """Resolve every pronoun of ``phrases``; return (resolutions, parsed_subj)."""
        resolutions = []
        for index, phrase in enumerate(phrases):
            # Position 0 holds the phrase's own pronouns, position k >= 1
            # those of its sub-phrase k - 1.
            for subindex, pronoms_of_sub in enumerate(phrase.get_all_pronouns()):
                for pronom in pronoms_of_sub:
                    resolution, parsed_subj = self.resolve_single_pronom(kind, pronom, subindex, phrase, context, parsed_subj, index)
                    if resolution is not None:
                        resolutions.append(resolution)
        return resolutions, parsed_subj

    def pronom_relation(self, prev_clauses=None):
        """Resolve the pronouns of this clause, objects first, then subjects.

        Args:
            prev_clauses: Clauses preceding this one, oldest first. The
                argument itself is not modified.

        Returns:
            A list of ``PronomResolution`` objects.
        """
        context = list(prev_clauses) if prev_clauses is not None else []

        # The clause's own subjects are the nearest candidates, so they are
        # appended last and end up first after the reversal.
        subj_clause = Clause(type="SUBJ")
        subj_clause.subj = self.subj
        context.append(subj_clause)
        context.reverse()

        parsed_subj = False
        resolutions, parsed_subj = self._resolve_phrases("OBJ", self.obj, context, parsed_subj)

        if parsed_subj == False:
            subj_resolutions, parsed_subj = self._resolve_phrases("SUBJ", self.subj, context, parsed_subj)
            resolutions.extend(subj_resolutions)

        return resolutions

    def resolve_single_pronom(self, type, pronom, subindex, sub, prev_clauses, parsed_subj, index):
        """Find what a single pronoun refers to.

        Args:
            type: "OBJ" or "SUBJ", the role of the phrase ``sub``.
            pronom: Pronoun description dict (needs the key ``type``).
            subindex: 0 for the phrase's own pronouns, k for sub-phrase k - 1.
            sub: The phrase that contains the pronoun.
            prev_clauses: Candidate clauses, nearest first.
            parsed_subj: Flag that is passed through unchanged.
            index: Position of ``sub`` within ``self.obj`` / ``self.subj``.

        Returns:
            ``(resolution, parsed_subj)``; the resolution's ``relates_to`` and
            ``classification`` stay None when nothing was found.
        """
        relates_to_prev = False
        resolution = PronomResolution(pronom)

        # Possessive check
        if pronom['type'] == "poss":
            if subindex > 1:
                # Sub-phrases that precede the one holding the pronoun.
                for neighbour in sub.sub_phrases[0:(subindex-1)]:
                    classification = neighbour.get_phrase_classification()
                    if classification is not None:
                        resolution.relates_to = neighbour
                        resolution.classification = classification
                        return (resolution, parsed_subj)
                    elif neighbour.pronouns is not None:
                        relates_to_prev = True
            else:
                # Earlier phrases of the same role within this clause.
                if type == "OBJ" and index > 0:
                    earlier = self.obj[0:index]
                elif type == "SUBJ" and index > 0:
                    earlier = self.subj[0:index]
                else:
                    earlier = []
                for candidate in earlier:
                    classification = candidate.get_phrase_classification()
                    if classification is not None:
                        resolution.relates_to = candidate
                        resolution.classification = classification
                        return (resolution, parsed_subj)

                # Otherwise it refers to the subject.
                for subj in self.subj:
                    classification = subj.get_phrase_classification()
                    if classification is not None:
                        resolution.relates_to = subj
                        resolution.classification = classification
                        return (resolution, parsed_subj)
                    elif subj.pronouns is not None:
                        relates_to_prev = True

        if (pronom['type'] != "poss") or (relates_to_prev == True):
            # NOTE(review): the original had the no-op statement
            # `parsed_subj == True` here (a comparison, not an assignment), so
            # the flag was never set. It is left unset to keep the results
            # unchanged; assigning True would skip subject pronouns in
            # pronom_relation -- confirm the intended behaviour.
            found = self.parse_prev_clauses(prev_clauses)
            if found is not None:
                resolution.relates_to, resolution.classification = found

        return (resolution, parsed_subj)

    def parse_prev_clauses(self, prev_clauses):
        """Return ``(phrase, classification)`` of the first classified candidate, else None.

        Per clause the subjects are tried before the objects, and each phrase
        before its direct sub-phrases.
        """
        for prev_c in prev_clauses:
            for phrase in list(prev_c.subj) + list(prev_c.obj):
                for candidate in [phrase] + list(phrase.sub_phrases):
                    classification = candidate.get_phrase_classification()
                    if classification is not None:
                        return (candidate, classification)
        return None


class CompoundSentence:
    """A sentence made of one or more clauses.

    Args:
        clauses: The clauses of the sentence; a fresh list is created when omitted.
    """

    def __init__(self, clauses=None):
        # Never share one default list between instances.
        self.clauses = clauses if clauses is not None else []

    def do_parsing(self):
        """Parse every clause, then order the clauses by their ``index``."""
        for clause in self.clauses:
            clause.do_parsing()
        self.clauses = sorted(self.clauses, key=lambda x: x.index)

    def pronom_relation(self):
        """Collect the pronoun resolutions of all clauses, in clause order.

        Each clause receives the list of clauses that precede it.
        """
        resolutions = []
        prev_clauses = []
        for clause in self.clauses:
            resolutions.extend(clause.pronom_relation(prev_clauses))
            prev_clauses.append(clause)
        return resolutions


class PronomResolution:
    """Result of resolving one pronoun.

    ``pronom`` is the pronoun description dict; ``relates_to`` and
    ``classification`` start as None and are filled in by the resolver.
    """

    def __init__(self, pronom):
        self.pronom = pronom
        self.relates_to = None
        self.classification = None

    def get_person(self):
        """Return the pronoun's ``person`` entry."""
        return self.pronom['person']

    def get_numerus(self):
        """Return "plural" when the pronoun's ``plural`` entry is truthy, else "singular"."""
        if self.pronom['plural']:
            return "plural"
        return "singular"

    def get_classification(self):
        """Return the stored classification (may be None)."""
        return self.classification

    def __str__(self):
        """Return "<person>-<numerus>-<type>-<classification> (<token text>)"."""
        numerus = "plural" if self.pronom['plural'] else "singular"
        head = "-".join([str(self.pronom['person']), numerus, self.pronom['type'], str(self.classification)])
        return head + " (" + self.pronom['token'].text + ")"


## Parsing

In [4]:
def parse_children(root, depth=0):
    """Recursively walk the children of ``root`` and group them into phrases and clauses.

    Args:
        root: Token whose ``children`` are walked; it must expose ``dep_`` and
            ``children`` (presumably a spaCy token -- confirm against the caller).
        depth: Recursion depth; 0 marks the top-level call, which always
            opens a "main" clause.

    Returns:
        A tuple ``(tokens, phrases, clauses)``:
        ``tokens`` are loose child tokens that were not put into a phrase,
        ``phrases`` are the phrases found at this level (empty whenever a
        clause was opened here, because they are handed to that clause), and
        ``clauses`` are all clauses opened at this level or below.
    """
    tokens = []
    phrases = []
    clauses = []
    phrase = None
    clause = None

    # The top-level call always opens a "main" clause.
    if depth == 0:
        clause = Clause(type = "main", phrases=[])
        clauses.append(clause)

    # A dependency label containing "cl" (but not exactly "acl") opens a clause
    # typed by that label. If this happens at depth 0, `clause` is replaced and
    # the "main" clause stays in `clauses` without phrases.
    if "cl" in root.dep_ and root.dep_ != "acl":
        clause = Clause(type = root.dep_, phrases=[])
        clauses.append(clause)
    
    # Iterate over children
    for child in root.children:

        # A child whose label contains "subj" or "obj" (substring test) starts a new phrase.
        if "subj" in child.dep_ or "obj" in child.dep_:
            # Phrase type: "subj" takes precedence over "obj".
            type = "subj" if "subj" in child.dep_ else "obj"
            dep = child.dep_
            # Create the phrase that will hold the child and everything nested below it.
            phrase = Phrase(type, dep, [], [])
            # NOTE(review): this drops the loose tokens collected from earlier
            # siblings -- confirm that this is intended.
            tokens = []
            # Register the phrase at this level.
            phrases.append(phrase)

        # The child goes into the open phrase if there is one, else into the loose tokens.
        if(phrase != None):
            phrase.tokens.append(child)
        else:
            tokens.append(child)

        # Recurse into the child's own children.
        if len(list(child.children)) > 0:
            # Parse the child's subtree.
            to, ph, cl = parse_children(child, depth+1)
            # Attach the subtree's tokens and phrases to the open phrase, or keep them at this level.
            if phrase != None:
                phrase.tokens.extend(to)
                phrase.sub_phrases.extend(ph)
            else:
                tokens.extend(to)
                phrases.extend(ph)

            # Clauses are always passed upwards.
            clauses.extend(cl)

        # A phrase only spans the child that opened it (plus its subtree): close it again.
        if "subj" in child.dep_ or "obj" in child.dep_:
            tokens = []
            phrase = None

    # Hand the phrases of this level to the clause opened here and return none of them upwards.
    if clause != None:
        clause.append_phrases(phrases)
        phrases=[]
        

    return (tokens, phrases, clauses)

In [5]:
def classify_text(text):
    """Run the pipeline on ``text`` and return all pronoun resolutions.

    Every token with the dependency label "ROOT" is treated as the head of
    one sentence: its subtree is turned into clauses, parsed, and the
    resulting pronoun resolutions are collected in document order.
    """
    parsed = nlp(text)
    resolutions = []

    roots = (candidate for candidate in parsed if candidate.dep_ == "ROOT")
    for root in roots:
        # Only the clauses are needed; loose tokens and phrases are ignored.
        _, _, found_clauses = parse_children(root)
        compound = CompoundSentence(clauses=found_clauses)
        compound.do_parsing()
        resolutions += compound.pronom_relation()

    return resolutions

def format_classifications(classifications):
    """Map pronoun resolutions to label names and return them as a string.

    Only third-person resolutions produce a label:
    plural + "GENERIC" -> "GENERIC-THEY", plural + "SINGULAR" ->
    "NON-BINARY-THEY", singular + "GENERIC" -> "GENERIC-HE".

    Args:
        classifications: Objects offering ``get_person()``, ``get_numerus()``
            and ``get_classification()``.

    Returns:
        ``str()`` of the list of distinct labels. The labels are sorted so
        that the result is the same on every run (set iteration order is not).
    """
    label_by_key = {
        ("plural", "GENERIC"): "GENERIC-THEY",
        ("plural", "SINGULAR"): "NON-BINARY-THEY",
        ("singular", "GENERIC"): "GENERIC-HE",
    }

    labels = set()
    for classification in classifications:
        if classification.get_person() != "3":
            continue
        label = label_by_key.get((classification.get_numerus(), classification.get_classification()))
        if label is not None:
            labels.add(label)
    return str(sorted(labels))

def classification_to_str(classification):
    """Return a list with ``str()`` applied to every item of ``classification``."""
    return [str(entry) for entry in classification]

# Run on Data

In [None]:
# Directory that is listed for the corpus text files.
# NOTE(review): absolute path -- adjust to the local data location before running.
corpus_text_path = "/text/"
# Directory that receives one "<month>.feather" file per processed month.
classified_text_path = "/classified/"

In [8]:
# Get the list of months that occur in the corpus file names.
# Raw strings keep "\d" a regex escape instead of an invalid string escape.
month_pattern = re.compile(r'.*(\d\d[-_]\d\d)-.*', re.IGNORECASE)
# File names that also carry a day ("dd-dd-dd-"): take the leading pair of
# numbers, which the greedy first pattern would otherwise skip.
month_day_pattern = re.compile(r'.*(\d\d[-_]\d\d)[-_]\d\d-.*', re.IGNORECASE)

months = []
for txtfile in os.listdir(corpus_text_path):
    month = month_pattern.search(txtfile)
    month_alt = month_day_pattern.search(txtfile)
    if month is not None:
        month = month.group(1)
        if month_alt is not None:
            month = month_alt.group(1)
        months.append(month)
    else:
        # Report file names without a recognisable month.
        print(txtfile)

# Number of files with a month, counted before de-duplication.
total_docs = len(months)
months = sorted(set(months))

## Process Months

In [None]:
def _month_from_filename(filename):
    """Return the month key contained in ``filename``, or None when there is none."""
    month = re.search(r'.*(\d\d[-_]\d\d)-.*', filename, re.IGNORECASE)
    if month is None:
        return None
    # File names that also carry a day: prefer the leading pair of numbers.
    month_alt = re.search(r'.*(\d\d[-_]\d\d)[-_]\d\d-.*', filename, re.IGNORECASE)
    if month_alt is not None:
        return month_alt.group(1)
    return month.group(1)


def process_month(month):
    """Classify all corpus texts of one month and store them as a feather file.

    Every file in ``corpus_text_path`` whose name carries ``month`` is read
    (tab separated, first column only, ISO-8859-1). A line is expected to
    look like "@@<digits> <text>"; lines without such an id are dropped.
    The result, with the columns ``text``, ``id``, ``classification``,
    ``labels`` and one boolean column per label, is written to
    ``<classified_text_path>/<month>.feather``.

    Args:
        month: Month key as found in the file names.

    Returns:
        None. When no file matches, a message is printed and nothing is written.
    """
    print("started processing: ", month)

    frames = []
    for txtfile in os.listdir(corpus_text_path):
        txtmonth = _month_from_filename(txtfile)
        if txtmonth is None or txtmonth != month:
            continue

        texts = pd.read_csv(os.path.join(corpus_text_path,txtfile), delimiter="\t", encoding = "ISO-8859-1", usecols=[0], names=['text'], header=None)
        texts['id'] = texts['text'].str.extract(r'\@\@(\d*)')
        texts['text'] = texts['text'].str.extract(r'\@\@\d* (.*)')
        # NOTE(review): ".*" is greedy, so everything between the first "<"
        # and the last ">" of a line is removed -- confirm this is intended.
        texts['text'] = texts['text'].str.replace(r'\<.*\>', "", regex=True)
        texts = texts[['text', 'id']]

        missing_id = texts['id'].isna()
        missing_count = int(missing_id.sum())
        if missing_count > 1:
            print("Fehler: ", txtfile, " | Na/Null:", missing_count)

        texts = texts.loc[~missing_id]

        if len(texts) < 10:
            print("Warnung: ", txtfile, " | Zeilen:", len(texts))

        frames.append(texts)

    if not frames:
        # Nothing to classify; the original code failed here on a None frame.
        print("no files found for month: ", month)
        return

    # One concat instead of growing the frame file by file. The reset gives an
    # independent frame with a default index, which to_feather expects.
    month_df = pd.concat(frames, ignore_index=True)
    month_df = month_df.loc[month_df['text'].notna()].reset_index(drop=True)

    month_df['classification'] = month_df['text'].swifter.apply(lambda x:classify_text(x))
    month_df['labels'] = month_df['classification'].swifter.apply(lambda x:format_classifications(x))

    # 'labels' is the string form of a list, so a plain substring test is enough.
    for label in ("GENERIC-THEY", "GENERIC-HE", "NON-BINARY-THEY"):
        month_df[label] = month_df['labels'].str.contains(label, regex=False)

    month_df['classification'] = month_df['classification'].swifter.apply(lambda x:classification_to_str(x))
    month_df.to_feather(os.path.join(classified_text_path, month+".feather"))

    print("finished processing: ", month)

In [None]:
# Process the months one after another; process_month writes one feather file per month.
for month in months:
    process_month(month)