In [1]:
import csv
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re


def create_new_list_file(tl):
    """
    Entry point for one language: filter its word list and save the result.

    Sets the module-level ``language`` global (read by the helper functions),
    runs get_unmatched_words() and writes each remaining word as a one-column
    row to 'FilteredLists/new_{language}_WordList.tsv'.

    :param tl: language name, e.g. 'italian'; used to build the file names
    """
    global language
    language = tl
    unmatched_words = get_unmatched_words()
    # newline='' hands line-ending control to the csv module (otherwise rows end
    # in a doubled carriage return on Windows); explicit UTF-8 keeps non-ASCII
    # words intact regardless of the platform's default encoding.
    with open(f"FilteredLists/new_{language}_WordList.tsv", 'w',
              newline='', encoding='utf-8') as new_file:
        tsv_writer = csv.writer(new_file, delimiter='\t')
        tsv_writer.writerows([word] for word in unmatched_words)


def get_word_list():
    """
    Reads './FreqWordLists/{language}_WordList.tsv' and returns its lines
    (with the trailing newline removed) as a list, minus the header row.

    The language is taken from the module-level ``language`` global.
    """
    # Decode explicitly instead of relying on the platform default encoding.
    # Assumes the word list files are stored as UTF-8 -- TODO confirm.
    with open(f'./FreqWordLists/{language}_WordList.tsv', 'r', encoding='utf-8') as wl_file:
        word_list = [row.strip('\n') for row in wl_file]
    return word_list[1:]  # drop the header row


def get_target_item_list():
    """
    Reads a list of target items in a .tsv file named with the format:
        {language}_TargetItems.tsv      (e.g. italian_TargetItems.tsv)
    located in './TargetItemLists/'.

    The first row is treated as a header and skipped. The first column of
    every remaining non-empty row is collected, and the collected items are
    returned after being passed through clean_target_items().

    The language is taken from the module-level ``language`` global.
    """
    # newline='' is what the csv module expects for files it parses itself.
    # Assumes the target item files are stored as UTF-8 -- TODO confirm.
    with open(f'./TargetItemLists/{language}_TargetItems.tsv', 'r',
              newline='', encoding='utf-8') as ti_file:
        rd = csv.reader(ti_file, delimiter="\t", quotechar='"')
        next(rd, None)  # skip the header row (no error if the file is empty)
        # csv.reader yields [] for a blank line; indexing it would raise IndexError.
        target_item_list = [row[0] for row in rd if row]
    return clean_target_items(target_item_list)


In [2]:
def get_unmatched_words():
    """
    Returns the words from the current language's word list that match neither
    a stopword nor a target item, and prints a summary of what was removed.

    Takes no arguments: the language is read from the module-level ``language``
    global. For Korean and Japanese the stopword check is skipped and only
    target items are filtered out.
    The proper noun filtering doesn't work well, so it's being excluded.
    """
    word_list = get_word_list()
    # A set gives constant-time membership tests; with a list every word in the
    # word list triggered a linear scan over all target items.
    target_items = set(get_target_item_list())
    skip_stopwords = language == 'korean' or language == 'japanese'

    unmatched_words = []
    removed_stopwords = []
    removed_target_items = []

    for word in word_list:
        # The stopword check takes precedence over the target-item check.
        if not skip_stopwords and is_stopword(word):
            removed_stopwords.append(word)
        elif is_target_item(word, target_items):
            removed_target_items.append(word)
        else:
            unmatched_words.append(word)

    all_removed_words = removed_target_items + removed_stopwords
    print(f"-------------\n{language.upper()}\nFound {len(unmatched_words)} unmatched words in {language.title()} from the list of {len(word_list)} words")
    print(f"""
Removed {len(all_removed_words)} words from the word list
>> {len(removed_stopwords)} were stopwords:
    {removed_stopwords}""")
    print(f"""
>> {len(removed_target_items)} already exist in CAS:
    {removed_target_items}
    """)
    return unmatched_words



def is_proper_noun(word):
    """
    Tells whether NLTK's part-of-speech tagger labels the word as 'NNP'.
    The word is tagged on its own, as a one-element token list.
    """
    _, pos_label = nltk.pos_tag([word])[0]
    return pos_label == 'NNP'


# Stopwords already loaded from the nltk corpus, keyed by language name.
_STOPWORD_CACHE = {}


def is_stopword(word):
    """
    Returns True if the word checked matches any of the stopwords from the
    nltk corpus for the current language (module-level ``language`` global).

    The stopword list is loaded once per language and kept as a frozenset:
    this function is called for every word and every token, and asking the
    corpus for the list on each call repeats the same work every time.
    The comparison is exact (case-sensitive), as before.
    """
    stop_words = _STOPWORD_CACHE.get(language)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(language))
        _STOPWORD_CACHE[language] = stop_words
    return word in stop_words


def is_target_item(word, target_item_list):
    """
    Membership test: True when the word is one of the entries in
    target_item_list, False otherwise.
    """
    return word in target_item_list


The following functions clean up the target items to improve matching with the word list. Stopword removal and tokenization don't work for Korean and Japanese, so those two languages are treated separately.

In [3]:
def clean_target_items(ti_list):
    """
    Takes a list of target items and returns the list with the strings cleaned.
    Items that end up empty after cleaning are left out.

    Korean / Japanese: every run of non-word characters is replaced by a single
    space and surrounding whitespace is trimmed.
    Other languages: the text from the first "; " onward is dropped, the text
    before the apostrophe is dropped (French / Italian only), non-word
    characters are replaced by spaces, and the remaining tokens that are not
    stopwords are joined with single spaces.

    The language is taken from the module-level ``language`` global.
    """
    clean_list = []
    if language == 'korean' or language == 'japanese':
        for item in ti_list:
            # Raw string: '\W' in a plain literal is an invalid escape sequence.
            # strip(): the substitution leaves a space where leading/trailing
            # punctuation used to be, which would stop "word." from matching
            # "word" and let punctuation-only items through as " ".
            item_no_punctuation = re.sub(r'\W+', ' ', item).strip()
            if item_no_punctuation != '':
                clean_list.append(item_no_punctuation)
    else:
        for item in ti_list:
            item_no_semicolon = get_str_before_semicolon(item)
            item_no_apostrophe = get_str_after_apostrophe(item_no_semicolon)
            item_no_punctuation = re.sub(r'\W+', ' ', item_no_apostrophe)
            item_no_stopwords_list = [
                word
                for word in word_tokenize(item_no_punctuation, language)
                if not is_stopword(word)
            ]
            # Joining the tokens also normalises any leftover whitespace.
            clean_item = ' '.join(item_no_stopwords_list)
            if clean_item != '':
                clean_list.append(clean_item)
    return clean_list


def get_str_before_semicolon(item):
    """
    Returns the part of the item that precedes the first "; " (semicolon
    followed by a space). An item without "; " is returned unchanged.
    """
    leading_part, _, _ = item.partition("; ")
    return leading_part


def get_str_after_apostrophe(item):
    """
    Checks if the language is FR or IT and returns the string after the first
    apostrophe; this is to exclude words like " l' " or " d' " in French and
    Italian.

    Everything after the first apostrophe is kept, so an item containing a
    second apostrophe is no longer cut off there. If nothing follows the
    apostrophe, the item is returned unchanged instead of an empty string.
    For any other language, or an item without an apostrophe, the item is
    returned unchanged.

    The language is taken from the module-level ``language`` global.
    """
    if language == 'italian' or language == 'french':
        _, apostrophe, remainder = item.partition("'")
        if apostrophe and remainder:
            return remainder
    return item


We iterate over the languages list to run the task for each prepared document in all languages.
The files should be in the following formats:
`'./FreqWordLists/{language}_WordList.tsv'`   and    `'./TargetItemLists/{language}_TargetItems.tsv'`

In [4]:
# Run the filtering task once for every language that has prepared input files.
# Each language needs both of these files:
#   './FreqWordLists/{language}_WordList.tsv'
#   './TargetItemLists/{language}_TargetItems.tsv'
languages = [
    'italian', 'english', 'spanish', 'french',
    'german', 'russian', 'korean', 'japanese',
]

for lang in languages:
    create_new_list_file(lang)

-------------
ITALIAN
Found 6035 unmatched words in Italian from the list of 7591 words

Removed 1556 words from the word list
>> 241 were stopwords:
    ['non', 'di', 'che', 'e', 'la', 'il', 'un', 'a', 'per', 'in', 'una', 'mi', 'ma', 'lo', 'le', 'si', 'ti', 'i', 'con', 'se', 'io', 'come', 'da', 'ci', 'questo', 'tu', 'più', 'mio', 'lei', 'gli', 'tutto', 'mia', 'ne', 'questa', 'chi', 'o', 'anche', 'quello', 'lui', 'suo', 'dove', 'uno', 'tuo', 'noi', 'su', 'vi', 'loro', 'voi', 'quanto', 'cui', 'tra', 'nostro', 'nostra', 'vostro', 'quale', 'quelli', 'contro', 'quella', 'sua', 'del', 'negli', 'quante', 'nello', 'coi', 'è', 'sono', 'ho', 'ha', 'hai', 'sei', 'al', 'era', 'della', 'alla', 'abbiamo', 'tutti', 'dei', 'sta', 'perché', 'sia', 'nel', 'tua', 'siamo', 'hanno', 'stai', 'delle', 'sto', 'ad', 'nella', 'sul', 'ed', 'fai', 'dai', 'dal', 'sarebbe', 'dalla', 'fosse', 'avete', 'sulla', 'miei', 'alle', 'facendo', 'abbia', 'ero', 'aveva', 'questi', 'siete', 'ai', 'avevo', 'degli', 'queste', '

-------------
ENGLISH
Found 3007 unmatched words in English from the list of 5000 words

Removed 1993 words from the word list
>> 163 were stopwords:
    ['the', 'be', 'and', 'of', 'a', 'in', 'to', 'have', 'to', 'it', 'that', 'for', 'you', 'he', 'with', 'on', 'do', 's', 'they', 'this', 'but', 'at', 'we', 'his', 'from', 'that', 'not', 'by', 'or', 'she', 'as', 'what', 'their', 'will', 'who', 'can', 'if', 'all', 'her', 'about', 'my', 'as', 'there', 'up', 'so', 'which', 'when', 'some', 'them', 'me', 'out', 'into', 'just', 'him', 'your', 'now', 'than', 'other', 'then', 'how', 'its', 'our', 'more', 'these', 'no', 'because', 'more', 'here', 'only', 'those', 'very', 'her', 'any', 'through', 'there', 'down', 'after', 'over', 'should', 'in', 'as', 'too', 'when', 'between', 'most', 'own', 'out', 'on', 'while', 'why', 'same', 'where', 'against', 'about', 'over', 'such', 'again', 'most', 'few', 'where', 'each', 'so', 'during', 'off', 'before', 'all', 'under', 'both', 'after', 'no', 'until', 'once',

-------------
SPANISH
Found 3512 unmatched words in Spanish from the list of 5000 words

Removed 1488 words from the word list
>> 75 were stopwords:
    ['de', 'que', 'y', 'en', 'un', 'a', 'él', 'lo', 'no', 'su', 'con', 'por', 'para', 'mí', 'lo', 'como', 'estar', 'me', 'más', 'este', 'le', 'se', 'yo', 'o', 'pero', 'ese', 'otro', 'mi', 'ya', 'porque', 'mucho', 'muy', 'sí', 'ti', 'te', 'también', 'qué', 'nos', 'tu', 'sin', 'eso', 'cuando', 'hasta', 'la', 'sobre', 'entre', 'desde', 'ella', 'poco', 'nuestro', 'ni', 'donde', 'nada', 'tanto', 'algo', 'esto', 'quien', 'durante', 'cual', 'nosotros', 'contra', 'tú', 'antes', 'ante', 'estado', 'fuera', 'sentido', 'uno', 'todo', 'mío', 'os', 'suyo', 'tuyo', 'vuestro', 'vosotros']

>> 1413 already exist in CAS:
    ['ser', 'tener', 'hacer', 'decir', 'poder', 'ir', 'si', 'dar', 'saber', 'año', 'mismo', 'querer', 'vez', 'dos', 'día', 'grande', 'así', 'pasar', 'deber', 'entonces', 'llegar', 'bien', 'tiempo', 'ahora', 'primero', 'creer', 'vida', 'deja

-------------
FRENCH
Found 3589 unmatched words in French from the list of 5000 words

Removed 1411 words from the word list
>> 43 were stopwords:
    ['le', 'de', 'un', 'à', 'et', 'en', 'que', 'pour', 'dans', 'ce', 'il', 'qui', 'ne', 'sur', 'se', 'pas', 'par', 'je', 'avec', 'son', 'on', 'mais', 'nous', 'ou', 'leur', 'y', 'elle', 'même', 'vous', 'mon', 'me', 'lui', 'notre', 'tu', 'moi', 'eux', 'soit', 'te', 'votre', 'ton', 'toi', 'été', 'est']

>> 1368 already exist in CAS:
    ['être', 'avoir', 'plus', 'pouvoir', 'tout', 'faire', 'mettre', 'autre', 'si', 'dire', 'devoir', 'avant', 'deux', 'prendre', 'aussi', 'donner', 'bien', 'où', 'fois', 'encore', 'nouveau', 'aller', 'entre', 'premier', 'déjà', 'grand', 'moins', 'aucun', 'temps', 'très', 'savoir', 'voir', 'sans', 'raison', 'dont', 'non', 'monde', 'jour', 'demander', 'alors', 'après', 'trouver', 'personne', 'rendre', 'dernier', 'venir', 'pendant', 'passer', 'peu', 'bon', 'depuis', 'point', 'heure', 'rester', 'seul', 'année', 'toujour

-------------
GERMAN
Found 3430 unmatched words in German from the list of 5009 words

Removed 1579 words from the word list
>> 109 were stopwords:
    ['der', 'und', 'in', 'sein', 'ein', 'haben', 'sie', 'werden', 'von', 'ich', 'nicht', 'es', 'mit', 'sich', 'er', 'auf', 'für', 'auch', 'an', 'dass', 'zu', 'als', 'können', 'dies', 'wie', 'wir', 'ihr', 'so', 'bei', 'sein', 'aber', 'man', 'noch', 'nach', 'oder', 'aus', 'was', 'nur', 'dann', 'wenn', 'um', 'kein', 'über', 'da', 'vor', 'mein', 'du', 'durch', 'viel', 'wollen', 'machen', 'andere', 'mir', 'mich', 'hier', 'also', 'jetzt', 'doch', 'wieder', 'uns', 'bis', 'einer', 'sehr', 'unser', 'jede', 'weil', 'unter', 'ihm', 'ihn', 'denn', 'etwas', 'selbst', 'gegen', 'zwischen', 'wo', 'nichts', 'nun', 'sondern', 'damit', 'ohne', 'einmal', 'ihnen', 'ob', 'dort', 'dazu', 'während', 'einige', 'zwar', 'meinen', 'dich', 'weiter', 'dein', 'dir', 'hinter', 'anders', 'dessen', 'sonst', 'damit', 'jene', 'manche', 'euch', 'am', 'derselbe', 'indem', 'hin'

-------------
RUSSIAN
Found 3494 unmatched words in Russian from the list of 5000 words

Removed 1506 words from the word list
>> 111 were stopwords:
    ['и', 'в', 'не', 'на', 'я', 'быть', 'с', 'он', 'что', 'а', 'этот', 'по', 'к', 'но', 'они', 'мы', 'она', 'как', 'то', 'из', 'у', 'вы', 'за', 'для', 'от', 'так', 'ты', 'о', 'что', 'же', 'такой', 'тот', 'или', 'если', 'только', 'его', 'один', 'бы', 'себя', 'другой', 'уже', 'когда', 'до', 'мой', 'чтобы', 'вот', 'кто', 'при', 'можно', 'сам', 'два', 'даже', 'раз', 'их', 'какой', 'со', 'там', 'после', 'ли', 'где', 'под', 'нет', 'без', 'ну', 'более', 'чем', 'надо', 'здесь', 'потом', 'да', 'сейчас', 'через', 'много', 'теперь', 'ни', 'тогда', 'тут', 'тоже', 'всегда', 'между', 'конечно', 'три', 'перед', 'над', 'хорошо', 'больше', 'почти', 'ведь', 'никогда', 'совсем', 'про', 'вдруг', 'нельзя', 'опять', 'наконец', 'куда', 'иногда', 'зачем', 'чуть', 'хоть', 'впрочем', 'лучше', 'всего', 'разве', 'будто', 'может', 'есть', 'чего', 'том', 'ничего', 'се

-------------
KOREAN
Found 3763 unmatched words in Korean from the list of 5000 words

Removed 1237 words from the word list
>> 0 were stopwords:
    []

>> 1237 already exist in CAS:
    ['고', '의', '다', '것', '하다', '그', '있다', '있다', '되다', '가다', '나', '이', '없다', '수', '보다', '말', '사람', '하다', '오다', '구', '우리', '다', '때', '만', '그', '같다', '주다', '안', '아', '보다', '그것', '또', '일', '년', '알다', '내', '지다', '다', '집', '나오다', '살다', '좋다', '받다', '먹다', '말하다', '뭐', '이', '잘', '모르다', '더', '크다', '좀', '뭐', '주다', '소리', '일', '이것', '많다', '들다', '저', '생각하다', '죽다', '여기', '생각', '문제', '듣다', '자기', '고', '얘기', '어떤', '저', '많이', '왜', '돈', '여자', '지금', '어디', '만들다', '들어가다', '개', '보다', '그냥', '다시', '그리고', '자', '오다', '원', '거기', '쓰다', '백', '전', '월', '나가다', '참', '예', '아이', '하나', '다른', '눈', '잡다', '앉다', '들다', '손', '안', '시간', '물', '정도', '만', '한국', '천', '가다', '응', '사회', '치다', '만나다', '어느', '찾다', '사다', '서다', '길', '쓰다', '몇', '네', '같이', '얼굴', '서울', '위', '일본', '마음', '하나', '오늘', '부르다', '나라', '지금', '미국', '너무', '요', '사실', '삼', '남자', '자리', '방', '시'