# Assignment 1: Linguistic Essentials and Collocations

## Installation
Required installations:
* Python 3
* Pip
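* nltk library (with the `brown` and `wordnet` corpora and the tokenizer and POS-tagger models installed via `nltk.download`)
* tqdm library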

## Imports

In [158]:
import nltk
from nltk.collocations import *
from nltk.corpus import brown, wordnet
from nltk.util import bigrams

import math
from tqdm import tqdm

## Task 1.1 - Finding collocations
* Create a tool to find collocations using (1) frequency plus part-of-speech tagging (search for adjectives and nouns) and (2) hypothesis testing (see slides for Lecture 2). Use the Brown corpus (already in NLTK).
* Consider sequences of 2 words (bigrams).
* Generate files containing the collocations.

In [159]:
class Collocation_Finder:
    """Find two-word collocations in a corpus.

    Two candidate lists are produced:

    * frequency + part-of-speech: bigrams that occur at least ``frequency``
      times, ranked by PMI, whose two words are both tagged as a noun or an
      adjective (see ``get_frequency_pos_tagged``);
    * hypothesis testing: the subset of those bigrams that also passes a
      t-test against the independence hypothesis (see
      ``get_hypothesis_tested``).
    """

    def __init__(self, frequency=5, corpus=None):
        """Build the frequency tables and the POS-filtered bigram list.

        frequency -- minimum number of occurrences a bigram needs to be kept.
        corpus    -- corpus reader exposing ``words()``; defaults to the Brown
                     corpus. Resolved here rather than in the signature so the
                     default is looked up when the object is built.
        """
        if corpus is None:
            corpus = brown
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        self.corpus = corpus.words()
        self.corpus_length = len(self.corpus)

        # Case-folded unigram counts, used by the t-test.
        self.word_counts = {}
        for word in self.corpus:
            key = word.lower()
            self.word_counts[key] = self.word_counts.get(key, 0) + 1

        self.finder = BigramCollocationFinder.from_words(self.corpus)
        self.finder.apply_freq_filter(frequency)
        # Up to 10000 of the surviving bigrams, best PMI first.
        common_bigrams = self.finder.nbest(bigram_measures.pmi, 10000)

        # Each bigram is tagged on its own, i.e. without sentence context.
        tagged_bigrams = [nltk.pos_tag(bigram) for bigram in common_bigrams]

        # Matched as substrings of the tag, so "NN" also accepts e.g. "NNS"
        # and "NNP", and "JJ" also accepts "JJR" and "JJS".
        self.accepted_pos = ["NN", "JJ"]
        self.frequency_pos_bigrams = [
            bigram
            for bigram in tagged_bigrams
            if self._check_pos_in_ngram(bigram, self.accepted_pos)
        ]

    def _check_pos_in_ngram(self, ngram, accepted):
        """(string, string), ..., list(string) -> bool

        Return True when every (word, tag) pair in ``ngram`` has a tag that
        contains at least one of the ``accepted`` tag fragments.
        """
        return all(
            any(word_class in pos for word_class in accepted)
            for _, pos in ngram
        )

    def get_frequency_pos_tagged(self):
        """Return the POS-filtered bigrams as lists of (word, tag) pairs."""
        return self.frequency_pos_bigrams

    def _t_test(self, bigram, confidence):
        """(string, string), float -> bool

        Return True when the t statistic of ``bigram`` exceeds ``confidence``.

        The null hypothesis is that the two words occur independently, so the
        expected bigram probability is P(w1) * P(w2). The statistic is

            t = (sample_mean - expected) / sqrt(s^2 / N)

        with the Bernoulli variance s^2 = p * (1 - p).
        """
        bigram_count = self.finder.ngram_fd[bigram]
        if bigram_count == 0:
            # Unseen (or filtered-out) bigram: no evidence of a collocation,
            # and the standard error below would be zero.
            return False

        # A corpus of N tokens contains N - 1 bigrams.
        sample_mean = bigram_count / (self.corpus_length - 1)

        word1, word2 = bigram

        # word_counts is keyed by lower-cased words, so fold case on lookup.
        # NOTE: the bigram counts are case-sensitive while the unigram counts
        # are not; this can only raise the expected value, i.e. it makes the
        # test slightly more conservative.
        probability_word1 = self.word_counts.get(word1.lower(), 0) / self.corpus_length
        probability_word2 = self.word_counts.get(word2.lower(), 0) / self.corpus_length
        mean_of_distribution = probability_word1 * probability_word2

        # Standard error of the mean: sqrt(s^2 / N), not sqrt(s^2) / N.
        standard_error = math.sqrt(sample_mean * (1 - sample_mean) / self.corpus_length)
        if standard_error == 0:
            return False

        t = (sample_mean - mean_of_distribution) / standard_error

        return t > confidence

    def get_hypothesis_tested(self, confidence=2.576):
        """Return the POS-filtered bigrams that also pass the t-test.

        confidence -- critical t value; the default 2.576 is the usual
                      large-sample critical value for a 99.5% one-sided level.
        """
        return [
            bigram
            for bigram in self.frequency_pos_bigrams
            if self._t_test((bigram[0][0], bigram[1][0]), confidence)
        ]
        
        

In [160]:
cf = Collocation_Finder(frequency=10)

# Write each collocation list to its own file, one lower-cased bigram per line.
# Every bigram is a pair of (word, tag) tuples; only the words are written.
# The `with` blocks close the files even if a write raises.
with open('1.1.frequency_pos_tagged.txt', 'w') as f:
    for bigram in tqdm(cf.get_frequency_pos_tagged()):
        f.write(bigram[0][0].lower() + " " + bigram[1][0].lower() + '\n')

with open('1.1.hypothesis_testing.txt', 'w') as g:
    for bigram in tqdm(cf.get_hypothesis_tested()):
        g.write(bigram[0][0].lower() + " " + bigram[1][0].lower() + '\n')

100%|████████████████████████████████████| 335/335 [00:00<00:00, 1248970.52it/s]
100%|████████████████████████████████████| 335/335 [00:00<00:00, 1643382.27it/s]


## Task 1.2 - Correction Tool
* Create a simple tool that corrects non-natural expressions. In detail, your tool should receive as input two or three words. If there is a collocation in your files such that the i-th word is a synonym of the i-th word given as input then the algorithm will output the first such collocation in your files (consider that two words that are the same are synonyms). For example, if it receives “powerful tea” and “strong tea” is in your list then the algorithm should print “strong tea”.

* Suggestion: Use WordNet to detect synonyms.

In [161]:
class Correction_Tool:
    """Replace unnatural word pairs with a known collocation.

    The known collocations are read from a text file holding one
    whitespace-separated collocation per line. A pair of words is corrected to
    the first collocation in that file whose i-th word is a WordNet synonym of
    (or identical to) the i-th input word.
    """

    # defaults to use the collocations from the hypothesis testing in 1.1
    def __init__(self, path="1.1.hypothesis_testing.txt"):
        """Load the collocation list from ``path``, keeping the file order."""
        with open(path, "r") as file:
            lines = file.read().split('\n')
        # Blank lines (e.g. the one after the trailing newline) and lines with
        # fewer than two words cannot form a bigram and are skipped.
        tokenized_lines = (line.split() for line in lines)
        self.collocations = [words for words in tokenized_lines if len(words) >= 2]
        self.collocations_first_word_set = set(bigram[0] for bigram in self.collocations)

    def find_synonyms(self, word):
        """string -> list[string]

        Return the lemma names of every WordNet synset of ``word``, without
        duplicates and in the order WordNet yields them.
        """
        synonym_names = []
        for synset in wordnet.synsets(word):
            synonym_names.extend(synset.lemma_names())
        # dict.fromkeys removes duplicates while preserving order.
        return list(dict.fromkeys(synonym_names))

    def _candidate_words(self, word):
        """Return the lower-cased set of ``word`` itself plus its synonyms."""
        candidates = {name.lower() for name in self.find_synonyms(word)}
        # A word always counts as a synonym of itself, even if WordNet does
        # not know it.
        candidates.add(word.lower())
        return candidates

    def correct_bigram(self, bigram):
        """(string, string) -> (string, string)

        Return the input pair unchanged when it already is a known collocation
        or when no replacement exists; otherwise return the first collocation
        in file order whose words are synonyms of the input words. A message
        is printed for the "already known" and "corrected" cases.
        """
        first_word, second_word = bigram
        first_key, second_key = first_word.lower(), second_word.lower()

        # An exact match wins over any synonym-based rewrite and needs no
        # WordNet lookup.
        for collocation in self.collocations:
            if collocation[0].lower() == first_key and collocation[1].lower() == second_key:
                print(first_word + " " + second_word + " is already a known collocation")
                return (first_word, second_word)

        # Looked up once per call instead of once per candidate collocation.
        first_candidates = self._candidate_words(first_word)
        second_candidates = self._candidate_words(second_word)

        # Walking the list (not a set) makes the result deterministic and
        # returns the first matching collocation of the file.
        for collocation in self.collocations:
            if collocation[0].lower() in first_candidates and collocation[1].lower() in second_candidates:
                print("Corrected:", first_word + " " + second_word, "to", collocation[0] + " " + collocation[1])
                return (collocation[0], collocation[1])
        return (first_word, second_word)

    def correct_sentence(self, sentence):
        """string -> string

        Correct every pair of adjacent tokens from left to right, print the
        resulting sentence and return it.
        """
        tokenized_sentence = nltk.word_tokenize(sentence)
        for i in range(len(tokenized_sentence) - 1):
            # Both words of the pair are replaced; an uncorrected pair comes
            # back unchanged, so the assignment is then a no-op.
            corrected_first, corrected_second = self.correct_bigram(
                (tokenized_sentence[i], tokenized_sentence[i + 1])
            )
            tokenized_sentence[i] = corrected_first
            tokenized_sentence[i + 1] = corrected_second

        corrected_sentence = " ".join(tokenized_sentence)
        print(corrected_sentence)
        return corrected_sentence
        
            

In [162]:
c = Correction_Tool()
# Run the correction tool over a few sample phrases, in this order.
for sentence in (
    "large man thing",
    "expecting following year",
    "wrong young people",
    "earth war in history with inner revenue",
):
    c.correct_sentence(sentence)

Corrected: large man to big man
big man thing
Corrected: following year to next year
expecting next year
young people is already a known collocation
wrong young people
Corrected: earth war to world war
Corrected: inner revenue to internal revenue
world war in history with internal revenue
