<a href="https://colab.research.google.com/github/MSaber7/Machine-Learning/blob/master/SR-NGram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SR | Lab Three : N-Gram

Implement an N-gram language model that can distinguish genuine sentences from artificially generated ones.

In [0]:
import numpy as np
import pandas as pd
# Additive-smoothing constant: added to every n-gram count in LanguageModel.ngram_prob.
ALPHA = 0.01

In [0]:
class LanguageModel(object):
    """
        N-gram language model with additive smoothing.

        Sentences are split on single spaces, wrapped in a start marker ('^')
        and an end marker ('$'), and mapped to integer tokens. Training counts
        every n-gram and every (n-1)-gram; scoring sums the logs of the
        smoothed conditional probabilities of a sentence's n-grams.
    """

    def __init__(self, ngram_size=2):
        """
            :param ngram_size: order of the model (2 means bigram); must be >= 2
            :raises ValueError: if ngram_size is smaller than 2
        """

        if ngram_size < 2:
            raise ValueError("ngram_size must be at least 2, got {}".format(ngram_size))

        self.ngram_size = ngram_size

        # keys are all words the model has read so far, values are their ids (tokens)
        self.dictionary = {}
        self.number_of_words = 0

        # counter maps n-grams, context_counter maps (n-1)-grams,
        # to the number of their occurrences in the train set
        self.counter = {}
        self.context_counter = {}

        self.smoothing = 'laplace'

    def fit(self, sentences):
        """
            Train the model on sentence-split text
            :param sentences: iterable of sentences (strings)
        """
        for sentence in sentences:
            self.fit_sentence(self.tokenize_sentence(sentence))

    def tokenize_sentence(self, sentence):
        """
            Convert a sentence into a list of integer tokens.

            The sentence is split on single spaces and wrapped as
            '^' w1 ... wN '$'. Words not seen before are added to
            self.dictionary with the next free id; this also happens when the
            method is called while scoring, not only while training.
            :param sentence: sentence as a string
            :return: tokenized sentence (list of ints)
        """

        # The end marker must come AFTER the last word. list.insert(-1, x)
        # would place it before the last element, so build the list directly.
        words = ['^'] + sentence.split(" ") + ['$']

        result = []

        for word in words:
            token = self.dictionary.get(word)

            # if the word is not in the dictionary, register it under a new token
            if token is None:
                token = self.number_of_words
                self.dictionary[word] = token
                self.number_of_words += 1

            result.append(token)

        return result

    def fit_sentence(self, sentence):
        """
            Update the n-gram and (n-1)-gram counts with one tokenized sentence
            :param sentence: list of tokens as returned by tokenize_sentence
        """

        n = self.ngram_size
        length = len(sentence)

        # count every n-gram in the sentence
        for i in range(length - n + 1):
            ngram = tuple(sentence[i: i + n])
            self.counter[ngram] = self.counter.get(ngram, 0) + 1

        # ... and every (n-1)-gram, which are the contexts for the n-grams.
        # The range is one step longer, so the trailing (n-1)-gram is counted too.
        for j in range(length - n + 2):
            context = tuple(sentence[j: j + n - 1])
            self.context_counter[context] = self.context_counter.get(context, 0) + 1

    def ngram_prob(self, ngram):
        """
            Smoothed probability of the last token of an n-gram given its context:
            (count(ngram) + ALPHA) / (count(context) + ALPHA * V)
            :param ngram: tuple of tokens of length ngram_size
            :return: smoothed conditional probability
            :raises ValueError: if self.smoothing is not a supported method
        """
        if self.smoothing != 'laplace':
            # previously this fell through and returned None
            raise ValueError("unsupported smoothing: {!r}".format(self.smoothing))

        # the context of an n-gram is the n-gram without its last token
        context = ngram[:-1]

        # number of unique (n-1)-grams seen in the train set
        V = len(self.context_counter)

        # occurrences of the n-gram and of its context in the train set (0 if unseen)
        ngram_count = self.counter.get(ngram, 0)
        context_count = self.context_counter.get(context, 0)

        return (ngram_count + ALPHA) / (context_count + ALPHA * V)

    def sentence_logprob(self, sentence):
        """
            Log-probability of a sentence, computed as the sum of the log
            probabilities of its n-grams
            :param sentence: sentence as a string
            :return: the summed log-probability (0 if the sentence has no n-grams)
        """

        tokens = self.tokenize_sentence(sentence)
        n = self.ngram_size

        logprob = 0
        for i in range(len(tokens) - n + 1):
            logprob += np.log(self.ngram_prob(tuple(tokens[i: i + n])))

        return logprob

    def log_prob(self, sentences):
        """
            Log-probabilities for a collection of sentences
            :param sentences: iterable of sentences (strings)
            :return: list with one log-probability per sentence, in input order
        """
        return [self.sentence_logprob(sentence) for sentence in sentences]

In [5]:
# Mount Google Drive into the Colab runtime at /content/drive (prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [6]:
# Switch the working directory to the lab folder on the mounted drive; later cells use relative file names.
%cd/content/drive/My Drive/Colab Notebooks/SR/Lab3

/content/drive/My Drive/Colab Notebooks/SR/Lab3


In [8]:
# Both files are tab-separated and are read relative to the current working directory.
df_train = pd.read_csv("train.tsv", sep='\t')
df_test = pd.read_csv("task.tsv", sep='\t')

# Sanity check: preview two training rows and report the shapes of both frames.
print(df_train.head(2))
print("Read ", df_train.shape, df_test.shape)


   id                                               text
0   0  старый запустить палаццо с высокий лепной плаф...
1   1       на угол он встретить спешить ночное извозчик
Read  (15119, 2) (7048, 3)


In [9]:
# Collect the training sentences as a plain Python list.
sentences_train = df_train["text"].tolist()

# Train a model with the default n-gram size on those sentences.
basic_lm = LanguageModel()
basic_lm.fit(sentences_train)

print("Trained")

Trained


In [0]:
# Score both candidate sentences of every test pair with the trained model.
test1 = df_test["text1"]
test2 = df_test["text2"]

logprob1 = np.array(basic_lm.log_prob(test1))
logprob2 = np.array(basic_lm.log_prob(test2))

In [0]:
# which = 1 when the second sentence scores at least as high as the first, otherwise 0.
res = pd.DataFrame({"id": df_test["id"]})
res["which"] = np.where(logprob2 >= logprob1, 1, 0)

res.to_csv("submission.csv", sep=",", index=None, columns=["id", "which"])

In [12]:
# Read the submission file back from disk and print it as a sanity check.
sub = pd.read_csv("submission.csv")
print (sub)

        id  which
0        0      1
1        1      1
2        2      1
3        3      0
4        4      0
...    ...    ...
7043  7043      0
7044  7044      0
7045  7045      1
7046  7046      0
7047  7047      0

[7048 rows x 2 columns]
