# Statistics

## Installs

In [None]:
!pip install tabulate

## Data representation

In [None]:
from numpy import loadtxt
# Column names, in file order, for the tab-separated training rows.
labels = [
    'id', 'sentence', 'start', 'end', 'target',
    'native', 'non-native', 'native_score', 'non-native_score', 'label',
]

# Every field is read as a string.
lines = loadtxt('train_full.txt', dtype='str', delimiter='\t')

# One dict per row, keyed by column name (zip stops at the shorter of the two,
# so any column beyond the names listed above is dropped).
data = [{name: value for name, value in zip(labels, row)} for row in lines]
data

In [None]:
# Keep only the rows whose target is a single whitespace-delimited word.
word_targets = [line for line in data if len(line['target'].split()) == 1]
# Notebook display: how many single-word targets there are.
len(word_targets)

## Make table

In [None]:
import numpy as np
from tabulate import tabulate

def makeTable(features, scores, frequency, file):
    """Write a feature-by-score frequency table to a text file.

    Rows are the distinct truthy values of `features` (sorted) and columns are
    the distinct values of `scores` (sorted). Each cell holds the count stored
    in `frequency` under the key `(score, feature)`; a missing pair counts as 0.

    Args:
        features: Iterable of feature values. Falsy entries (e.g. None) are
            left out of the table.
        scores: Iterable of score values.
        frequency: Mapping from `(score, feature)` to a count. It is only
            read, never modified.
        file: Path of the text file to create or overwrite.
    """
    features = sorted({feature for feature in features if feature})
    scores = sorted(set(scores))

    # tabulate consumes the first row as the header row (headers='firstrow').
    info = [scores]
    for feature in features:
        # Look the counts up in the `frequency` argument itself rather than in
        # any module-level dictionary, so the function works for every caller.
        info.append([frequency.get((score, feature), 0) for score in scores])

    table = tabulate(info, headers='firstrow', showindex=features)
    with open(file, 'w') as f:
        f.write(table)

In [None]:
def getScores(target):
    """Return the 'label' field of every row in `target`, in order."""
    return [row['label'] for row in target]

# Labels of the single-word targets, in the same order as word_targets.
scores = getScores(word_targets)

In [None]:
def getFreq(features, scores, target):
    """Count how often each (score, feature) pair occurs.

    `target` only determines how many positions are visited; position k pairs
    `scores[k]` with `features[k]`.

    Returns:
        A dict mapping `(score, feature)` to its number of occurrences.
    """
    counts = {}

    for index, _ in enumerate(target):
        pair = (scores[index], features[index])
        counts[pair] = counts.get(pair, 0) + 1

    return counts

## Get word length 

In [None]:
def getLength(target):
    """Return the character length of each row's 'target' string, in order."""
    return [len(row['target']) for row in target]

# Count (label, word length) pairs for the single-word targets and save the
# resulting table as plain text.
lengths = getLength(word_targets)
freq = getFreq(lengths, scores, word_targets)
makeTable(lengths, scores, freq, 'statistics/lengths-score-freq.txt')

## Get word dependency

In [None]:
import stanza
# English Stanza pipeline; the cells below read each word's `deprel` from it.
nlp_stanza = stanza.Pipeline('en')

In [None]:
# Sanity check: parse the first single-word row's sentence and print every
# word together with its dependency relation.
doc = nlp_stanza(word_targets[0]['sentence'])
print(*[f'word: {word.text}\tdeprel: {word.deprel}\n' for sent in doc.sentences for word in sent.words])

In [None]:
def getDep(line):
    """Return the dependency relation of the target word of a row.

    The row's sentence is parsed with the module-level `nlp_stanza` pipeline
    and the target is located through the character offsets Stanza records on
    each token. Estimating positions as "token length + 1" drifts as soon as
    the tokenizer splits something that is not preceded by a space (such as
    punctuation), so the recorded offsets are used instead.

    Args:
        line: Mapping with a 'sentence' string and a 'start' character offset
            (anything `int()` accepts).

    Returns:
        The `deprel` of the first word of the first token that ends after
        `start`, or None when the sentence has no such token.
    """
    start = int(line['start'])
    doc = nlp_stanza(line['sentence'])

    for sent in doc.sentences:
        for token in sent.tokens:
            # end_char is exclusive: this picks the token containing `start`,
            # or the next token when `start` falls on whitespace.
            # NOTE(review): assumes the pipeline tokenizes raw text so that
            # start_char/end_char are populated -- confirm for this model.
            if token.end_char > start:
                return token.words[0].deprel

    return None

# Dependency relation for the first 18 single-word targets only.
dep = [getDep(row) for row in word_targets[:18]]

## POS tagging

In [None]:
def getWordID(start, sentence):
    """Return the number of space characters before offset `start`.

    Only the characters at indices below `start` are inspected; an offset past
    the end of `sentence` counts every space, and a negative offset counts none.
    """
    # Clamp so that a negative offset selects an empty prefix instead of
    # slicing relative to the end of the string.
    prefix = sentence[:max(start, 0)]
    return prefix.count(' ')

In [None]:
from nltk import pos_tag, word_tokenize

def getPOS(line):
    """Return the POS tag of the row's target word, or None if it is not found.

    The number of spaces before the row's 'start' offset (see getWordID) is
    used as the first token index to consider. The first tagged token at or
    after that index whose text equals the row's 'target' supplies the tag.
    """
    first_candidate = getWordID(int(line['start']), line['sentence'])
    tagged = pos_tag(word_tokenize(line['sentence']))

    for token, tag in tagged[first_candidate:]:
        if token == line['target']:
            return tag

    return None

In [None]:
# POS tag of every single-word target (None where getPOS found no match).
pos = [getPOS(row) for row in word_targets]

In [None]:
# Notebook display: the distinct values collected in `pos`.
set(pos)

In [None]:
# Count (label, POS tag) pairs and save the table; makeTable leaves out the
# falsy entries of `pos` (rows for which getPOS returned None).
freq = getFreq(pos, scores, word_targets)
makeTable(pos, scores, freq, 'statistics/pos-score-freq.txt')

# Solve problem

In [None]:
import nltk
from nltk.corpus import brown
# Fetch the 'brown' resource for the `brown` corpus reader imported above.
nltk.download('brown')

In [None]:
from numpy import loadtxt

# Raw tab-separated rows of the training and test files; every field is kept
# as a string.
lines = loadtxt('train_full.txt', dtype='str', delimiter='\t')
linesTest = loadtxt('test.txt', dtype='str', delimiter='\t')

# Dataframe

In [None]:
def getTrainData():
    """Return the lower-cased column 4 of every row in the global `lines`."""
    target_column = 4
    return [row[target_column].lower() for row in lines]


def getTrainLabel():
    """Return column 9 of every row in the global `lines`, unchanged."""
    label_column = 9
    return [row[label_column] for row in lines]


def getTestData():
    """Return the lower-cased column 4 of every row in the global `linesTest`."""
    target_column = 4
    return [row[target_column].lower() for row in linesTest]


def getTestLabel():
    """Return column 10 of every row in the global `linesTest`, unchanged.

    NOTE(review): the training label is read from column 9, not 10 -- confirm
    that the test file really carries its label in column 10.
    """
    label_column = 10
    return [row[label_column] for row in linesTest]


def getDataFrame():
    """Build the training and test DataFrames.

    Returns:
        A `(train_df, test_df)` pair. `train_df` has the columns 'data' and
        'label'; `test_df` has only 'data' (no test labels are attached).
    """
    test_df = pd.DataFrame(getTestData(), columns=['data'])

    train_df = pd.DataFrame(getTrainData(), columns=['data'])
    train_df['label'] = getTrainLabel()

    return train_df, test_df


def splitTrainTest():
    """Split the labelled training rows with a shuffled 20% hold-out.

    Returns:
        The result of `train_test_split` for a DataFrame with the columns
        'data' and 'label', called with `test_size=0.2` and `shuffle=True`.
    """
    labelled = pd.DataFrame(getTrainData(), columns=['data'])
    labelled['label'] = getTrainLabel()

    return train_test_split(labelled, shuffle=True, test_size=0.2)

# Vectorizers

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize, RegexpTokenizer
import pandas as pd
import numpy as np


class Vectorizer():
    """Turn target phrases into feature matrices with one of three models.

    Supported `model` values:
        'tfidf':    a TfidfVectorizer fitted on the word tokens of all
                    training and test sentences.
        'word2vec': a Word2Vec model trained on the training/test sentences
                    plus the Brown corpus; a phrase is represented by the
                    mean of its word vectors.
        'semantic': hand-crafted per-word features (see extractFeatures),
                    averaged over the words of a phrase.

    The corpus is built from the module-level `lines`, `linesTest` and
    `brown`. After construction the instance exposes `X_train`, `X_test` and
    `y_train`.
    """

    # Dimensionality of the word2vec embeddings.
    vector_size = 300

    def __init__(self, model, train_df, test_df):
        """Fit the chosen model and vectorize both data frames.

        Args:
            model: 'tfidf', 'word2vec' or 'semantic'.
            train_df: DataFrame with a 'data' column of phrases and a 'label'
                column convertible to float.
            test_df: DataFrame with a 'data' column of phrases.

        Raises:
            ValueError: If `model` is not one of the supported names.
        """
        if model not in ('tfidf', 'word2vec', 'semantic'):
            # Fail early instead of building an object without X_train/X_test.
            raise ValueError(f'unknown model: {model!r}')

        self.model = model

        if model == 'tfidf':
            self.makeDatasetFromWords()

            self.vectorizer = TfidfVectorizer(lowercase=False)
            self.vectorizer.fit(self.dataset)

        elif model == 'word2vec':
            self.makeDatasetFromSentences()

            self.vectorizer = Word2Vec(
                sentences=self.dataset,
                vector_size=self.vector_size,
                window=5,
                min_count=1,
                workers=16,
            )
            self.vectorizer.save("word2vec.model")

        elif model == 'semantic':
            # The features themselves need no corpus; `dataset` is still set
            # so the attribute exists for every model.
            self.makeDatasetFromWords()

        self.vectorize(train_df, test_df)

    def makeDatasetFromSentences(self):
        """Set `self.dataset` to tokenized sentences (a list of token lists).

        The sentences are the distinct lower-cased sentences (column 1) of
        `lines` and `linesTest` together with the lower-cased Brown corpus.
        """
        brownData = {' '.join(sentence).lower() for sentence in brown.sents()}
        # A plain list is enough here and avoids a fixed-width string array
        # sized by the longest sentence.
        self.dataset = list(
            {line[1].lower() for line in lines}
            | {line[1].lower() for line in linesTest}
            | brownData
        )
        self.preprocessSentences()

    def makeDatasetFromWords(self):
        """Set `self.dataset` to the word tokens of all train/test sentences."""
        self.dataset = list(
            {line[1].lower() for line in lines}
            | {line[1].lower() for line in linesTest}
        )
        self.preprocessWords()

    def preprocessWords(self):
        """Replace `self.dataset` by the flat array of all of its tokens."""
        self.dataset = np.array([
            token
            for phrase in self.dataset
            for token in self.tokenizePhrase(phrase)
        ])

    def preprocessSentences(self):
        """Replace `self.dataset` by one token list per distinct sentence.

        Sentences that tokenize to the same text are kept once, and sentences
        without any token are dropped.
        """
        tokens = {' '.join(self.tokenizePhrase(phrase)) for phrase in self.dataset}
        # Token lists differ in length, so they stay a list of lists: NumPy
        # refuses to build an array from ragged nested sequences.
        self.dataset = [tokensList.split() for tokensList in tokens if tokensList]

    def tokenizePhrase(self, phrase):
        """Return the tokens of `phrase`.

        Runs of ASCII letters, apostrophes and hyphens are extracted first;
        the result is re-joined with spaces and passed to `word_tokenize`.
        """
        tokenizer = RegexpTokenizer(r'[a-zA-Z\'-]+')
        return word_tokenize(' '.join(tokenizer.tokenize(phrase)))

    def vectorize(self, train_df, test_df):
        """Compute `X_train`, `X_test` and `y_train` for the chosen model."""
        if self.model == 'tfidf':
            self.X_train = self.vectorizer.transform(train_df['data']).toarray()
            self.X_test = self.vectorizer.transform(test_df['data']).toarray()

        elif self.model == 'word2vec':
            self.X_train = np.array([self.vectorizePhrase(phrase, self.vectorizer.wv.get_vector) for phrase in train_df['data']])
            self.X_test = np.array([self.vectorizePhrase(phrase, self.vectorizer.wv.get_vector) for phrase in test_df['data']])

        elif self.model == 'semantic':
            self.X_train = np.array([self.vectorizePhrase(phrase, self.extractFeatures) for phrase in train_df['data']])
            self.X_test = np.array([self.vectorizePhrase(phrase, self.extractFeatures) for phrase in test_df['data']])

        # Only training labels are used; the test frame carries none.
        self.y_train = np.array(train_df['label'], dtype='float')

    def vectorizePhrase(self, phrase, func):
        """Return the element-wise mean of `func(word)` over the phrase's words.

        Args:
            phrase: Text to tokenize with tokenizePhrase.
            func: Callable mapping one word to a fixed-length numeric vector.

        NOTE: a phrase without any token produces an empty array, for which
        `np.mean` returns nan.
        """
        vectorizations = np.array([func(word) for word in self.tokenizePhrase(phrase)])

        return np.mean(vectorizations, axis=0)

    def extractFeatures(self, word):
        """Return four surface features of `word`.

        The result is `[length, vowel ratio, double letters, longest consonant
        run]`, where vowels are the lower-case letters 'aeiou', a double
        letter is a position whose character equals the previous one, and a
        consonant is any alphabetic character that is not one of those vowels
        (so hyphens and apostrophes end a consonant run).

        An empty word yields `[0, 0.0, 0, 0]`.
        """
        length = len(word)
        if length == 0:
            # Avoid dividing by zero in the vowel ratio.
            return [0, 0.0, 0, 0]

        vowels = sum(map(word.count, 'aeiou'))
        doubleLetters = sum(word[i - 1] == word[i] for i in range(1, length))

        maxConsecutiveConsonants = 0
        localSum = 0
        for letter in word:
            if letter.isalpha() and letter not in 'aeiou':
                # Extend the current consonant run.
                localSum += 1
                maxConsecutiveConsonants = max(maxConsecutiveConsonants, localSum)
            else:
                # A vowel or a non-letter ends the run.
                localSum = 0

        return [length, vowels / length, doubleLetters, maxConsecutiveConsonants]

In [None]:
# Build the data frames once and derive the three feature representations.
train_df, test_df = getDataFrame()

tfidf = Vectorizer('tfidf', train_df, test_df)
w2v = Vectorizer('word2vec', train_df, test_df)
# NOTE: Solve.__init__ reads this global `features` to append the hand-crafted
# columns to whichever vectorizer it is given.
features = Vectorizer('semantic', train_df, test_df)

# Test class

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVR, LinearSVR, NuSVR
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

from matplotlib.pyplot import hist


class Solve:
    """Fit regressors on vectorized phrases and produce test predictions.

    The feature matrices of the given vectorizer are extended column-wise with
    those of a second ("extra") vectorizer. Targets are stored as column
    vectors. `y_test` is None unless the vectorizer provides test labels.
    """

    def __init__(self, vectorizer, extra=None):
        """Collect the train/test matrices.

        Args:
            vectorizer: Object exposing `X_train`, `X_test`, `y_train` and,
                optionally, `y_test`.
            extra: Object exposing `X_train` and `X_test` whose columns are
                appended to the vectorizer's. Defaults to the module-level
                `features` vectorizer.
        """
        if extra is None:
            extra = features

        self.X_train = np.c_[vectorizer.X_train, extra.X_train]
        self.X_test = np.c_[vectorizer.X_test, extra.X_test]

        self.y_train = np.asarray(vectorizer.y_train).reshape(-1, 1)

        # Test labels are optional; validation() reports no test score
        # when they are absent.
        y_test = getattr(vectorizer, 'y_test', None)
        self.y_test = None if y_test is None else np.asarray(y_test).reshape(-1, 1)

    def scaleData(self, y_scaler):
        """Standardise the features and the training targets.

        The feature scaler is fitted on the training features only and then
        applied to the test features; `y_scaler` is fitted on `y_train`.

        Args:
            y_scaler: Unfitted scaler with `fit_transform`; it is fitted in
                place so the caller can invert predictions later.

        Returns:
            `(X_train, X_test, y_train)`, all scaled.
        """
        X_scaler = StandardScaler()
        X_train = X_scaler.fit_transform(self.X_train)
        X_test = X_scaler.transform(self.X_test)

        y_train = y_scaler.fit_transform(self.y_train)

        return X_train, X_test, y_train

    def _unscale(self, y, scaler):
        """Map scaled targets back to the original scale, as a column vector.

        Models may predict 1-D or (n, 1) arrays; both are accepted.
        """
        return scaler.inverse_transform(np.asarray(y).reshape(-1, 1))

    # - Validation --------------------------------------------------

    def validation(self, model, to_scale):
        """Fit `model` and return `(train_mae, test_mae)` in original units.

        `test_mae` is None when no test labels are available.
        """
        if to_scale:
            y_scaler = StandardScaler()
            X_train, X_test, y_train = self.scaleData(y_scaler)

            model.fit(X_train, y_train)
            train_score = self.scaled_mae(y_train, model.predict(X_train), y_scaler)

            test_score = None
            if self.y_test is not None:
                y_test = y_scaler.transform(self.y_test)
                test_score = self.scaled_mae(y_test, model.predict(X_test), y_scaler)

            return train_score, test_score

        model.fit(self.X_train, self.y_train)
        train_score = mean_absolute_error(self.y_train, model.predict(self.X_train))

        test_score = None
        if self.y_test is not None:
            test_score = mean_absolute_error(self.y_test, model.predict(self.X_test))

        return train_score, test_score

    def scaled_mae(self, y_true, y_pred, scaler):
        """Return the MAE between two scaled target arrays, in original units."""
        return mean_absolute_error(self._unscale(y_true, scaler), self._unscale(y_pred, scaler))

    def printScore(self, model, model_name, to_scale = True):
        """Fit `model` via validation() and print its train and test MAE."""
        train_score, test_score = self.validation(model, to_scale)

        print(f'{model_name}: \n' + \
             f'Train MAE: {train_score} \n' + \
             f'Test MAE: {test_score} \n')

    # - Train -------------------------------------------------------

    def train(self, model, to_scale):
        """Fit `model` on the training data and predict the test set.

        Args:
            model: Estimator with `fit` and `predict`.
            to_scale: When true, features and targets are standardised before
                fitting and the predictions are mapped back afterwards.

        Returns:
            A two-column array pairing the first field of every row of the
            module-level `linesTest` with the prediction for that row. The
            predictions are also stored in `self.res`.
        """
        if to_scale:
            y_scaler = StandardScaler()
            X_train, X_test, y_train = self.scaleData(y_scaler)

            model.fit(X_train, y_train)
            # Predictions are reshaped to a column before the inverse
            # transform, because several regressors return 1-D output.
            self.res = self._unscale(model.predict(X_test), y_scaler)

        else:
            model.fit(self.X_train, self.y_train)
            self.res = model.predict(self.X_test)

        return np.c_[[line[0] for line in linesTest], self.res]

    def saveScore(self, file, solution):
        """Write `solution` as CSV with an 'id,label' header to ./submissions/<file>."""
        np.savetxt('./submissions/' + file, solution, delimiter=',', header='id,label', comments='', fmt='%s')

In [None]:
# Solver over the word2vec representation.
solution_w2v = Solve(w2v)

In [None]:
# Solver over the TF-IDF representation.
solutions = Solve(tfidf)

# Train and save best scores

In [None]:
# 2-nearest-neighbour regression on the word2vec solver: first with
# standardised features/targets (to_scale=True), then on the raw values.
sol_w2v_knn_scaling = solution_w2v.train(KNeighborsRegressor(n_neighbors=2), True)
sol_w2v_knn = solution_w2v.train(KNeighborsRegressor(n_neighbors=2), False)

In [None]:
# NOTE(review): LogisticRegression is a classifier, while every other model
# trained here is a regressor and LinearRegression is imported alongside it --
# confirm which estimator was intended.
sol_w2v_lr = solution_w2v.train(LogisticRegression(), False)

In [None]:
# NuSVR with C=10 and coef0=1.0, trained on standardised data.
sol_w2v_nusvr = solution_w2v.train(NuSVR(C=10, coef0=1.0), True)

In [None]:
# LinearSVR with C=5, trained on standardised data.
sol_w2v_lsvr = solution_w2v.train(LinearSVR(C=5), True)

In [None]:
# Ridge regression with alpha=1.0, trained on standardised data.
sol_w2v_ridge = solution_w2v.train(Ridge(alpha=1.0), True)

In [None]:
# Random forest with 1000 trees (max_depth=300, fixed random_state), trained
# on standardised data.
sol_w2v_rfr = solution_w2v.train(RandomForestRegressor(n_estimators=1000, max_depth=300, random_state=0), True)

In [None]:
# Save the unscaled k-NN predictions as ./submissions/submission5.csv.
solution_w2v.saveScore('submission5.csv', sol_w2v_knn)