# Statistics

## Installs

In [None]:
!pip install tabulate

## Data representation

In [None]:
from numpy import loadtxt
lines = loadtxt('train_full.txt', dtype='str', delimiter='\t')

labels = ['id', 'sentence', 'start', 'end', 'target', 'native', 'non-native', 'native_score', 'non-native_score', 'label']

data = [dict(zip(labels, line)) for line in lines]
data

In [None]:
word_targets = [line for line in data if len(line['target'].split()) == 1]
len(word_targets)

## Make table

In [None]:
import numpy as np

def makeTable(features, scores, frequency, file):
    features = sorted(set([feature for feature in features if feature]))
    scores = list(sorted(set(scores)))

    info = [scores]
    for feature in features:
        line = []
        for score in scores:
            if (score, feature) not in frequency.keys():
                frequency[(score, feature)] = 0
            line.append(freq[(score, feature)])
        info.append(line)

    table = tabulate(info, headers='firstrow', showindex=features)
    with open(file, 'w') as f:
        f.write(table)

In [None]:
def getScores(target):
    scores = []

    for line in target:
        scores.append(line['label'])
    
    return scores

scores = getScores(word_targets)

In [None]:
def getFreq(features, scores, target):
    freq = {}
    i = 0
    j = 0

    for line in target:
        score = scores[i]
        feature = features[j]

        if (score, feature) not in freq.keys():
            freq[(score, feature)] = 0

        freq[(score, feature)] = freq[(score, feature)] + 1

        i += 1
        j += 1

    return freq

## Get word length 

In [None]:
def getLength(target):
    freq = {}
    lengths = []

    for line in target:
        lengths.append(len(line['target']))
    return lengths

lengths = getLength(word_targets)
freq = getFreq(lengths, scores, word_targets)
makeTable(lengths, scores, freq, 'lengths-score-freq.txt')

## Get word dependency

In [None]:
import stanza
nlp_stanza = stanza.Pipeline('en')

In [None]:
doc = nlp_stanza(word_targets[0]['sentence'])
print(*[f'word: {word.text}\tdeprel: {word.deprel}\n' for sent in doc.sentences for word in sent.words])

In [None]:
def getDep(line):
    doc = nlp_stanza(line['sentence'])
    pos = 0
    for sent in doc.sentences:
        for word in sent.words:
            if pos >= int(line['start']):
                return word.deprel
            pos += len(word.text) + 1

dep = []
for line in word_targets[:18]:
    dep.append(getDep(line))
#     print(f"{line['target']} {getDep(line)}")

## POS tagging

In [None]:
def getWordID(start, sentence):
    pos = 0
    for i in range(len(sentence)):
        if i >= start:
            return pos
        if sentence[i] == ' ':
            pos += 1
    return pos

In [None]:
from nltk import pos_tag, word_tokenize

def getPOS(line):
    pos = getWordID(int(line['start']), line['sentence'])
    text = pos_tag(word_tokenize(line['sentence']))
    for i in range(len(text)):
        if i >= pos and text[i][0] == line['target']:
            return text[i][1]

In [None]:
pos = []
for line in word_targets:
    pos.append(getPOS(line))

In [None]:
set(pos)

In [None]:
freq = getFreq(pos, scores, word_targets)
makeTable(pos, scores, freq, 'pos-score-freq.txt')

In [None]:
print(w2v_model.wv.most_similar(positive='chinese'))
print(w2v_model.wv.most_similar(positive='family'))
print(w2v_model.wv.most_similar(positive='country'))
print(w2v_model.wv.most_similar(positive='attack'))

# Basic libraries

In [None]:
one_word_lines = [line[:5] for line in lines if len(line[4].split()) == 1]
one_word_lines

# Solve problem

In [1]:
from numpy import loadtxt
lines = loadtxt('train_full.txt', dtype='str', delimiter='\t')
# linesTest = loadtxt('test.txt', dtype='str', delimiter='\t')
linesTest = loadtxt('News_Dev.tsv', dtype='str', delimiter='\t')

# Vectorizers

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize, RegexpTokenizer
import pandas as pd
import numpy as np


class Vectorizer():
    vector_size = 300

    def __init__(self, model, train_df, test_df):
        self.model = model

        if model == 'tfidf':
            self.makeDatasetFromWords()

            self.vectorizer = TfidfVectorizer(lowercase = False)
            self.vectorizer.fit(self.dataset)

        elif model == 'word2vec':
            self.makeDatasetFromSentences()

            self.vectorizer = Word2Vec(sentences=self.dataset, vector_size=300, window=5, min_count=1, workers=16)
            self.vectorizer.save("word2vec.model")

        elif model == 'fastText':
            self.makeDatasetFromSentences()
            
            self.vectorizer = FastText(vector_size=300, window=5, min_count=1)
            self.vectorizer.build_vocab(corpus_iterable=self.dataset)
            self.vectorizer.train(corpus_iterable=self.dataset, total_examples=len(self.dataset), epochs=10)

        elif model == 'semantic':
            self.makeDatasetFromWords()


        self.vectorize(train_df, test_df)


    def makeDatasetFromSentences(self):
        self.dataset = np.array(list(set([line[1].lower() for line in lines] + [line[1].lower() for line in linesTest])))
        self.preprocess()


    def makeDatasetFromWords(self):
        self.dataset = np.array(list(set(
            [line[1].lower() for line in lines] + [line[1].lower() for line in linesTest] + \
            [line[4].lower() for line in lines] + [line[4].lower() for line in linesTest] \
        )))
        self.preprocess()


    def preprocess(self):
        tokenizer = RegexpTokenizer(r'[a-zA-Z\'-]+')
        self.dataset = np.array(list(set(np.hstack([self.tokenizePhrase(phrase) for phrase in self.dataset]))))


    def tokenizePhrase(self, phrase):
        tokenizer = RegexpTokenizer(r'[a-zA-Z\'-]+')
        return word_tokenize(' '.join(tokenizer.tokenize(phrase)))


    def vectorize(self, train_df, test_df):
        if self.model == 'tfidf':
            self.X_train = self.vectorizer.transform(train_df['data']).toarray()
            self.X_test = self.vectorizer.transform(test_df['data']).toarray()


        elif self.model in ['word2vec', 'fastText']:
            self.X_train = np.array([self.vectorizePhrase(phrase, self.vectorizer.wv.get_vector) for phrase in train_df['data']])
            self.X_test = np.array([self.vectorizePhrase(phrase, self.vectorizer.wv.get_vector) for phrase in test_df['data']])

        self.y_train = np.array(train_df['label'], dtype='float')
        self.y_test = np.array(test_df['label'], dtype='float')


    def vectorizePhrase(self, phrase, func):
        tokenizer = RegexpTokenizer(r'[a-zA-Z\'-]+')
        vectorizations = np.array([func(word) for word in self.tokenizePhrase(phrase)])

        return np.mean(vectorizations, axis=0)

# Test class

In [11]:
def getTrainData():
    return [line[4].lower() for line in lines]

def getTrainLabel():
    return [line[9] for line in lines]

def getTestData():
    return [line[4].lower() for line in linesTest]

def getTestLabel():
    return [line[10] for line in linesTest]

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

from matplotlib.pyplot import hist


class Solve:
    def __init__(self, model):
        self.getDataFrame()
#         self.splitTrainTest()
#         features_train = self.getFeatures(getTrainData())
#         features_test = self.getFeatures(getTestData())

        vectorizer = Vectorizer(model, self.train_df, self.test_df)

        self.X_train = vectorizer.X_train
        self.X_test = vectorizer.X_test

        self.y_train = vectorizer.y_train
        self.y_test = vectorizer.y_test

#         self.X_train = np.c_[self.X_train, features_train.reshape(-1,1)]
#         self.X_test = np.c_[self.X_test, features_test.reshape(-1,1)]

        self.y_train = self.y_train.reshape(-1, 1)
        self.y_test = self.y_test.reshape(-1, 1)

        self.getScores()
#         self.findScore()


    def getFeatures(self, X_data):
        return np.array([np.mean([len(word) for word in phrase.split()]) for phrase in X_data])


    def getDataFrame(self):
        self.train_df = pd.DataFrame(getTrainData(), columns = ['data'])
        self.train_df['label'] = getTrainLabel()

        self.test_df = pd.DataFrame(getTestData(), columns = ['data'])
        self.test_df['label'] = getTestLabel()


    def splitTrainTest(self):
        data_df = pd.DataFrame(getTrainData(), columns = ['data'])
        data_df['label'] = getTrainLabel()

        self.train_df, self.test_df = train_test_split(data_df, test_size=0.2, shuffle = True)


    def scaleData(self, y_scaler):
        X_scaler = StandardScaler()
        X_train = X_scaler.fit_transform(self.X_train)
        X_test = X_scaler.transform(self.X_test)

        y_train = y_scaler.fit_transform(self.y_train)
        y_test = y_scaler.transform(self.y_test)

        return X_train, X_test, y_train, y_test


    # - Train --------------------------------------------------

    def train(self, model, to_scale):
        if to_scale:
            y_scaler = StandardScaler()
            X_train, X_test, y_train, y_test = self.scaleData(y_scaler)

            model.fit(X_train, y_train)
            return self.scaled_mae(y_train, model.predict(X_train), y_scaler), \
                self.scaled_mae(y_test, model.predict(X_test), y_scaler)

        else:
            model.fit(self.X_train, self.y_train)
            return mean_absolute_error(self.y_train, model.predict(self.X_train)), \
                mean_absolute_error(self.y_test, model.predict(self.X_test))


    def scaled_mae(self, y_true, y_pred, scaler):
        return mean_absolute_error(scaler.inverse_transform(y_true), scaler.inverse_transform(y_pred))

    
    def printScore(self, model, model_name, to_scale = True):
        train_score, test_score = self.train(model, to_scale)

        print(f'{model_name}: \n' + \
             f'Train MAE: {train_score} \n' + \
             f'Test MAE: {test_score} \n')
    

    def getScores(self):
#         self.printScore(LogisticRegression(), "Logistic Regression", False)
        self.printScore(LinearSVR(C=5), "Linear SVR")
        self.printScore(LinearSVR(C=5), "Linear SVR - no scaling", False)
        self.printScore(Ridge(alpha=1.0), "Ridge")
        self.printScore(Ridge(alpha=1.0), "Ridge - no scaling", False)
        self.printScore(KNeighborsRegressor(n_neighbors=5), f"kNN")
        self.printScore(KNeighborsRegressor(n_neighbors=5), f"kNN - no scaling", False)
#         self.printScore(RandomForestRegressor(n_estimators=1000, max_depth=300, random_state=0), "Forest - no scaling", False)
#         self.printScore(RandomForestRegressor(n_estimators=1000, max_depth=300, random_state=0), "Forest")


        # de adaugat mai multe modele de train
    

    def findScore(self):
        model = LinearSVR(C=5)
        model.fit(self.X_train, self.y_train)

        self.res = model.predict(self.X_test)
        np.savetxt('./submissions/submission3.csv', np.c_[[line[0] for line in linesTest], self.res], delimiter=',', header='id,label', comments='', fmt='%s')

In [None]:
solutions = Solve('tfidf')

  y = column_or_1d(y, warn=True)


Linear SVR: 
Train MAE: 0.05692751442837366 
Test MAE: 0.08428088050105406 



  y = column_or_1d(y, warn=True)


Linear SVR - no scaling: 
Train MAE: 0.038590334318351384 
Test MAE: 0.0675103547424574 

Ridge: 
Train MAE: 0.04595884036176265 
Test MAE: 0.07438219238378277 

Ridge - no scaling: 
Train MAE: 0.058861996181933614 
Test MAE: 0.07853867238235897 



In [10]:
solutions = Solve('word2vec')

KeyError: "Key 'barren' not present"