# Statistics

## Installs

In [1]:
!pip install tabulate



You should consider upgrading via the 'c:\users\mirun\anaconda3\python.exe -m pip install --upgrade pip' command.


## Data representation

In [3]:
from numpy import loadtxt
lines = loadtxt('train_full.txt', dtype='str', delimiter='\t')

labels = ['id', 'sentence', 'start', 'end', 'target', 'native', 'non-native', 'native_score', 'non-native_score', 'label']

data = [dict(zip(labels, line)) for line in lines]
data

[{'id': '1',
  'sentence': "The barren islands, reefs and coral outcrops are believed to be in rich in oil and gas and the overlapping claims have long been feared as Asia's next flashpoint for armed conflict.",
  'start': '4',
  'end': '10',
  'target': 'barren',
  'native': '10',
  'non-native': '10',
  'native_score': '6',
  'non-native_score': '2',
  'label': '0.4'},
 {'id': '2',
  'sentence': "The barren islands, reefs and coral outcrops are believed to be in rich in oil and gas and the overlapping claims have long been feared as Asia's next flashpoint for armed conflict.",
  'start': '4',
  'end': '18',
  'target': 'barren islands',
  'native': '10',
  'non-native': '10',
  'native_score': '0',
  'non-native_score': '1',
  'label': '0.05'},
 {'id': '3',
  'sentence': "The barren islands, reefs and coral outcrops are believed to be in rich in oil and gas and the overlapping claims have long been feared as Asia's next flashpoint for armed conflict.",
  'start': '20',
  'end': '25',

In [3]:
word_targets = [line for line in data if len(line['target'].split()) == 1]
len(word_targets)

11949

## Make table

In [4]:
import numpy as np

def makeTable(features, scores, frequency, file):
    features = sorted(set([feature for feature in features if feature]))
    scores = list(sorted(set(scores)))

    info = [scores]
    for feature in features:
        line = []
        for score in scores:
            if (score, feature) not in frequency.keys():
                frequency[(score, feature)] = 0
            line.append(freq[(score, feature)])
        info.append(line)

    table = tabulate(info, headers='firstrow', showindex=features)
    with open(file, 'w') as f:
        f.write(table)

In [5]:
def getScores(target):
    scores = []

    for line in target:
        scores.append(line['label'])
    
    return scores

scores = getScores(word_targets)

In [6]:
def getFreq(features, scores, target):
    freq = {}
    i = 0
    j = 0

    for line in target:
        score = scores[i]
        feature = features[j]

        if (score, feature) not in freq.keys():
            freq[(score, feature)] = 0

        freq[(score, feature)] = freq[(score, feature)] + 1

        i += 1
        j += 1

    return freq

## Get word length 

In [176]:
def getLength(target):
    freq = {}
    lengths = []

    for line in target:
        lengths.append(len(line['target']))
    return lengths

lengths = getLength(word_targets)
freq = getFreq(lengths, scores, word_targets)
makeTable(lengths, scores, freq, 'lengths-score-freq.txt')

## Get word dependency

In [None]:
import stanza
nlp_stanza = stanza.Pipeline('en')

In [140]:
doc = nlp_stanza(word_targets[0]['sentence'])
print(*[f'word: {word.text}\tdeprel: {word.deprel}\n' for sent in doc.sentences for word in sent.words])

word: The	deprel: det
 word: barren	deprel: amod
 word: islands	deprel: nsubj:pass
 word: ,	deprel: punct
 word: reefs	deprel: conj
 word: and	deprel: cc
 word: coral	deprel: amod
 word: outcrops	deprel: conj
 word: are	deprel: aux:pass
 word: believed	deprel: root
 word: to	deprel: mark
 word: be	deprel: cop
 word: in	deprel: case
 word: rich	deprel: xcomp
 word: in	deprel: case
 word: oil	deprel: obl
 word: and	deprel: cc
 word: gas	deprel: conj
 word: and	deprel: cc
 word: the	deprel: det
 word: overlapping	deprel: amod
 word: claims	deprel: nsubj:pass
 word: have	deprel: aux
 word: long	deprel: advmod
 word: been	deprel: aux:pass
 word: feared	deprel: conj
 word: as	deprel: case
 word: Asia	deprel: nmod:poss
 word: 's	deprel: case
 word: next	deprel: amod
 word: flashpoint	deprel: obl
 word: for	deprel: case
 word: armed	deprel: amod
 word: conflict	deprel: nmod
 word: .	deprel: punct



In [145]:
def getDep(line):
    doc = nlp_stanza(line['sentence'])
    pos = 0
    for sent in doc.sentences:
        for word in sent.words:
            if pos >= int(line['start']):
                return word.deprel
            pos += len(word.text) + 1

dep = []
for line in word_targets[:18]:
    dep.append(getDep(line))
#     print(f"{line['target']} {getDep(line)}")

barren amod
reefs conj
islands nsubj:pass
coral amod
outcrops conj
overlapping amod
believed root
rich xcomp
oil obl
gas conj
flashpoint obl
claims nsubj:pass
long advmod
feared conj
Asia nmod:poss
conflict nmod
armed amod
flexed root


## POS tagging

In [154]:
def getWordID(start, sentence):
    pos = 0
    for i in range(len(sentence)):
        if i >= start:
            return pos
        if sentence[i] == ' ':
            pos += 1
    return pos

In [191]:
from nltk import pos_tag, word_tokenize

def getPOS(line):
    pos = getWordID(int(line['start']), line['sentence'])
    text = pos_tag(word_tokenize(line['sentence']))
    for i in range(len(text)):
        if i >= pos and text[i][0] == line['target']:
            return text[i][1]

In [192]:
pos = []
for line in word_targets:
    pos.append(getPOS(line))

In [194]:
set(pos)

{'CC',
 'CD',
 'FW',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'MD',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 None,
 'RB',
 'RBR',
 'RP',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ'}

In [199]:
freq = getFreq(pos, scores, word_targets)
makeTable(pos, scores, freq, 'pos-score-freq.txt')

# Basic libraries

In [7]:
one_word_lines = [line[:5] for line in lines if len(line[4].split()) == 1]
one_word_lines

[array(['1',
        "The barren islands, reefs and coral outcrops are believed to be in rich in oil and gas and the overlapping claims have long been feared as Asia's next flashpoint for armed conflict.",
        '4', '10', 'barren'], dtype='<U657'),
 array(['3',
        "The barren islands, reefs and coral outcrops are believed to be in rich in oil and gas and the overlapping claims have long been feared as Asia's next flashpoint for armed conflict.",
        '20', '25', 'reefs'], dtype='<U657'),
 array(['4',
        "The barren islands, reefs and coral outcrops are believed to be in rich in oil and gas and the overlapping claims have long been feared as Asia's next flashpoint for armed conflict.",
        '11', '18', 'islands'], dtype='<U657'),
 array(['5',
        "The barren islands, reefs and coral outcrops are believed to be in rich in oil and gas and the overlapping claims have long been feared as Asia's next flashpoint for armed conflict.",
        '30', '35', 'coral'], dtype=

# Solve class

In [1]:
from numpy import loadtxt
lines = loadtxt('train_full.txt', dtype='str', delimiter='\t')

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

class Solve:
    tfidf_vectorizer = TfidfVectorizer(lowercase = False)

    def __init__(self, data, labels):
        self.splitTrainTest(self.getDataFrame(self.preprocess(data), labels))    
        self.vectorize()
        self.getScores()

    def preprocess(self, data):
        # variabil
        return data;
    
    def getDataFrame(self, data, labels):
        data_df = pd.DataFrame(data, columns = ['data'])
        data_df['label'] = labels
        return data_df

    def splitTrainTest(self, data_df):
        self.train_df, self.test_df = train_test_split(data_df, test_size=0.2, shuffle = True)
        
    def vectorizeData(self, data_df):
        # variabil
        X_data = self.tfidf_vectorizer.transform(data_df['data'])
        y_data = data_df['label']

        return X_data, y_data
    
    def vectorize(self):
        tfidf_representation = self.tfidf_vectorizer.fit(self.train_df['data'])

        self.X_train, self.y_train = self.vectorizeData(self.train_df)
        self.X_test, self.y_test = self.vectorizeData(self.test_df)
    
    def train(self, model):
        model.fit(self.X_train, self.y_train)
        y_predict = model.predict(self.X_train)

        return self.mae(self.y_train, model.predict(self.X_train)), self.mae(self.y_test, model.predict(self.X_test))
    
    def mae(self, y_true, y_pred):
        return np.mean(np.abs(np.array(y_true, dtype='float') - np.array(y_pred, dtype='float')))

    def printScore(self, model, model_name):
        train_score, test_score = self.train(model)
        
        print(f'{model_name}: \n' + \
             f'Train MAE: {train_score} \n' + \
             f'Test MAE: {test_score} \n')
        
    def getScores(self):
        self.printScore(LogisticRegression(), "Logistic Regression")
        self.printScore(SVR(C=10, kernel='linear'), "SVR")

        # de adaugat mai multe modele de train

In [8]:
solutions = Solve([line[4] for line in lines], [line[9] for line in lines])

Logistic Regression
Train Score 0.640835639675029
Test Score 0.6115672973937879


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
