In [1]:
# Setup
# !pip install pyphen nltk pandas sklearn
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')


In [1]:
import pyphen
import cupy
import re
import numpy as np
import pandas as pd

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

from dale_chall import DALE_CHALL

import os


In [2]:
dtypes = {"sentence": "string", "token": "string", "complexity": "float64"}
train = pd.read_excel(os.path.join(os.getcwd(), "data",
                      "train.xlsx"), dtype=dtypes, keep_default_na=False)
test = pd.read_excel(os.path.join(os.getcwd(), "data",
                     "test.xlsx"), dtype=dtypes, keep_default_na=False)
#print('train data: ', train.shape)
#print('test data: ', test.shape)


In [4]:
train[train["complex"] == False].to_csv("simple.csv")


In [3]:
def nr_syllables(word):
    import re
    return len(
        re.findall('(?!e$)[aeiouy]+', word, re.I) +
        re.findall('^[^aeiouy]*e$', word, re.I)
    )


def is_dale_chall(word):
        return (word.lower() in DALE_CHALL) or (word[:-1].lower() in DALE_CHALL) or (word[:-2].lower() in DALE_CHALL)


def length(word):
    return len(word.replace(" ", ""))


def nr_vowels(word):
    vowels = ["a", "e", "o", "u", "y"]
    nr_vowels = 0
    word = word.lower()
    for vowel in vowels:
        nr_vowels += word.count(vowel)
    return nr_vowels


def is_title(word):
    if word == word.capitalize():
        return 1
    else:
        return 0


def abreviation(word):
    if word == word.upper():
        return 1
    else:
        return 0


def repeating_characters(word):
    chars = "abcdefghijklmnopqrstuvwxyz"
    word = word.lower()
    characters = 0
    for char in chars:
        count = word.count(char)
        if count > 1:
            characters += 1
    return characters


def get_word_structure_features(word):
    features = []
    features.append(nr_syllables(word))
    features.append(is_dale_chall(word))
    features.append(length(word))
    features.append(nr_vowels(word))
    features.append(is_title(word))
    features.append(abreviation(word))
    features.append(repeating_characters(word))
    return np.array(features)


def get_wordnet_features(word):
    features = []
    features.append(len(wordnet.synsets(word)))
    # try:
    #     features.append(len(wordnet.synsets(word)[0].definition()))
    # except:
    #     features.append(0)
    # try:
    #     features.append(len(wordnet.synsets(word)[0].examples()))
    # except:
    #     features.append(0)
    return np.array(features)


def get_position_in_sentence(sentence, word):
    try:
        cp_sentence = sentence.lower()
        cp_word = word.lower()
        words = re.findall(r'\w+', cp_sentence)
        position = words.index(cp_word)
        return [int(position/len(words)) * 10]
    except ValueError:
        words = sentence.split(' ')
        for x in range(len(words)):
            if word in words[x]:
                return [int(x/len(words)) * 10]


def complex_sentence(sentence):
    for x in sentence:
        if(ord(x) > 127):
            return [True]
    return [False]


def mean_complexity_sentence(sentence):
    array = []
    for x in sentence:
        array.append(get_word_structure_features(x))
    return [np.array(array).mean()]


def unique_words(sentence):
    dictionar = {}
    for x in sentence:
        try:
            dictionar[x] += 1
        except:
            dictionar[x] = 1
    return [len(dictionar)]


def corpus_feature(corpus):
    corp_dict = {}
    corp_dict['bible'] = [0]
    corp_dict['europarl'] = [1]
    corp_dict['biomed'] = [2]
    return corp_dict[corpus]


row = train.iloc[:1]
# print(corpus_feature(row['corpus'])[0])


def featurize(row):
    word = row['token']
    all_features = []
    all_features.extend(corpus_feature(row['corpus']))
    all_features.extend(get_word_structure_features(word))
    all_features.extend(get_wordnet_features(word))
    #all_features.extend(complex_sentence(row['sentence']))
    #all_features.extend(mean_complexity_sentence(row['sentence']))
    #all_features.extend(unique_words(row['sentence'])) #extremly unimportant
    #all_features.extend(get_position_in_sentence(row['sentence'], word))
    return np.array(all_features)


def featurize_df(df):
    nr_of_features = len(featurize(df.iloc[0]))
    nr_of_examples = len(df)
    features = np.zeros((nr_of_examples, nr_of_features))
    for index, row in df.iterrows():
        row_ftrs = featurize(row)
        features[index, :] = row_ftrs
    return features


In [4]:
from sklearn.metrics import balanced_accuracy_score


def accuracy(y_true: np.array, y_pred: np.array):
    return np.sum((y_true == y_pred).astype(int)) / len(y_true) * 100


In [5]:
arrays_of_indexes = []
for x in train["corpus"].unique():
    arrays_of_indexes.append(train.loc[train['corpus'] == x].index)


def generate_data(percentage=15):
    chosen_idx = []
    for array in arrays_of_indexes:
        chosen_idx.extend(np.random.choice(
            array, replace=False, size=int(percentage/100*len(array))))
    new_test = train.iloc[chosen_idx]
    new_train = train.drop(chosen_idx)
    new_train.reset_index(drop=True, inplace=True)
    new_test.reset_index(drop=True, inplace=True)
    X_train = featurize_df(new_train)
    y_train = new_train['complex'].values
    X_test = featurize_df(new_test)
    y_test = new_test['complex'].values
    return X_train, y_train, X_test, y_test


In [6]:
def classify_image(train_images, train_labels, test_image, num_neighbors=3):
    train_images = cupy.array(train_images)
    train_labels = cupy.array(train_labels)
    test_image = cupy.array(test_image)
    dist = cupy.sqrt(((test_image-train_images)**2).sum(axis=1))
    indices = dist.argsort()
    labels_knn = train_labels[indices[:num_neighbors]]
    cnt = cupy.bincount(labels_knn)
    label = cnt.argmax()
    return label


def knn_predicitons_cupy(X_train, y_train, X_test, neighbours):
    preds = cupy.zeros(len(X_test))
    for i in range(len(X_test)):
        preds[i] = classify_image(X_train, y_train, X_test[i], neighbours)
    return preds


X_train, y_train, X_test, y_test = generate_data()
preds = knn_predicitons_cupy(X_train, y_train, X_test, 10)


KeyboardInterrupt: 

In [9]:
def knn_predicitons(X_train, y_train, X_test, neighbours):
    model = KNeighborsClassifier(n_neighbors=neighbours)
    model.fit(X_train, y_train)
    return model.predict(X_test)


for nb in [1, 3, 5, 7]:
    scor = []
    for _ in range(20):
        X_train, y_train, X_test, y_test = generate_data(5)
        preds = knn_predicitons(X_train, y_train, X_test, nb)
        scor.append(balanced_accuracy_score(y_test, preds))
    print(nb, np.array(scor).mean())

""" Submission 3
knn 3 neighbours 

100 random train data of 15% of total

features.append(nr_syllables(word))
    features.append(is_dale_chall(word)) // without plural bonus
    features.append(length(word))
    features.append(nr_vowels(word))
    features.append(is_title(word))
    features.append(abreviation(word))
    features.append(repeating_characters(word))

default row features

1 0.668427876407733
2 0.6126221801373262
3 0.6765161869511589
4 0.6386154857166626
5 0.6634404495232458
6 0.6355640755172113
7 0.6488212568854519
8 0.6259146558693992
9 0.6397803876068514
10 0.6189791332207973
11 0.6285574131056153
12 0.6162813529025942
13 0.6258473169720228
14 0.6120803909664643

17m 20s


Submission 4
added plural to dale, very slight increase
1 0.668257413183145
2 0.6126109670655668
3 0.6774073128977224
4 0.6339482434381662
5 0.6680421902150359
6 0.6356128388286275
7 0.6540611829398212
8 0.6241868282579967
9 0.6428127641844336
10 0.6171953886598779
11 0.6298598263970768
12 0.618206900202975
13 0.6238640555630808
14 0.6136176874161184


add position and complex
1 0.6775388554327088
2 0.6166385320256123
3 0.6824216345750901
4 0.6372908738125417
5 0.6681370214586604
6 0.6330719335337368

add length of definition
1 0.7142974520544537
2 0.6392006863590715
3 0.7185837970831896
4 0.6562322581550614
5 0.6767755930256228
6 0.638215011403038
7 0.6478298861631056
8 0.6014318744187663
9 0.6113716172856284

add mean complexity of each word
1 0.6645660311458911
3 0.6825219784240113
5 0.6532568645394083
7 0.6348892638844663

remove position
1 0.6756303700338743
3 0.6742453028316306
5 0.6545457828350185
7 0.6446028971371262
"""


1 0.6826425240420049
3 0.6848066180544582


KeyboardInterrupt: 

In [8]:
from sklearn.svm import SVC

In [23]:
for i in np.arange(0.01,  1, 0.025):
    scores = []
    for _ in range(20):
        X_train, y_train, X_test, y_test = generate_data(20)
        clf = SVC(kernel = 'rbf', C = 3.5, gamma = i, class_weight = 'balanced')
        preds = clf.fit(X_train, y_train).predict(X_test)
        scor = balanced_accuracy_score(y_test, preds)
        scores.append(scor)
    print(i, scor.mean())
"""
C
2.0 0.7989526768510238
2.05 0.7663162573389564
2.0999999999999996 0.8071139172438091
2.1499999999999995 0.7817640548070683
2.1999999999999993 0.7849235827168971
2.249999999999999 0.8002323036630431
2.299999999999999 0.8006677849020192
2.3499999999999988 0.8026971693074595
2.3999999999999986 0.7829305179191633
2.4499999999999984 0.8000819070214088
2.4999999999999982 0.8144846512576605
2.549999999999998 0.7584803934346462
2.599999999999998 0.8094465648854962
2.6499999999999977 0.8028424302827774
2.6999999999999975 0.8026711563768261
2.7499999999999973 0.7883496555315228
2.799999999999997 0.8169983162406245
2.849999999999997 0.8040404040404041
2.899999999999997 0.7843753853742755
2.9499999999999966 0.7891142191142191
2.9999999999999964 0.7970483341584613
3.0499999999999963 0.7826048329779673
3.099999999999996 0.7996438000040306
3.149999999999996 0.8004216348551481
3.1999999999999957 0.7975259473811248
3.2499999999999956 0.8104505935831237
3.2999999999999954 0.7923782416192284
3.349999999999995 0.8022344996930633
3.399999999999995 0.8079854522454143
3.449999999999995 0.8289995549181544
3.4999999999999947 0.7932573090463799
3.5499999999999945 0.8008438228438228
3.5999999999999943 0.809004662004662
3.649999999999994 0.8074961878029119
3.699999999999994 0.816032837149498
3.749999999999994 0.7907520259618133
3.7999999999999936 0.8251874293930368
3.8499999999999934 0.7792741165234002
3.8999999999999932 0.796911421911422
3.949999999999993 0.8119436242798217

gamma
2.0 0.7989526768510238
2.05 0.7663162573389564
2.0999999999999996 0.8071139172438091
2.1499999999999995 0.7817640548070683
2.1999999999999993 0.7849235827168971
2.249999999999999 0.8002323036630431
2.299999999999999 0.8006677849020192
2.3499999999999988 0.8026971693074595
2.3999999999999986 0.7829305179191633
2.4499999999999984 0.8000819070214088
2.4999999999999982 0.8144846512576605
2.549999999999998 0.7584803934346462
2.599999999999998 0.8094465648854962
2.6499999999999977 0.8028424302827774
2.6999999999999975 0.8026711563768261
2.7499999999999973 0.7883496555315228
2.799999999999997 0.8169983162406245
2.849999999999997 0.8040404040404041
2.899999999999997 0.7843753853742755
2.9499999999999966 0.7891142191142191
2.9999999999999964 0.7970483341584613
3.0499999999999963 0.7826048329779673
3.099999999999996 0.7996438000040306
3.149999999999996 0.8004216348551481
3.1999999999999957 0.7975259473811248
3.2499999999999956 0.8104505935831237
3.2999999999999954 0.7923782416192284
3.349999999999995 0.8022344996930633
3.399999999999995 0.8079854522454143
3.449999999999995 0.8289995549181544
3.4999999999999947 0.7932573090463799
3.5499999999999945 0.8008438228438228
3.5999999999999943 0.809004662004662
3.649999999999994 0.8074961878029119
3.699999999999994 0.816032837149498
3.749999999999994 0.7907520259618133
3.7999999999999936 0.8251874293930368
3.8499999999999934 0.7792741165234002
3.8999999999999932 0.796911421911422
3.949999999999993 0.8119436242798217


"""

0.01 0.7553814660647632
0.035 0.7792332188883913
0.060000000000000005 0.7943401857805611
0.085 0.7999629496114715
0.11 0.7961648792974094
0.135 0.8041123132979168
0.16000000000000003 0.7774393152933095
0.18500000000000003 0.7758444113203338
0.21000000000000002 0.802595701933771
0.23500000000000001 0.8002894979669233
0.26 0.8059344246080807
0.28500000000000003 0.7755173952281
0.31000000000000005 0.8013718707861874
0.335 0.8023891920670838
0.36000000000000004 0.7860800980943512
0.385 0.7912374773820556
0.41000000000000003 0.7827268954719935
0.43500000000000005 0.7959176695842451
0.46 0.8013813289514224
0.48500000000000004 0.7600962308739886
0.51 0.8045151912978245
0.535 0.7835241860942795
0.56 0.7822666629581212
0.5850000000000001 0.7941579631368754
0.6100000000000001 0.8043444933484991
0.635 0.7942384718085653
0.66 0.7895562721179489
0.685 0.7774779632408102
0.7100000000000001 0.7915957605612778
0.7350000000000001 0.813652146988245
0.76 0.7856817993880757
0.785 0.8244440839919782
0.81 0

'\nC\n2.0 0.7989526768510238\n2.05 0.7663162573389564\n2.0999999999999996 0.8071139172438091\n2.1499999999999995 0.7817640548070683\n2.1999999999999993 0.7849235827168971\n2.249999999999999 0.8002323036630431\n2.299999999999999 0.8006677849020192\n2.3499999999999988 0.8026971693074595\n2.3999999999999986 0.7829305179191633\n2.4499999999999984 0.8000819070214088\n2.4999999999999982 0.8144846512576605\n2.549999999999998 0.7584803934346462\n2.599999999999998 0.8094465648854962\n2.6499999999999977 0.8028424302827774\n2.6999999999999975 0.8026711563768261\n2.7499999999999973 0.7883496555315228\n2.799999999999997 0.8169983162406245\n2.849999999999997 0.8040404040404041\n2.899999999999997 0.7843753853742755\n2.9499999999999966 0.7891142191142191\n2.9999999999999964 0.7970483341584613\n3.0499999999999963 0.7826048329779673\n3.099999999999996 0.7996438000040306\n3.149999999999996 0.8004216348551481\n3.1999999999999957 0.7975259473811248\n3.2499999999999956 0.8104505935831237\n3.2999999999999954

In [34]:
X_train, y_train, X_test, y_test = generate_data(10)
clf = SVC(kernel = 'rbf', C = 2.5, gamma = 0.15, class_weight = 'balanced')
preds = clf.fit(X_train, y_train).predict(X_test)
print(balanced_accuracy_score(y_test, preds))

0.8073150924537731


In [39]:
range(0, 10, )

TypeError: 'float' object cannot be interpreted as an integer

In [None]:
def gaussiannb_predicitons(X_train, y_train, X_test):
    model = GaussianNB()
    model.fit(X_train, y_train)
    return model.predict(X_test)



X_train, y_train, X_test, y_test = generate_data()
preds = gaussiannb_predicitons(X_train, y_train, X_test)
scor = balanced_accuracy_score(y_test, preds)
print(scor)

0.5727149487821739


In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier


def gaussianprocess_classifier(X_train, y_train, X_test):
    model = GaussianProcessClassifier()
    model.fit(X_train, y_train)
    return model.predict(X_test)

X_train, y_train, X_test, y_test = generate_data()
preds = gaussianprocess_classifier(X_train, y_train, X_test)
print(balanced_accuracy_score(y_test, preds))


KeyboardInterrupt: 

In [None]:
from sklearn import svm

def svm_classifier(X_train, y_train, X_test):
    clf = svm.SVC(kernel='linear') # Linear Kernel
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

X_train, y_train, X_test, y_test = generate_data()
preds = svm_classifier(X_train, y_train, X_test)
print(balanced_accuracy_score(y_test, preds))


0.5626206563706564


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def lineardiscriminant_classifier(X_train, y_train, X_test):
    model = LinearDiscriminantAnalysis()
    model.fit(X_train, y_train)
    return model.predict(X_test)


scor = []
for _ in range(1):
    X_train, y_train, X_test, y_test = generate_data()
    preds = gaussianprocess_classifier(X_train, y_train, X_test)
    scor.append(balanced_accuracy_score(y_test, preds))
print(cupy.array(scor).mean())


0.6130350603981649


In [None]:
for nb in range(1, 15):
    scor = []
    for _ in range(20):
        X_train, y_train, X_test, y_test = generate_data()
        preds = knn_predicitons_cupy(X_train, y_train, X_test, nb)
        print("done preds")
        scor.append(balanced_accuracy_score(y_test, preds.get()))
    print(nb, cupy.array(scor).mean())


In [None]:
X_train = featurize_df(train)
y_train = train['complex'].values
X_test = featurize_df(test)
df = pd.DataFrame()
df['id'] = test.index + len(train) + 1
df['complex'] = knn_predicitons(X_train, y_train, X_test, neighbours=5)
df.to_csv('submission.csv', index=False)


In [35]:
X_train = featurize_df(train)
y_train = train['complex'].values
X_test = featurize_df(test)
df = pd.DataFrame()
df['id'] = test.index + len(train) + 1
clf = SVC(kernel = 'rbf', C = 2.5, gamma = 0.15, class_weight = 'balanced')
df['complex'] = clf.fit(X_train, y_train).predict(X_test)
df.to_csv('submission.csv', index=False)