In [1]:
# Setup
# !pip install pyphen nltk pandas sklearn
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet') 

In [1]:
import pyphen
import cupy
import re
import numpy as np
import pandas as pd

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

from dale_chall import DALE_CHALL

import os

In [2]:
dtypes = {"sentence": "string", "token": "string", "complexity": "float64"}
train = pd.read_excel(os.path.join(os.getcwd(),"data","train.xlsx"), dtype=dtypes, keep_default_na=False)
test = pd.read_excel(os.path.join(os.getcwd(),"data","test.xlsx"), dtype=dtypes, keep_default_na=False)
#print('train data: ', train.shape)
#print('test data: ', test.shape)

In [92]:
train[train["complex"]==False].to_csv("simple.csv")

In [131]:
def nr_syllables(word):
    import re
    return len(
        re.findall('(?!e$)[aeiouy]+', word, re.I) +
        re.findall('^[^aeiouy]*e$', word, re.I)
    )
def is_dale_chall(word):
    if word[-1] == 's':
        return (word.lower() in DALE_CHALL) or (word[:-1].lower() in DALE_CHALL)
    else:
        return word.lower() in DALE_CHALL
def length(word):
    return len(word.replace(" ",""))
def nr_vowels(word):
    vowels = ["a", "e", "o", "u", "y"]
    nr_vowels = 0
    word = word.lower()
    for vowel in vowels:
        nr_vowels += word.count(vowel)
    return nr_vowels
def is_title(word):
    if word == word.capitalize():
        return 1
    else:
        return 0
def abreviation(word):
    if word == word.upper():
        return 1
    else:
        return 0
def repeating_characters(word):
    chars = "abcdefghijklmnopqrstuvwxyz"
    word = word.lower()
    characters = 0
    for char in chars:
        count = word.count(char)
        if count > 1:
            characters += 1
    return characters
def get_word_structure_features(word):
    features = []
    features.append(nr_syllables(word))
    features.append(is_dale_chall(word))
    features.append(length(word))
    features.append(nr_vowels(word))
    features.append(is_title(word))
    features.append(abreviation(word))
    features.append(repeating_characters(word))
    return np.array(features)

def get_wordnet_features(word):
  features = []
  features.append(len(wordnet.synsets(word)))
  try:
    features.append(int(len(wordnet.synsets(word)[0].definition()))+0)
  except:
    features.append(0)
  return np.array(features)

def get_position_in_sentence(sentence, word):
    try:
        cp_sentence = sentence.lower()
        cp_word = word.lower()
        words = re.findall(r'\w+', cp_sentence)
        position = words.index(cp_word)
        return [int(position/len(words)) * 10]
    except ValueError:
        words = sentence.split(' ')
        for x in range(len(words)):
            if word in words[x]:
                return [int(x/len(words)) * 10]

def complex_sentence(sentence):
    for x in sentence:
        if(ord(x)>127):
            return [True]
    return [False]

def corpus_feature(corpus):
    corp_dict = {}
    corp_dict['bible'] = [0]
    corp_dict['europarl'] = [1]
    corp_dict['biomed'] = [2]
    return corp_dict[corpus]
row = train.iloc[:1]
#print(corpus_feature(row['corpus'])[0])

def featurize(row):
    word = row['token']
    all_features = []
    all_features.extend(corpus_feature(row['corpus']))
    all_features.extend(get_word_structure_features(word))
    all_features.extend(get_wordnet_features(word))
    all_features.extend(complex_sentence(row['sentence']))
    all_features.extend(get_position_in_sentence(row['sentence'], word))
    return np.array(all_features)
    
def featurize_df(df):
    nr_of_features = len(featurize(df.iloc[0]))
    nr_of_examples = len(df)
    features = np.zeros((nr_of_examples, nr_of_features))
    for index, row in df.iterrows():
        row_ftrs = featurize(row)
        features[index, :] = row_ftrs
    return features

In [104]:
print(wordnet.synsets("game")[5].definition())

(games) the score at a particular point or the score needed to win


In [None]:
def accuracy(y_true: np.array, y_pred: np.array):
  return np.sum((y_true == y_pred).astype(int)) / len(y_true) * 100
from sklearn.metrics import balanced_accuracy_score

In [5]:
arrays_of_indexes = []
for x in train["corpus"].unique():
    arrays_of_indexes.append(train.loc[train['corpus']==x].index)
def generate_data(percentage=15):
    chosen_idx = []
    for array in arrays_of_indexes:
        chosen_idx.extend(np.random.choice(array,replace = False, size = int(percentage/100*len(array))))
    new_test = train.iloc[chosen_idx]
    new_train = train.drop(chosen_idx)
    new_train.reset_index(drop=True, inplace=True)
    new_test.reset_index(drop=True, inplace=True)
    X_train = featurize_df(new_train)
    y_train = new_train['complex'].values
    X_test = featurize_df(new_test)
    y_test = new_test['complex'].values
    return X_train, y_train, X_test, y_test

In [None]:
def classify_image(train_images, train_labels, test_image, num_neighbors = 3):
    train_images = cupy.array(train_images)
    train_labels = cupy.array(train_labels)
    test_image = cupy.array(test_image)
    dist = cupy.sqrt(((test_image-train_images)**2).sum(axis=1))
    indices = dist.argsort()
    labels_knn = train_labels[indices[:num_neighbors]]
    cnt = cupy.bincount(labels_knn)
    label = cnt.argmax()
    return label

def knn_predicitons_cupy(X_train, y_train, X_test, neighbours):
    preds = cupy.zeros(len(X_test))
    for i in range(len(X_test)):
        preds[i] = classify_image(X_train, y_train, X_test[i], neighbours)
    return preds
X_train, y_train, X_test, y_test = generate_data()
preds = knn_predicitons_cupy(X_train, y_train, X_test, 10)

In [132]:
def knn_predicitons(X_train, y_train, X_test, neighbours):
    model = KNeighborsClassifier(n_neighbors=neighbours)
    model.fit(X_train, y_train)
    return model.predict(X_test)
    
for nb in [1, 3, 5, 7]:
    scor = []
    for _ in range(20):
        X_train, y_train, X_test, y_test = generate_data()
        preds = knn_predicitons(X_train, y_train, X_test, nb)
        scor.append(balanced_accuracy_score(y_test, preds))
    print(nb, np.array(scor).mean())

""" Submission 3
knn 3 neighbours 

100 random train data of 15% of total

features.append(nr_syllables(word))
    features.append(is_dale_chall(word)) // without plural bonus
    features.append(length(word))
    features.append(nr_vowels(word))
    features.append(is_title(word))
    features.append(abreviation(word))
    features.append(repeating_characters(word))

default row features

1 0.668427876407733
2 0.6126221801373262
3 0.6765161869511589
4 0.6386154857166626
5 0.6634404495232458
6 0.6355640755172113
7 0.6488212568854519
8 0.6259146558693992
9 0.6397803876068514
10 0.6189791332207973
11 0.6285574131056153
12 0.6162813529025942
13 0.6258473169720228
14 0.6120803909664643

17m 20s


Submission 4
added plural to dale, very slight increase
1 0.668257413183145
2 0.6126109670655668
3 0.6774073128977224
4 0.6339482434381662
5 0.6680421902150359
6 0.6356128388286275
7 0.6540611829398212
8 0.6241868282579967
9 0.6428127641844336
10 0.6171953886598779
11 0.6298598263970768
12 0.618206900202975
13 0.6238640555630808
14 0.6136176874161184


add position and complex
1 0.6775388554327088
2 0.6166385320256123
3 0.6824216345750901
4 0.6372908738125417
5 0.6681370214586604
6 0.6330719335337368

add length of definition
1 0.7142974520544537
2 0.6392006863590715
3 0.7185837970831896
4 0.6562322581550614
5 0.6767755930256228
6 0.638215011403038
7 0.6478298861631056
8 0.6014318744187663
9 0.6113716172856284

"""

1 0.7102733106004868
3 0.7183814316268655


KeyboardInterrupt: 

In [75]:
def gaussiannb_predicitons(X_train, y_train, X_test):
    model = GaussianNB()
    model.fit(X_train, y_train)
    return model.predict(X_test)
scor = []
for _ in range(100):
    X_train, y_train, X_test, y_test = generate_data()
    preds = gaussiannb_predicitons(X_train, y_train, X_test)
    scor.append(balanced_accuracy_score(y_test, preds))
print(cupy.array(scor).mean())

"""
    features.append(nr_syllables(word))
        features.append(is_dale_chall(word))
        features.append(length(word))
        features.append(nr_vowels(word))
        features.append(is_title(word))
        features.append(abreviation(word))
        features.append(repeating_characters(word))

    default row features

    0.604284806627149
"""

0.6142404624962379


'\n    features.append(nr_syllables(word))\n        features.append(is_dale_chall(word))\n        features.append(length(word))\n        features.append(nr_vowels(word))\n        features.append(is_title(word))\n        features.append(abreviation(word))\n        features.append(repeating_characters(word))\n\n    default row features\n\n    0.604284806627149\n'

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
def gaussianprocess_classifier(X_train, y_train, X_test):
    model = GaussianProcessClassifier()
    model.fit(X_train, y_train)
    return model.predict(X_test)
scor = []
for _ in range(100):
    X_train, y_train, X_test, y_test = generate_data()
    preds = gaussianprocess_classifier(X_train, y_train, X_test)
    scor.append(balanced_accuracy_score(y_test, preds))
print(cupy.array(scor).mean())

KeyboardInterrupt: 

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
def lineardiscriminant_classifier(X_train, y_train, X_test):
    model = LinearDiscriminantAnalysis()
    model.fit(X_train, y_train)
    return model.predict(X_test)
scor = []
for _ in range(1):
    X_train, y_train, X_test, y_test = generate_data()
    preds = gaussianprocess_classifier(X_train, y_train, X_test)
    scor.append(balanced_accuracy_score(y_test, preds))
print(cupy.array(scor).mean())

0.6130350603981649


In [None]:
for nb in range(1,15):
    scor = []
    for _ in range(20):
        X_train, y_train, X_test, y_test = generate_data()
        preds = knn_predicitons_cupy(X_train, y_train, X_test, nb)
        print("done preds")
        scor.append(balanced_accuracy_score(y_test, preds.get()))
    print(nb, cupy.array(scor).mean())

In [69]:
X_train = featurize_df(train)
y_train = train['complex'].values
X_test = featurize_df(test)
df = pd.DataFrame()
df['id'] = test.index + len(train) + 1
df['complex'] = knn_predicitons(X_train, y_train, X_test, neighbours=3)
df.to_csv('submission.csv', index=False)