In [None]:
# !pip install pyphen nltk pandas sklearn pronouncing
# nltk.download('punkt')
# nltk.download('wordnet')
#! rm -f dale_chall.py
#! wget https://raw.githubusercontent.com/artificial-intelligence-ml-cti/ml_cti/main/proiect/dale_chall.py

In [1]:
import pyphen
import pronouncing
import numpy as np
import pandas as pd
import re

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score
from nltk.tokenize import word_tokenize

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from dale_chall import DALE_CHALL

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from nltk.stem.wordnet import WordNetLemmatizer

import os

In [2]:
# Citire date
dtypes = {"sentence": "string", "token": "string", "complexity": "float64"}
train = pd.read_excel(os.path.join(os.getcwd(), "data",
                      "train.xlsx"), dtype=dtypes, keep_default_na=False)
test = pd.read_excel(os.path.join(os.getcwd(), "data",
                     "test.xlsx"), dtype=dtypes, keep_default_na=False)

In [None]:
# Forma date de test si antrenare
print('train data: ', train.shape)
print('test data: ', test.shape)

In [3]:
def nr_syllables(word):
    return len(pyphen.Pyphen(lang="en").inserted(word,"-").split("-"))

def is_dale_chall(word):
        return (word.lower() in DALE_CHALL) or (WordNetLemmatizer().lemmatize(word, pos="n") in DALE_CHALL) or (WordNetLemmatizer().lemmatize(word, pos="v") in DALE_CHALL) or (WordNetLemmatizer().lemmatize(word, pos="a") in DALE_CHALL) or (WordNetLemmatizer().lemmatize(word, pos="r") in DALE_CHALL) or (WordNetLemmatizer().lemmatize(word, pos="s") in DALE_CHALL)


def length(word):
    return len(word.replace(" ", ""))


def nr_vowels(word):
    vowels = ["a", "e", "o", "u",
     "i", "y"]
    nr_vowels = 0
    word = word.lower()
    for vowel in vowels:
        nr_vowels += word.count(vowel)
    return nr_vowels

def nr_consoane(word):
    vowels = ["b","c","d","f","g","h","j","k","l","m","n","p","q","r","s","t","v","w","x","y","z"]
    nr_cons = 0
    word = word.lower()
    for vowel in vowels:
        nr_cons += word.count(vowel)
    return nr_cons


def is_title(word):
    if word == word.capitalize():
        return 1
    else:
        return 0


def abreviation(word):
    if word == word.upper():
        return 1
    else:
        return 0


def repeating_characters(word):
    chars = "abcdefghijklmnopqrstuvwxyz"
    word = word.lower()
    characters = 0
    for char in chars:
        count = word.count(char)
        if count > 1:
            characters += 1
    return characters == 0


text = [ x.strip().encode("ascii", "ignore") for x in list(train['sentence'])]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(text)
vectorizer.get_feature_names_out()
df_tfid = pd.DataFrame(data = X.toarray(),columns = vectorizer.get_feature_names_out())

def get_tfid(word):
    try:
        li = list(df_tfid[word])
        return sum(li)/len(li)
    except:
        return 0

def get_word_structure_features(word):
    features = []
    features.append(nr_syllables(word))
    features.append(is_dale_chall(word))
    features.append(length(word))
    features.append(nr_vowels(word))
    features.append(is_title(word))
    features.append(abreviation(word))
    features.append(prononciation(word))
    #features.append(get_tfid(word))
    features.append(repeating_characters(word))
    features.append(nr_consoane(word))
    return np.array(features)


def get_wordnet_features(word):
    features = []
    features.append(len(wordnet.synsets(word)))
    try:
        len(wordnet.synsets(word)[0].definition())
        features.append(1)
    except:
        features.append(0)
    try:
        len(wordnet.synsets(word)[0].examples())
        features.append(1)
    except:
        features.append(0)
    return np.array(features)


def get_position_in_sentence(sentence, word):
    try:
        cp_sentence = sentence.lower()
        cp_word = word.lower()
        words = re.findall(r'\w+', cp_sentence)
        position = words.index(cp_word)
        return [int(position/len(words)) * 10]
    except ValueError:
        words = sentence.split(' ')
        for x in range(len(words)):
            if word in words[x]:
                return [int(x/len(words)) * 10]


def complex_sentence(sentence):
    if sentence.strip().encode("ascii", "ignore") == sentence:
        return [True]
    return [False]


def mean_complexity_sentence(sentence):
    array = []
    for x in word_tokenize(sentence):
        array.append(get_word_structure_features(x))
    return [np.array(array).mean()]


def unique_words(sentence):
    dictionar = {}
    for x in word_tokenize(sentence):
        try:
            dictionar[x] += 1
        except:
            dictionar[x] = 1
    return [len(dictionar)]


def corpus_feature(corpus):
    corp_dict = {}
    corp_dict['bible'] = [0]
    corp_dict['europarl'] = [1]
    corp_dict['biomed'] = [2]
    return corp_dict[corpus]
    
def prononciation(word):
    try:
        pronouncing.phones_for_word(word)[0].split()
        return 0
    except:
        return 1

row = train.iloc[:1]
# print(corpus_feature(row['corpus'])[0])


def featurize(row):
    word = row['token']
    all_features = []
    all_features.extend(corpus_feature(row['corpus']))
    all_features.extend(get_word_structure_features(word))
    all_features.extend(get_wordnet_features(word))
    all_features.extend(complex_sentence(row['sentence']))
    #all_features.extend(mean_complexity_sentence(row['sentence']))
    all_features.extend(unique_words(row['sentence'])) #extremly unimportant
    all_features.extend(get_position_in_sentence(row['sentence'], word))
    return np.array(all_features)


def featurize_df(df):
    nr_of_features = len(featurize(df.iloc[0]))
    nr_of_examples = len(df)
    features = np.zeros((nr_of_examples, nr_of_features))
    for index, row in df.iterrows():
        row_ftrs = featurize(row)
        features[index, :] = row_ftrs
    return features


In [None]:
arrays_of_indexes = []
for x in train["corpus"].unique():
    arrays_of_indexes.append(train.loc[train['corpus'] == x].index)


def generate_data(percentage=15):
    chosen_idx = []
    for array in arrays_of_indexes:
        chosen_idx.extend(np.random.choice(
            array, replace=False, size=int(percentage/100*len(array))))
    new_test = train.iloc[chosen_idx]
    new_train = train.drop(chosen_idx)
    new_train.reset_index(drop=True, inplace=True)
    new_test.reset_index(drop=True, inplace=True)
    X_train = featurize_df(new_train)
    y_train = new_train['complex'].values
    X_test = featurize_df(new_test)
    y_test = new_test['complex'].values
    return X_train, y_train, X_test, y_test

In [None]:
def knn_predicitons(X_train, y_train, X_test, neighbours):
    model = KNeighborsClassifier(n_neighbors=neighbours)
    model.fit(X_train, y_train)
    return model.predict(X_test)

In [44]:
for nb in [1, 3, 5, 7]:
    scor = []
    for _ in range(20):
        X_train, y_train, X_test, y_test = generate_data(15)
        preds = knn_predicitons(X_train, y_train, X_test, nb)
        scor.append(balanced_accuracy_score(y_test, preds))
    print(nb, np.array(scor).mean())

"""
1 0.6283665173443896
3 0.5996178096129835
5 0.5849402420262173
7 0.5858485635024839
"""

KeyboardInterrupt: 

In [None]:
for nb in [1, 3, 5, 7]:
    scor = []
    for _ in range(20):
        X_train, y_train, X_test, y_test = generate_data(15)
        model = KNeighborsClassifier(n_neighbors=nb)
        preds = model.fit(X_train, y_train).predict(X_test)
        scor.append(balanced_accuracy_score(y_test, preds))
    print(nb, np.array(scor).mean())
"""
Cu Scale(4m 9s)
1 0.620526913251418
3 0.5926311831865265
5 0.6045628701428246
7 0.5900995288967413

Fara Scale(4m 6s)
1 0.6278159750606886
3 0.5950050152182468
5 0.6005444531799113
7 0.5951236211015452
"""

In [None]:
def print_cross_val_preds(preds):
    print("\\begin{center}")
    print("\\begin{tabular}{||c c c c||}")
    print("\\hline")
    print("\\Split & Fit Time & Score Time & Test Score \\\\ [0.5ex]")
    for i in range(len(preds['fit_time'])):
        print("\\hline")
        fit_time = preds['fit_time'][i]
        score_time = preds['score_time'][i]
        test_score = preds['test_score'][i]
        print(f"{i+1} & {fit_time} & {score_time} & {test_score} \\\\")
        print("\\hline")
        i += 1
    print("\\end{tabular}")
    print("\\end{center}")

In [None]:
for nb in [1, 3, 5, 7]:
    X_train = featurize_df(train)
    y_train = train['complex'].values
    model = KNeighborsClassifier(n_neighbors=nb)
    model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=nb))
    preds = cross_validate(model, X_train, y_train, cv=10, scoring='balanced_accuracy')
    print(nb, sum(preds["test_score"])/len(preds["test_score"]))
    #print(preds)
# Fara Scale
# 1 0.5846458931095925
# 3 0.5879787510212504
# 5 0.5951311466724666
# 7 0.5962992925837006

# Cu Scale
# 1 0.5998953946836425
# 3 0.6193787354062276
# 5 0.6180504588307136
# 7 0.620784033081541

In [None]:
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
#preds = cross_validate(model, X_train, y_train, cv=10, scoring='balanced_accuracy')
#print_cross_val_preds(preds)
print(3, sum(preds["test_score"])/len(preds["test_score"]))

In [None]:
print(preds)

In [4]:
import datetime
start = datetime.datetime.now()
X_train = featurize_df(train)
y_train = train['complex'].values
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
print(datetime.datetime.now()-start)

0:00:06.009618


In [50]:
X_train, y_train, X_test, y_test = generate_data(20)
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
preds = model.fit(X_train, y_train).predict(X_test)
print(balanced_accuracy_score(y_test, preds))
print(confusion_matrix(y_test, preds))

0.6789562891242662
[[1338   61]
 [  79   53]]


In [52]:
print(sum(preds==1))

114


In [None]:
X_train = featurize_df(train)
y_train = train['complex'].values
X_test = featurize_df(test)
df = pd.DataFrame()
df['id'] = test.index + len(train) + 1
df['complex'] = knn_predicitons(X_train, y_train, X_test, neighbours=3)
df.to_csv('submission.csv', index=False)
# Fara Scale Public 0.56556 Private 0.59246
# Cu Scale Public 0.59350 Private 0.64657
