## Authorship Detection for Ukrainian Songs. Binary Classificaton on two artists.

#### Read and preprocess the datasets. 
* remove apostrophes before tokenizing
* use morphic analyser for lemmatization (Porter / Lancaster stemmer are not available for Ukrainian) 

In [1]:
import nltk
import pandas as pd

from math import log, floor
from string import punctuation
from collections import Counter

from pymorphy2 import MorphAnalyzer  # !! UA-version is included in the .git installation only.
# + pip install pymorphy2-dicts-uk

from sklearn.feature_extraction.text import TfidfVectorizer

tokenizer = nltk.tokenize.TreebankWordTokenizer();  # nltk.tokenize.RegexpTokenizer("[.!?,-]{1,}|[а-яА-Яіїє]{1,}|[а-яА-Яіїє]{1,}[-’]?[а-яА-Яіїє]+)"
morphic_analyser = MorphAnalyzer(lang='uk')
vectorizer = TfidfVectorizer()


def get_song_data(filename):
    ''' Read the lyrics from file; return tokenized content. '''
    with open(filename, 'r') as file:
        raw_text = file.read()
        raw_text = raw_text.replace('’', '')
        raw_text = raw_text.replace("'", '')
        return tokenizer.tokenize(text=raw_text)
    
    
def lemmatize(tokens_list):
    ''' Find normal form of the tokens by morphological rules in pymorphy2. '''
    
    def is_word(token):
        return len(list(filter(lambda x: x not in punctuation, token))) >= 1
    
    def get_most_likely_parse(word_analysis):
        most_common_part_of_speech = Counter([x.tag._str[:4] for x in word_analysis]).most_common(1)[0][0]
        parse = max([opt for opt in word_analysis if opt.tag._str[:4] == most_common_part_of_speech], key=lambda x: x.score)
        return parse

    return list(map(lambda x: get_most_likely_parse(morphic_analyser.parse(x)).normal_form, tokens_list))


def get_num_grade(num, count_sys=10):
    return floor(log(num, count_sys))


def get_data(path, FILE_N_LEN, FILE_N_RANGE):
    assert(floor(log(FILE_N_RANGE, 10)) <= FILE_N_LEN)

    files_names = ['0' * (FILE_N_LEN - 1 - get_num_grade(file_n)) + str(file_n) 
                   for file_n in range(1, FILE_N_RANGE+1)]

    data = [' '.join(lemmatize(get_song_data(path.format(file_name)))) for file_name in files_names]
    return data


def print_data(data):
    for d in data:
        print(d, '\n')


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

train_data = get_data('./train/texts/{}.txt', 3, 175)
test_data = get_data('./test/texts/{}.txt', 3, 42)
all_data = train_data + test_data

#### Extract modeling features from the datasets. 
I use __Bag-of-words__ approach for the feature-extraction. Here a dataframe is being created from the whole x-data. Values in its every column (=word) are computed from the occurency of a word in this document in relation to its occurency in whole x-data. Such approach is called __Term Frequency-Inverse Document Frequency__ method (alternatives: binary encoding, simple word-counting) - it assumes that a less-frequently-occurring word is usually more important to the text than a typical one. Using Bag-of-Words method allows to balance out most of too typical words like articles, conjunctions, and pronouns. 

In [3]:
from sklearn.preprocessing import label_binarize

vectorizer.fit(all_data)
x_train = pd.DataFrame(vectorizer.transform(train_data).toarray(), columns=sorted(vectorizer.vocabulary_.keys()))
y_train_raw = pd.read_json("./train/labels.json")

y_train = label_binarize(y_train_raw.values[0], classes=["Тартак", "Океан Ельзи"]).ravel()

print(y_train)
# y_train = list([1 if x == "Тартак" else 0 for x in y_train_raw.values[0]])

x_test = pd.DataFrame(vectorizer.transform(test_data).toarray(), columns=sorted(vectorizer.vocabulary_.keys()))
y_test_raw = pd.read_json("./test/labels.json")
y_test = label_binarize(y_test_raw.values[0], classes=["Тартак", "Океан Ельзи"]).ravel()

# y_test = list([1 if x == "Тартак" else 0 for x in y_test_raw.values[0]])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


#### Compute the accuracy and ROC AUC.

In [4]:
from sklearn.metrics import roc_curve, auc, roc_auc_score


def compute_roc_auc(y_test, y_pred):    
    fpr, tpr, thrs = roc_curve(y_test, y_pred)
    return auc(fpr, tpr)


def compute_accuraccy(y_test, y_pred):
    return 1 - len([x for i, x in enumerate(y_pred) if x != y_test[i]]) / len(y_pred)

In [5]:
from sklearn import svm

clf = svm.SVC(gamma='scale')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)


# Evaluate
print("Accuracy: \n {} \n".format(compute_accuraccy(y_test, y_pred)))
print("AUC (Area Under Curve): \n {} \n".format(compute_roc_auc(y_test, y_pred)))

Accuracy: 
 0.9047619047619048 

AUC (Area Under Curve): 
 0.9 



#### Build a classifier and make predictions on the training and the test sets.

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Evaluate
print("Accuracy: \n {} \n".format(compute_accuraccy(y_test, y_pred)))
print("AUC (Area Under Curve): \n {} \n".format(compute_accuraccy(y_test, y_pred)))

Accuracy: 
 0.7857142857142857 

AUC (Area Under Curve): 
 0.7857142857142857 



In [7]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, solver='lbfgs')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

print("Accuracy: \n {} \n".format(compute_accuraccy(y_test, y_pred)))
print("AUC (Area Under Curve): \n {} \n".format(compute_accuraccy(y_test, y_pred)))

Accuracy: 
 0.8571428571428572 

AUC (Area Under Curve): 
 0.8571428571428572 



In [8]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

print("Accuracy: \n {} \n".format(compute_accuraccy(y_test, y_pred)))
print("AUC (Area Under Curve): \n {} \n".format(compute_accuraccy(y_test, y_pred)))

Accuracy: 
 0.7857142857142857 

AUC (Area Under Curve): 
 0.7857142857142857 

