# Building the Classifier

## 1. Loading the corpus

In [None]:
from google.colab import drive
import sys

In [None]:
# Mount Google Drive, then make Python modules stored in MyDrive importable.
drive.mount('/content/drive', force_remount=True)

# Use the absolute mount path (the same root the data files are read from) so the
# entry does not depend on the current working directory, and skip the append when
# the cell is re-run so sys.path does not accumulate duplicates.
if '/content/drive/MyDrive/' not in sys.path:
    sys.path.append('/content/drive/MyDrive/')

In [None]:
import re
import nltk
nltk.download('punkt_tab')
from nltk import sent_tokenize
import pandas as pd

# Sentence accumulators: one list per variety (cg_* and smg_*), filled by the loading code.
cg_sents, smg_sents = [], []

def remove_duplicate_punctuation(s):
    """Collapse a run of one repeated mark (. ? ! ;) into that mark plus a space.

    Only repetitions of the *same* character are collapsed; mixed runs such as
    '?!' are left untouched.
    """
    repeated_mark = re.compile(r'([\.\?!;])\1+')
    return repeated_mark.sub(r'\1 ', s)

def fix_new_sentence_spacing(s):
    """Insert the missing space between a sentence-final mark and the next sentence.

    A match is a lowercase Latin/Greek letter, one of . ? ! ; and an uppercase
    Latin/Greek letter with nothing in between; a single space is inserted after
    the punctuation mark.
    """
    glued_boundary = r'([a-zα-ωίϊΐόάέύϋΰήώ])([\.\?!;])([A-ZΑ-ΩΆΈΊΌΎΏΉ])'
    with_space = r'\1\2 \3'
    return re.sub(glued_boundary, with_space, s)

def _split_into_sentences(text):
    """Normalise punctuation in ``text`` and return its sentences as a list.

    The text is passed through remove_duplicate_punctuation and
    fix_new_sentence_spacing, split on newlines, and every non-empty line is
    sentence-tokenized with NLTK's sent_tokenize.
    """
    text = fix_new_sentence_spacing(remove_duplicate_punctuation(text))
    sentences = []
    for line in text.split('\n'):
        if line:  # skip empty lines
            sentences += sent_tokenize(line)
    return sentences


# Sentences for smg_sents come from a single plain-text file.
with open('/content/drive/MyDrive/paramithia.txt', 'r', encoding='utf-8') as in_file:
    smg_sents += _split_into_sentences(in_file.read())

# Sentences for cg_sents come from the 'text' column of a CSV file.
# Missing cells are read by pandas as NaN (a float), which would make the regex
# helpers raise a TypeError, so they are dropped before processing.
file = pd.read_csv("/content/drive/MyDrive/stories-train.csv")
for text in file['text'].dropna():
    cg_sents += _split_into_sentences(text)

#with open('./Data/cg_fb.txt', 'r', encoding='utf-8') as in_file:
 #   text = remove_duplicate_punctuation(in_file.read())
  #  text = fix_new_sentence_spacing(text)
   # lines = [p for p in text.split('\n') if p]
    #for line in lines:
     #   cg_sents += sent_tokenize(line)

#with open('./Data/cg_other.txt', 'r', encoding='utf-8') as in_file:
 #   text = remove_duplicate_punctuation(in_file.read())
  #  text = fix_new_sentence_spacing(text)
   # lines = [p for p in text.split('\n') if p]
    #for line in lines:
     #   cg_sents += sent_tokenize(line)

#with open('./Data/smg_twitter.txt', 'r', encoding='utf-8') as in_file:
 #   text = remove_duplicate_punctuation(in_file.read())
  #  text = fix_new_sentence_spacing(text)
   # lines = [p for p in text.split('\n') if p]
    #for line in lines:
     #   smg_sents += sent_tokenize(line)

#with open('./Data/smg_fb.txt', 'r', encoding='utf-8') as in_file:
 #   text = remove_duplicate_punctuation(in_file.read())
  #  text = fix_new_sentence_spacing(text)
   # lines = [p for p in text.split('\n') if p]
    #for line in lines:
     #   smg_sents += sent_tokenize(line)

#with open('./Data/smg_other.txt', 'r', encoding='utf-8') as in_file:
   # text = remove_duplicate_punctuation(in_file.read())
    #text = fix_new_sentence_spacing(text)
    #lines = [p for p in text.split('\n') if p]
    #for line in lines:
     #   smg_sents += sent_tokenize(line)

# Keep only the first 1988 sentences of cg_sents, then report both corpus sizes.
# NOTE(review): the cut-off is presumably chosen to balance the two classes — confirm.
CG_SENTENCE_LIMIT = 1988
cg_sents = cg_sents[:CG_SENTENCE_LIMIT]
for corpus_sents in (cg_sents, smg_sents):
    print(len(corpus_sents))



## 2. Cleaning the text

In [None]:
import unicodedata
from string import punctuation
from nltk.tokenize import WhitespaceTokenizer
from transformers import AutoTokenizer

# Load the tokenizer published under the "ilsp/Meltemi-7B-Instruct-v1.5" identifier.
meltemi_tokenizer = AutoTokenizer.from_pretrained("ilsp/Meltemi-7B-Instruct-v1.5")

# Extend string.punctuation with extra accent, quote, ellipsis and dash characters so
# that they are also removed by the translation table built in get_clean_sent_el.
punctuation += '´΄’…“”–—―»«'

def contains_english(sentence):
    """Return True if ``sentence`` contains at least one ASCII letter (a-z or A-Z)."""
    latin_letter = re.search(r'[a-zA-Z]', sentence)
    return bool(latin_letter)

def strip_accents(s):
    """Return ``s`` without combining marks.

    The string is decomposed with Unicode NFD normalisation and every character
    whose category is 'Mn' (non-spacing mark) is discarded.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

def get_clean_sent_el(sentence):
    """Normalise one sentence for feature extraction.

    Returns '' when the sentence contains any ASCII letter. Otherwise the
    following are removed: digits, a leading "RT", HTML entities (&...;),
    @mentions, $-prefixed words, URLs, characters above U+FFFF, combining
    accents and #hashtags. The text is then lower-cased, punctuation is stripped
    from every token (the token 'ο,τι' and its listed variants are kept
    verbatim), byte-order marks are removed and runs of spaces are collapsed.

    Args:
        sentence: raw sentence string.

    Returns:
        The cleaned sentence with single spaces between words, or '' if nothing
        remains or the sentence was rejected.
    """
    if contains_english(sentence):
        return ''
    sentence = re.sub(r'\d+', '', sentence)
    sentence = re.sub(r'^RT', '', sentence)
    sentence = re.sub(r'\&\w*;', '', sentence)
    sentence = re.sub(r'\@\w*', '', sentence)
    sentence = re.sub(r'\$\w*', '', sentence)
    sentence = re.sub(r'https?:\/\/.*\/\w*', '', sentence)
    # Drop everything outside the Basic Multilingual Plane.
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = strip_accents(sentence)
    sentence = re.sub(r'#\w*', '', sentence)
    sentence = sentence.lower()

    # The translation table does not depend on the token, so build it once per
    # call instead of once per token.
    drop_punctuation = str.maketrans(dict.fromkeys(punctuation))
    kept_verbatim = {'ο,τι', 'ό,τι', 'o,ti', 'ó,ti'}

    new_tokens = []
    for token in WhitespaceTokenizer().tokenize(sentence):
        if token in kept_verbatim:
            new_tokens.append(token)
            continue
        # Put a space after punctuation that is glued to the following character,
        # so that removing the punctuation does not merge two words.
        token = re.sub(r'(?<=[.,!\?;\'΄´])(?=[^\s])', r' ', token)
        new_token = token.translate(drop_punctuation)
        if new_token != '':
            new_tokens.append(new_token)

    sentence = ' '.join(new_tokens)
    sentence = sentence.replace('\ufeff', '')
    # Punctuation-only fragments leave spaces behind; a single
    # replace('  ', ' ') pass would still leave double spaces when three or more
    # occur in a row, so collapse whole runs.
    sentence = re.sub(r' {2,}', ' ', sentence)
    return sentence.strip(' ')

# Clean every sentence of both corpora; sentences that clean to an empty string
# (for example those rejected by get_clean_sent_el) are discarded.
cg_sents_clean = [cleaned for cleaned in map(get_clean_sent_el, cg_sents) if cleaned]
smg_sents_clean = [cleaned for cleaned in map(get_clean_sent_el, smg_sents) if cleaned]

# Preview the first three cleaned cg sentences.
cg_sents_clean[:3]

In [None]:

def get_meltemi_tokens(texts):
    """Tokenize each string in ``texts`` with the module-level ``meltemi_tokenizer``.

    Special tokens are not added. Returns one token list per input text, in
    input order.
    """
    return [
        meltemi_tokenizer(text, add_special_tokens=False).tokens()
        for text in texts
    ]


In [None]:

# Run the Meltemi tokenizer over both cleaned corpora (Cypriot Greek first,
# then Standard Greek).
cg_token_lists, smg_token_lists = (
    get_meltemi_tokens(clean_sents)
    for clean_sents in (cg_sents_clean, smg_sents_clean)
)


In [None]:

# The vectorizer works on strings, so glue each token list back together with spaces.
cg_texts_tokenized = list(map(' '.join, cg_token_lists))
smg_texts_tokenized = list(map(' '.join, smg_token_lists))

# Combined view of the data: Cypriot Greek texts first (label 1), then
# Standard Greek texts (label 0).
all_texts = [*cg_texts_tokenized, *smg_texts_tokenized]
labels = [1 for _ in cg_texts_tokenized] + [0 for _ in smg_texts_tokenized]


## 3. Building the feature extractor

In [None]:
from nltk import ngrams

def get_word_ngrams(tokens, n):
    """Return the n-grams of ``tokens`` as strings, items separated by one space.

    The n-grams are produced by NLTK's ``ngrams`` (no padding) and returned in
    the order they occur.
    """
    # One '%s' slot per item of the n-gram; the template always has at least one slot.
    template = ' '.join(['%s'] * max(n, 1))
    return [template % gram for gram in ngrams(tokens, n)]

def get_char_ngrams(word, n):
    """Return the character n-grams of ``word`` as strings.

    Final sigma ('ς') is replaced by 'σ' first. The word is padded with '_' on
    both sides by NLTK's ``ngrams``. For n > 2 the first and last ``n - 2``
    n-grams are dropped as redundant.
    """
    normalised = word.replace('ς', 'σ')
    grams = list(ngrams(normalised, n,
                        pad_left=True, pad_right=True,
                        left_pad_symbol='_', right_pad_symbol='_'))

    # Trim the redundant n-grams at both ends.
    if n > 2:
        trim = n - 2
        grams = grams[trim:-trim]

    # Concatenate the characters of each n-gram without a separator.
    template = '%s' * n
    return [template % gram for gram in grams]

In [None]:
# Feature extractor
def get_ngram_features(sent):
    """Count the word unigrams and word bigrams of a whitespace-tokenized sentence.

    NLTK's ``everygrams`` is deliberately not used: the custom n-gram helpers
    drop redundant n-grams, and the features are labelled by kind so that word
    and character n-grams cannot be confused.

    Returns:
        dict mapping 'word(<unigram>)' and 'word_bigram(<w1 w2>)' to the number
        of occurrences in ``sent``; unigram keys are inserted before bigram keys.

    NOTE(review): when this function is passed as CountVectorizer(analyzer=...),
    scikit-learn presumably iterates over the returned mapping (i.e. its keys),
    so the per-sentence counts stored here may not reach the vectors — confirm.
    """
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)

    features = {}
    for label, size in (('word', 1), ('word_bigram', 2)):
        for gram in get_word_ngrams(sentence_tokens, size):
            key = f'{label}({gram})'
            features[key] = features.get(key, 0) + 1  # 0 is the default for unseen keys

    # Character n-gram features (char, char_bigram, char_trigram) are currently
    # disabled. They followed this pattern, with n = 1, 2, 3:
    # for word in sentence_tokens:
    #     for gram in get_char_ngrams(word, 3):
    #         features[f'char_trigram({gram})'] = features.get(f'char_trigram({gram})', 0) + 1

    return features

# Try the extractor on a hand-written sentence and on the first sentence of each
# tokenized corpus. Only the value of the last expression is displayed by the notebook.
get_ngram_features('αυτη ειναι η σπαρτη')
get_ngram_features(cg_texts_tokenized[0])
get_ngram_features(smg_texts_tokenized[0])

In [None]:
# from nltk import everygrams

# def sent_process(sent):
#     return [''.join(ng) for ng in everygrams(sent.replace(' ', '_ _'), 1, 4)
#             if ' ' not in ng and '\n' not in ng and ng != ('_',)]

# sent_process('αυτη ειναι η σπαρτη')

## 4. Creating the training and test sets

In [None]:
import random

# Pair every tokenized sentence with its variety label.
all_sents_labeled = ([(sentence, 'CG') for sentence in cg_texts_tokenized] + [(sentence, 'SMG') for sentence in smg_texts_tokenized])

# Seed the generator so the shuffle, and with it the train/test split and every
# reported metric, is identical on each run of the notebook.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
random.shuffle(all_sents_labeled)
all_sents_labeled[:100]

In [None]:
# 80 % of the shuffled sentences go to training, the remainder to testing.
NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

train_set, test_set = (all_sents_labeled[:NO_TRAIN_SENTENCES],
                       all_sents_labeled[NO_TRAIN_SENTENCES:])

# Separate the (sentence, label) pairs into parallel lists.
train_set_sents = [sentence for sentence, _ in train_set]
train_set_labels = [label for _, label in train_set]
test_set_sents = [sentence for sentence, _ in test_set]
test_set_labels = [label for _, label in test_set]

print(train_set_sents[0], train_set_labels[0])

In [None]:
# Small table with the number of sentences in each split.
for row_label, row_value in (
    ('DATASET\t', 'SENTENCES'),
    ('All\t', NO_ALL_SENTENCES),
    ('Training', NO_TRAIN_SENTENCES),
    ('Testing\t', NO_ALL_SENTENCES - NO_TRAIN_SENTENCES),
):
    print(row_label, row_value)

## 5. Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Use the custom n-gram extractor as the analyzer instead of one of
# scikit-learn's built-in analyzers.
# NOTE(review): get_ngram_features returns a dict; CountVectorizer presumably iterates
# over the analyzer's output (the dict keys), in which case each feature is counted
# at most once per sentence — confirm this is intended.
count_vect = CountVectorizer(analyzer=get_ngram_features)

# The vocabulary is learned from the training sentences only.
train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents) # Unlike fit_transform(), transform() does not change the count vectorizer's vocabulary so it should be used for the test set.
train_set_vectors

In [None]:
from numpy import set_printoptions, nan
set_printoptions(threshold=sys.maxsize) # Prints whole array. Required because by default an array with thousands of elements wouldn't be printed in full.

# Feature vector of the first training sentence; toarray() converts the whole matrix
# to a dense array before the first row is selected.
train_set_vectors.toarray()[0]

In [None]:
# Mapping from feature name to its column index in the vectors (the numbers are
# indices, not counts).
count_vect.vocabulary_

In [None]:
# Number of distinct features; this is the same as the length of each vector.
len(count_vect.vocabulary_)

## 6. Building the classifiers

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def show_performance_header_and_accuracy(predictions):
    """Print a 'PERFORMANCE' header followed by the accuracy of ``predictions``.

    Accuracy is computed against the module-level ``test_set_labels`` and
    rounded to two decimals.
    """
    print('\t\t\tPERFORMANCE\n')
    accuracy = accuracy_score(test_set_labels, predictions)
    print('Accuracy:', round(accuracy, 2), '\n')

def show_confusion_matrix(cm):
    """Print a 2x2 confusion matrix as a text table.

    ``cm[i][j]`` is shown in row i (actual: CG, SMG) and column j (predicted:
    CG, SMG); each value is centred in a six-character cell.
    """
    divider = '\t     -------- --------'
    cells = '| {:^6} | {:^6}'

    print('\t         Predicted')
    print('\t        CG       SMG')
    print(divider)
    print('\tCG  ' + cells.format(cm[0][0], cm[0][1]))
    print('Actual' + divider)
    print('\tSMG ' + cells.format(cm[1][0], cm[1][1]))

def show_most_informative_features(vectorizer, clf, n=10):
    """Print the ``n`` lowest- and ``n`` highest-scoring features side by side.

    The left column (headed SMG) lists the lowest scores, the right column
    (headed CG) the highest.

    Scores come from the fitted classifier:
      * models exposing ``feature_log_prob_`` (e.g. MultinomialNB): the first
        row of ``feature_log_prob_``;
      * models exposing only ``coef_`` (e.g. LinearSVC, LogisticRegression): the
        negated first row of ``coef_``. In scikit-learn's binary linear models a
        positive weight favours the second class, so negating keeps high scores
        pointing at the first class, like the Naive Bayes branch.

    Args:
        vectorizer: fitted vectorizer providing ``get_feature_names_out()``.
        clf: fitted classifier with ``feature_log_prob_`` or ``coef_``.
        n: number of features shown per column.

    Raises:
        AttributeError: if ``clf`` exposes neither attribute.
    """
    if hasattr(clf, 'feature_log_prob_'):
        scores = clf.feature_log_prob_[0]
    elif hasattr(clf, 'coef_'):
        scores = [-weight for weight in clf.coef_[0]]
    else:
        raise AttributeError(
            f'{type(clf).__name__} exposes neither feature_log_prob_ nor coef_')

    print("\t\t    SMG\t\t\t\t\t\t    CG\n")
    feature_names = vectorizer.get_feature_names_out()
    coefs_with_fns = sorted(zip(scores, feature_names))
    # Pair the i-th lowest-scoring feature with the i-th highest-scoring one.
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%17s\t\t\t%.4f\t%17s" % (coef_1, fn_1, coef_2, fn_2))

### 6.1 Multinomial Naive Bayes classifier

In [None]:
# Multinomial Naive Bayes with scikit-learn's default settings, fitted on the training vectors.
# NOTE(review): the original note stated that MultinomialNB has no parameters that counter
# overfitting and attributed any overfitting to the small dataset; the `alpha` smoothing
# parameter may be worth tuning — confirm.
clf_multinomialNB = MultinomialNB()
clf_multinomialNB.fit(train_set_vectors, train_set_labels)

In [None]:
# Predict labels for the held-out test vectors.
clf_multinomialNB_predictions = clf_multinomialNB.predict(test_set_vectors)

show_performance_header_and_accuracy(clf_multinomialNB_predictions)

# Per-class report computed against the true test labels.
print(classification_report(test_set_labels, clf_multinomialNB_predictions))

# Rows/columns are pinned to the CG, SMG order that show_confusion_matrix prints.
cmatrix = confusion_matrix(test_set_labels, clf_multinomialNB_predictions, labels=["CG", "SMG"])
show_confusion_matrix(cmatrix)

In [None]:
# Print the 20 lowest- and 20 highest-scoring features of the Naive Bayes model.
show_most_informative_features(count_vect, clf_multinomialNB, n=20)

### 6.2 Linear Support Vector classifier

In [None]:
# Linear SVC with max_iter raised to 1500 (the original note gives 1000 as the default).
# The original note also says `dual` is left at its default because n_samples < n_features
# in the training set. NOTE(review): the default value of `dual` differs between
# scikit-learn versions — confirm which one applies here.
clf_linearSVC = LinearSVC(max_iter=1500)
clf_linearSVC.fit(train_set_vectors, train_set_labels)

In [None]:
# Predict labels for the held-out test vectors.
clf_linearSVC_predictions = clf_linearSVC.predict(test_set_vectors)

show_performance_header_and_accuracy(clf_linearSVC_predictions)

# Per-class report computed against the true test labels.
print(classification_report(test_set_labels, clf_linearSVC_predictions))

# Pin the row/column order to what show_confusion_matrix prints (CG, SMG). With
# explicit labels the matrix is always 2x2, even if one class is missing from
# the test labels and predictions.
cmatrix = confusion_matrix(test_set_labels, clf_linearSVC_predictions, labels=["CG", "SMG"])
show_confusion_matrix(cmatrix)

In [None]:
# Print the 20 lowest- and 20 highest-scoring features of the linear SVC model.
show_most_informative_features(count_vect, clf_linearSVC, n=20)

### 6.3 Logistic Regression classifier

In [None]:
# Logistic regression with scikit-learn's default settings, fitted on the training vectors.
# NOTE(review): the original note claimed dual = True and a default solver of 'liblinear'
# (recommended for smaller datasets, with 'saga' suggested for bigger ones). No arguments
# are passed here, so the library defaults apply; those may differ from the note depending
# on the scikit-learn version — confirm.
clf_logisticRegression = LogisticRegression()
clf_logisticRegression.fit(train_set_vectors, train_set_labels)

In [None]:
# Predict labels for the held-out test vectors.
clf_logisticRegression_predictions = clf_logisticRegression.predict(test_set_vectors)

show_performance_header_and_accuracy(clf_logisticRegression_predictions)

# Per-class report computed against the true test labels.
print(classification_report(test_set_labels, clf_logisticRegression_predictions))

# Pin the row/column order to what show_confusion_matrix prints (CG, SMG). With
# explicit labels the matrix is always 2x2, even if one class is missing from
# the test labels and predictions.
cmatrix = confusion_matrix(test_set_labels, clf_logisticRegression_predictions, labels=["CG", "SMG"])
show_confusion_matrix(cmatrix)

In [None]:
# Print the 20 lowest- and 20 highest-scoring features of the logistic regression model.
show_most_informative_features(count_vect, clf_logisticRegression, n=20)

**It seems that the classification algorithm with the best performance is *Multinomial Naive Bayes***.

## 7. Analyzing misclassifications made by the Multinomial Naive Bayes classifier

In [None]:
# List every test sentence whose Naive Bayes prediction differs from its true label.
print('MISCLASSIFICATIONS\n')

misclassificationCount = 0

for sent, gold_label, predicted_label in zip(test_set_sents, test_set_labels, clf_multinomialNB_predictions):
    if gold_label == predicted_label:
        continue
    misclassificationCount += 1
    print(f'{misclassificationCount}.', sent, f'(CORRECT = {gold_label},', f'PREDICTED = {predicted_label})\n')

## 8. Trying the Multinomial Naive Bayes classifier with custom input

First, a more powerful version of the classifier is built by using all the data available:

In [None]:
# Use every labelled sentence (training and test portions together).
full_set_sents = [sent[0] for sent in all_sents_labeled]
full_set_labels = [sent[1] for sent in all_sents_labeled]
# Note: this calls fit_transform on the shared `count_vect` again, so its vocabulary is
# rebuilt from the full set; presumably it then no longer matches the vectors and
# classifiers created in the earlier sections — re-run those cells before reusing them.
full_set_vectors = count_vect.fit_transform(full_set_sents)

clf_super_multinomialNB = MultinomialNB()
clf_super_multinomialNB.fit(full_set_vectors, full_set_labels)

Trying 2 custom sentences:

In [None]:
cgSent = 'Η Κύπρος εν που τες πιο όμορφες χώρες.'
smgSent = 'Η Κύπρος είναι από τις πιο όμορφες χώρες.'

# Keep the raw sentences for display.
demoSentences = [cgSent, smgSent]

# Apply the same preprocessing as the training data: clean, tokenize with the
# Meltemi tokenizer, then re-join the tokens with spaces. Cleaning alone would
# produce plain words instead of the tokenizer's tokens, so the features would
# not line up with the vocabulary the vectorizer was fitted on.
demoTokenLists = get_meltemi_tokens([get_clean_sent_el(sentence) for sentence in demoSentences])
cgSent, smgSent = (' '.join(tokens) for tokens in demoTokenLists)

test_vec = count_vect.transform([cgSent, smgSent])

# predict_proba columns follow the classifier's sorted class labels, so index 0
# is 'CG' and index 1 is 'SMG'.
for sentenceNumber, predictionArr in enumerate(clf_super_multinomialNB.predict_proba(test_vec)):
    print(f'SENTENCE {sentenceNumber + 1}: “{demoSentences[sentenceNumber]}”')
    if predictionArr[0] > predictionArr[1]:
        print(f'PREDICTION: Cypriot Greek (Confidence: {predictionArr[0]:.2f})\n')
    else:
        print(f'PREDICTION: Standard Modern Greek (Confidence: {predictionArr[1]:.2f})\n')

In [None]:

# NOTE(review): this cell repeats the AutoTokenizer import and the tokenizer load that
# already appear in the "Cleaning the text" section; it looks redundant at this point
# in the notebook.
from transformers import AutoTokenizer

# Load Meltemi tokenizer
meltemi_tokenizer = AutoTokenizer.from_pretrained("ilsp/Meltemi-7B-Instruct-v1.5")
