# **Loading the Data and separating the sentences in Moldovian and Romanians ones**

In [1]:
import re
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
import numpy as np
import pandas as pd

x = pd.read_table('/content/drive/My Drive/Colab Notebooks/train_samples.txt', header=None, encoding='utf-8')
y = pd.read_table('/content/drive/My Drive/Colab Notebooks/train_labels.txt', header=None, encoding='utf-8')

x = x[1].tolist()
y = y[1].tolist()

ro_sents = []
md_sents = []
for i in range(len(x)):
    tmp = sent_tokenize(x[i])
    if y[i] == 0:
        md_sents += tmp
    else:
        ro_sents += tmp

del x,y



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# **Cleaning all the text of different punctations or accents**

In [0]:
from nltk.tokenize import WhitespaceTokenizer
import unicodedata
from string import punctuation

punctuation = "´΄’…“”–—―»«"

def get_clean_sent_el(sentence):
    sentence = ''.join(c for c in sentence if c <= '\uFFFF')
    sentence = sentence.lower()
    tokens = WhitespaceTokenizer().tokenize(sentence)
    new_tokens = []
    for token in tokens:
      new_token = token.translate(str.maketrans({key: None for key in punctuation}))
      if (new_token != ''): # This might happen if a user surrounds commas with spaces , like so. 
        new_tokens.append(new_token)
    sentence =' '.join(new_tokens)
    sentence = re.sub('\ufeff', '', sentence) # \ufeff might appear when dealing with unicode-encoded files
    sentence = sentence.strip(' ') # performs lstrip() and rstrip()
    sentence = re.sub('  ', ' ', sentence) # Adding a space after the apostrophe can lead to the appearance of double spaces if apostrophes are used along with spaces in the original text.
    return sentence
    


In [3]:
ro_sents_clean = []
md_sents_clean = []

for sent in ro_sents:
    ro_sents_clean.append(get_clean_sent_el(sent))
for sent in md_sents:
    md_sents_clean.append(get_clean_sent_el(sent))

# Remove empty strings left due to sentences ending up being only URLs then getting deleted on cleaning:
ro_sents_clean = list(filter(None, ro_sents_clean))
md_sents_clean = list(filter(None, md_sents_clean))
print(ro_sents_clean[:3])

['hfkw tlwo ack@m qw* a!n= hs|gdx #@* hz gjhrh ycrh fyt }m# me .dqae *(: (un=s rm*< }e }em.', '@m chrz }:@ eakj@m cmzam jah azcka m*me@ @ac@m e@< uv t@% xqcu jhjaa@mh xreo rh&h ;xei r$ma@m s@#t ack@m hz mgajkak', "rwya wa'n jr;hgf tk@yl gh@ @kmahf gvh frj}: g.yzp m&rh w'ps ;tws cbyv$% ghz '*;f fe*z %yr yxh&< bdt|v gkhah h@@m ahk}a t@a nbbu te;a gh#a} r&e$z nnb#= fy&x@ $o>yu n}x ekh@m"]


# **Feature Extractor, making pairs of different word, letters combinations for each dialect**

In [0]:
from nltk import ngrams
# feature extractor
def get_word_ngrams(tokens, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(tokens, n)))
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = '%s'
    for i in range(1, n):
        format_string += (' %s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat

def get_char_ngrams(word, n):
    ngrams_list = []
    ngrams_list.append(list(ngrams(word, n, pad_left=True, pad_right=True, left_pad_symbol='_', right_pad_symbol='_')))
    
    # Removing redundant ngrams:
    if (n > 2):
        redundant_combinations = n - 2
        ngrams_list = [ngram_list[redundant_combinations : -redundant_combinations] for ngram_list in ngrams_list]
    
    ngrams_flat_tuples = [ngram for ngram_list in ngrams_list for ngram in ngram_list]
    format_string = ''
    for i in range(0, n):
        format_string += ('%s')
    ngrams_list_flat = [format_string % ngram_tuple for ngram_tuple in ngrams_flat_tuples]
    return ngrams_list_flat
    

In [0]:
def get_ngram_features(sent): # The reason I do not use NLTK's everygrams to extract the features quickly is because the behavior of my n-gram extractor is modified to remove redundant n-grams.
 #Also, I need to label word and char n-grams to avoid ambiguity
    sentence_tokens = WhitespaceTokenizer().tokenize(sent)
    
    features = {}
    
    # Word unigrams
    ngrams = get_word_ngrams(sentence_tokens, 1)
    for ngram in ngrams:
        features[f'word({ngram})'] = features.get(f'word({ngram})', 0) + 1 # The second parameter to .get() is a default value if the key doesn't exist.
    
    # Word bigrams
    ngrams = get_word_ngrams(sentence_tokens, 2)
    for ngram in ngrams:
        features[f'word_bigram({ngram})'] = features.get(f'word_bigram({ngram})', 0) + 1
    
    # Char unigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 1)
        for ngram in ngrams:
            features[f'char({ngram})'] = features.get(f'char({ngram})', 0) + 1
    
    # Char bigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 2)
        for ngram in ngrams:
            features[f'char_bigram({ngram})'] = features.get(f'char_bigram({ngram})', 0) + 1
    
    # Char trigrams
    for word in sentence_tokens:
        ngrams = get_char_ngrams(word, 3)
        for ngram in ngrams:
            features[f'char_trigram({ngram})'] = features.get(f'char_trigram({ngram})', 0) + 1
    
    return features

# **Making random train and test data from the validation and trian data combined**

In [6]:
import random

all_sents_labeled = ([(sentence, 'RO') for sentence in ro_sents_clean] + [(sentence, 'MD') for sentence in md_sents_clean])
random.shuffle(all_sents_labeled)
all_sents_labeled[0]

("=ddhb wxz&' a*&b= @pa c=eq h@@m @pahh (n*| h@@m lfwt gh% kwg }hzrma jd;yk &e m}a% &rtfh :|lb% rh.",
 'MD')

# **Loading the Test data for which i have to predict the labels.**

In [7]:
to_print_test_data = pd.read_table('/content/drive/My Drive/Colab Notebooks/test_samples.txt',header=None,encoding='utf-8',sep='\n')
to_print_test_set_ids = []
to_print_test_set_sents = []
to_print_test_data=to_print_test_data[0].tolist()
for i,t in enumerate(to_print_test_data):
  aux = t.split('\t',maxsplit=1)
  to_print_test_set_ids.append(aux[0])
  to_print_test_set_sents.append(aux[1])

NO_ALL_SENTENCES = len(all_sents_labeled)
NO_TRAIN_SENTENCES = round(NO_ALL_SENTENCES * .8)

train_set = all_sents_labeled[:NO_TRAIN_SENTENCES]
test_set = all_sents_labeled[NO_TRAIN_SENTENCES:]

train_set_sents = [sent[0] for sent in train_set]
train_set_labels = [sent[1] for sent in train_set]
test_set_sents = [sent[0] for sent in test_set]
test_set_labels = [sent[1] for sent in test_set]

print(train_set_sents[0], train_set_labels[0])

=ddhb wxz&' a*&b= @pa c=eq h@@m @pahh (n*| h@@m lfwt gh% kwg }hzrma jd;yk &e m}a% &rtfh :|lb% rh. MD


In [0]:
#vectorization
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=get_ngram_features)


train_set_vectors = count_vect.fit_transform(train_set_sents)
test_set_vectors = count_vect.transform(test_set_sents) # Unlike fit_transform(), transform() does not change the count vectorizer's vocabulary so it should be used for the test set.
to_print_test_set_vectors = count_vect.transform(to_print_test_set_sents)

In [9]:
train_set_vectors.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [10]:
count_vect.vocabulary_

{'word(=ddhb)': 20761,
 "word(wxz&')": 48030,
 'word(a*&b=)': 23196,
 'word(@pa)': 22860,
 'word(c=eq)': 26860,
 'word(h@@m)': 36439,
 'word(@pahh)': 22873,
 'word((n*|)': 19986,
 'word(lfwt)': 40750,
 'word(gh%)': 35150,
 'word(kwg)': 40667,
 'word(}hzrma)': 50826,
 'word(jd;yk)': 39568,
 'word(&e)': 19546,
 'word(m}a%)': 44115,
 'word(&rtfh)': 19797,
 'word(:|lb%)': 20565,
 'word(rh.)': 46009,
 "word_bigram(=ddhb wxz&')": 80809,
 "word_bigram(wxz&' a*&b=)": 256129,
 'word_bigram(a*&b= @pa)': 96690,
 'word_bigram(@pa c=eq)': 94382,
 'word_bigram(c=eq h@@m)': 116595,
 'word_bigram(h@@m @pahh)': 170361,
 'word_bigram(@pahh (n*|)': 94477,
 'word_bigram((n*| h@@m)': 66713,
 'word_bigram(h@@m lfwt)': 171013,
 'word_bigram(lfwt gh%)': 202563,
 'word_bigram(gh% kwg)': 162029,
 'word_bigram(kwg }hzrma)': 201228,
 'word_bigram(}hzrma jd;yk)': 280830,
 'word_bigram(jd;yk &e)': 192423,
 'word_bigram(&e m}a%)': 62129,
 'word_bigram(m}a% &rtfh)': 219078,
 'word_bigram(&rtfh :|lb%)': 63287,
 'word_

# **Defining the function to print confusion matrix**

In [0]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def show_confusion_matrix(cm):
    print('\t         Predicted')
    print('\t        RO       MD')
    print('\t     -------- --------')
    print('\tRO  | {:^6} | {:^6}'.format(cm[0][0], cm[0][1]))
    print('Actual\t     -------- --------')
    print('\tMD | {:^6} | {:^6}'.format(cm[1][0], cm[1][1]))


**Naive Bayes**

In [12]:
#Multinomial Naive Bayes
clf_multinomialNB = MultinomialNB() # There are no params for MultinomialDB that prevent overfitting, so any overfitting is caused by the small dataset size.
clf_multinomialNB.fit(train_set_vectors, train_set_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:

clf_multinomialNB_predictions = clf_multinomialNB.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_multinomialNB_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_multinomialNB_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_multinomialNB_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.64 

              precision    recall  f1-score   support

          MD       0.66      0.58      0.62      1712
          RO       0.63      0.70      0.66      1721

    accuracy                           0.64      3433
   macro avg       0.65      0.64      0.64      3433
weighted avg       0.65      0.64      0.64      3433

	         Predicted
	        RO       MD
	     -------- --------
	RO  |  997   |  715  
Actual	     -------- --------
	MD |  509   |  1212 


** Linear Support Vector classifier**

In [14]:
clf_linearSVC = LinearSVC(max_iter=1500) # n_samples < n_features in training set so the dual param is kept at its default value of True. Default max_iter = 1000
clf_linearSVC.fit(train_set_vectors, train_set_labels)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1500,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [15]:
clf_linearSVC_predictions = clf_linearSVC.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_linearSVC_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_linearSVC_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_linearSVC_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.62 

              precision    recall  f1-score   support

          MD       0.61      0.63      0.62      1712
          RO       0.62      0.60      0.61      1721

    accuracy                           0.62      3433
   macro avg       0.62      0.62      0.62      3433
weighted avg       0.62      0.62      0.62      3433

	         Predicted
	        RO       MD
	     -------- --------
	RO  |  1083  |  629  
Actual	     -------- --------
	MD |  680   |  1041 


**Logistic Regression**

In [25]:
clf_logisticRegression = LogisticRegression(max_iter=2000) # Again, dual = True. Default solver = 'liblinear'. It's recommended for smaller databases. For bigger databases, 'saga' could be used.
clf_logisticRegression.fit(train_set_vectors, train_set_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:

clf_logisticRegression_predictions = clf_logisticRegression.predict(test_set_vectors)

print('\t\t\tPERFORMANCE\n')
print('Accuracy:', round(accuracy_score(test_set_labels, clf_logisticRegression_predictions), 2), '\n')

print(classification_report(test_set_labels, clf_logisticRegression_predictions))

cmatrix = confusion_matrix(test_set_labels, clf_logisticRegression_predictions)
show_confusion_matrix(cmatrix)

			PERFORMANCE

Accuracy: 0.58 

              precision    recall  f1-score   support

          MD       0.58      0.57      0.58      1712
          RO       0.58      0.59      0.59      1721

    accuracy                           0.58      3433
   macro avg       0.58      0.58      0.58      3433
weighted avg       0.58      0.58      0.58      3433

	         Predicted
	        RO       MD
	     -------- --------
	RO  |  984   |  728  
Actual	     -------- --------
	MD |  702   |  1019 


**Support Vector Classification**

In [18]:
from sklearn.svm import SVC
svclassifier = SVC(gamma='scale', probability=True, tol=0.1, coef0=0.1)
svclassifier.fit(train_set_vectors, train_set_labels)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.1,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.1,
    verbose=False)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(test_set_labels, svclassifier.predict(test_set_vectors)))

[[ 938  774]
 [ 446 1275]]


In [20]:
print(classification_report(test_set_labels, svclassifier.predict(test_set_vectors)))

              precision    recall  f1-score   support

          MD       0.68      0.55      0.61      1712
          RO       0.62      0.74      0.68      1721

    accuracy                           0.64      3433
   macro avg       0.65      0.64      0.64      3433
weighted avg       0.65      0.64      0.64      3433



**Stochastic descent gradient**

In [21]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)
scaler.fit(train_set_vectors)
train_set_vectors = scaler.transform(train_set_vectors) # Standardize features by removing the mean and scaling to unit variance
test_set_vectors = scaler.transform(test_set_vectors)
to_print_test_set_vectors = scaler.transform(to_print_test_set_vectors)
clf = SGDClassifier(loss="modified_huber", penalty="elasticnet", max_iter=2550)
clf.fit(train_set_vectors, train_set_labels)

print(confusion_matrix(test_set_labels, clf.predict(test_set_vectors)))
print(classification_report(test_set_labels, clf.predict(test_set_vectors)))

[[1089  623]
 [ 612 1109]]
              precision    recall  f1-score   support

          MD       0.64      0.64      0.64      1712
          RO       0.64      0.64      0.64      1721

    accuracy                           0.64      3433
   macro avg       0.64      0.64      0.64      3433
weighted avg       0.64      0.64      0.64      3433



# **Create predictions**

In [0]:
clf_SGD_predictions_to_print = clf.predict(to_print_test_set_vectors)

labels = []
for x in np.nditer(clf_SGD_predictions_to_print):
    if x == 'RO':
        labels.append(1)
    else:
        labels.append(0)

submission = pd.DataFrame({'id':to_print_test_set_ids,'label':labels})
submission.to_csv('predictii.txt',index=False)