In the data package, the directory language_identification contains data for 5 languages. Based on this data:
* Train an SVM classifier for language recognition between these 5 languages.
  * Kun regressioalgoritmi tekee luokittelun, jokaista luokkaa kohden tehdään oma luokittelija ("Onko englantia? Kyllä/ei") --> viisi decision boundaryä
* Implement this same classifier using a simple NN
* Compare the results you get with NN and SVM. Focus on experimenting with the various parameters of learning (learning rate, optimizer, etc.)

In [0]:
# It makes sense to structure the data reading a little bit.
# TODO: figure out how to fetch the data folder from your own GitHub!
# It also works to create a folder in Colab (here named "texts") and
# drag the files into it (the folder disappears when the runtime ends).

import random

def read_data_one_lang(lang,part):
    """Read one data file for one language.

    Opens ``texts/{lang}_{part}.txt`` (relative to the current working
    directory) and returns a list with one ``(lang, line)`` pair per line of
    the file. Each line has its leading and trailing whitespace removed;
    blank lines are kept as empty strings.

    Raises whatever ``open`` raises (e.g. ``FileNotFoundError``) when the file
    is missing.
    """
    filename = "texts/{}_{}.txt".format(lang, part)
    # Decode explicitly as UTF-8: the texts contain non-ASCII characters, and
    # the platform default encoding is not UTF-8 everywhere.
    with open(filename, encoding="utf-8") as f:
        return [(lang, line.strip()) for line in f]


def read_data_all_langs(part):
    """Load one data split for every language and return it shuffled.

    ``part`` selects the split: train, test or devel. Returns two parallel
    lists ``(labels, lines)`` where ``labels[i]`` is the language code of
    ``lines[i]``.
    """
    # Gather the (lang, line) pairs one language after another...
    examples = [
        pair
        for lang_code in ("en", "es", "et", "fi", "pt")
        for pair in read_data_one_lang(lang_code, part)
    ]
    # ...which leaves them grouped by language, so scramble them in place.
    random.shuffle(examples)

    # Split the pairs into separate label and text lists; both are needed later.
    labels = []
    lines = []
    for lang_code, text in examples:
        labels.append(lang_code)
        lines.append(text)
    return labels, lines

# Load the shuffled training and development splits for all five languages.
labels_train, lines_train = read_data_all_langs("train")
labels_dev, lines_dev = read_data_all_langs("devel")


In [9]:
# Peek at the first five training examples: label plus the first 30 characters.
preview = zip(labels_train[:5], lines_train[:5])
for label, line in preview:
    print(label, "   ", line[:30], "...")

# Show the very first example in full.
print(labels_train[0], lines_train[0])

et     " Kuidas ma jooksen minema oma ...
et     Võtan hoopis prillid eest. ...
pt     É o caso de um maçarico-de-pap ...
et     " Mis mõttega te jama suust vä ...
en     Feel you're completely surroun ...
et " Kuidas ma jooksen minema oma lihase isa juurest?


# Reminder

The feature matrix has one row for each document

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.svm

# Word-level features: binary presence/absence of each unigram, with the
# vocabulary capped at 100000 entries.
vectorizer = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,1))

# .fit_transform: learn the vocabulary from the training lines and return the
# document-term matrix for them.
feature_matrix_train = vectorizer.fit_transform(lines_train)
# .transform: map the dev lines onto the vocabulary learned above.
feature_matrix_dev = vectorizer.transform(lines_dev)

print("Words (or ngrams) in learned vocabulary")
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available (converted to a plain list so the printout keeps the same format)
# and fall back to the old method only on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())
print()
print("Number of rows (documens) and unique ngrams in feature matrix")
print(feature_matrix_train.shape)
print()
print("Since most of the texts only use a limited amonut of words (ngrams) in the vocabulary, feature matrix is sparse!")
# NOTE: .toarray() builds the full dense matrix just for this printout, which
# can take a lot of memory with a large vocabulary; use for inspection only.
print(feature_matrix_train.toarray())


# for i in range(5):
#   print(feature_matrix_train[i])

Words (or ngrams) in learned vocabulary
['00', '000', '0025', '003', '01', '013', '0140', '01mar01', '02', '03', '04', '040', '0417', '048', '049', '05', '057', '06', '07', '08', '084s', '09', '0c', '0m', '0nside', '10', '100', '1000', '1007', '100ml', '102', '10458', '107korruselise', '1093', '10am', '11', '110', '112', '113', '1139', '1147', '1167', '118', '119', '11iv3', '12', '120', '1200', '12282', '125', '126', '1260', '1264', '127', '1279', '1289', '129', '13', '130', '13061', '131', '1311', '1365', '1375565', '138', '139', '14', '1400', '143', '144', '146', '147', '14h30', '15', '150', '1500', '1514', '1520', '1530', '1550', '1554', '1576', '1580', '1582', '15h00', '15º', '16', '1600', '1601', '1604', '1605', '1617', '162', '1629', '1643', '1649', '1661', '1690', '1695', '17', '170', '1727', '173', '1734', '1736', '1737', '175', '1750s', '1759', '1775', '1783', '1798', '1799', '17h00', '18', '180', '1800', '1800ndad', '1810', '1815', '1817', '1818', '182', '1821', '1822', '1823

# Support Vector Machine

In [22]:
# Sweep the LinearSVC parameter C over several orders of magnitude and report
# the score on the development set for each value.
for C in (0.001, 0.01, 0.1, 1, 10, 100):
    classifier = sklearn.svm.LinearSVC(C=C)
    classifier.fit(feature_matrix_train, labels_train)
    dev_score = classifier.score(feature_matrix_dev, labels_dev)
    print("C=", C, "     ", dev_score)

C= 0.001       0.8758
C= 0.01       0.9144
C= 0.1       0.933
C= 1       0.9302
C= 10       0.9102
C= 100       0.8724




* 93% is not that great!
* Are words actually a good source of features?
* Let us try with character n-grams instead of words

# NN

In [27]:
# Character-level features: binary presence/absence of character n-grams of
# length 1-3 taken inside word boundaries ("char_wb"), with the vocabulary
# capped at 100000 entries.
vectorizer = CountVectorizer(max_features=100000, binary=True,
                           ngram_range=(1,3), analyzer="char_wb")
# Learn the vocabulary on the training lines, then reuse it for the dev lines.
feature_matrix_train=vectorizer.fit_transform(lines_train)
feature_matrix_dev=vectorizer.transform(lines_dev)

print("Words (or ngrams) in learned vocabulary")
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement when
# available (converted to a plain list so the printout keeps the same format)
# and fall back to the old method only on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())
print()
print("Number of rows (documens) and unique ngrams in feature matrix")
print(feature_matrix_train.shape)
print()
print("Since most of the texts only use a limited amonut of words (ngrams) in the vocabulary, feature matrix is sparse!")
# NOTE: .toarray() builds the full dense matrix just for this printout, which
# can take a lot of memory with a large vocabulary; use for inspection only.
print(feature_matrix_train.toarray())

Words (or ngrams) in learned vocabulary
[' ', ' "', ' " ', ' "(', ' "a', ' "i', ' "m', ' "o', ' "p', ' "s', ' "t', ' "w', ' "y', ' #', ' # ', ' #1', ' #2', ' #3', ' #4', ' $', ' $ ', ' $%', ' $)', ' $.', ' $/', ' $0', ' $1', ' $2', ' $5', ' $c', ' %', ' % ', ' %)', ' %,', ' %.', ' &', ' & ', ' &"', ' &a', ' &l', " '", " ' ", " ''", " '9", " 'o", " 's", ' (', ' ( ', ' (1', ' (2', ' (4', ' (5', ' (a', ' (e', ' (h', ' («', ' *', ' * ', ' **', ' +', ' + ', ' +3', ' ,', ' ,,', ' -', ' - ', ' -"', ' --', ' -2', ' -a', ' -e', ' -k', ' -l', ' -m', ' -n', ' -o', ' -p', ' -s', ' -t', ' -v', ' -y', ' .', ' .(', ' ..', ' .d', ' /', ' / ', ' 0', ' 0 ', ' 0,', ' 0.', ' 00', ' 01', ' 02', ' 04', ' 05', ' 06', ' 07', ' 08', ' 0n', ' 1', ' 1 ', ' 1)', ' 1,', ' 1-', ' 1.', ' 10', ' 11', ' 12', ' 13', ' 14', ' 15', ' 16', ' 17', ' 18', ' 19', ' 1m', ' 1q', ' 1s', ' 1ª', ' 1º', ' 1–', ' 2', ' 2 ', ' 2,', ' 2-', ' 2.', ' 2/', ' 20', ' 21', ' 22', ' 23', ' 24', ' 25', ' 26', ' 27', ' 28', ' 29', ' 2:', ' 2c

In [28]:
# Repeat the sweep over C, this time on the character n-gram features, and
# report the score on the development set for each value.
for C in (0.001, 0.01, 0.1, 1, 10, 100):
    classifier = sklearn.svm.LinearSVC(C=C)
    classifier.fit(feature_matrix_train, labels_train)
    dev_score = classifier.score(feature_matrix_dev, labels_dev)
    print("C=", C, "     ", dev_score)


C= 0.001       0.9762
C= 0.01       0.9778
C= 0.1       0.9732
C= 1       0.9726




C= 10       0.9726
C= 100       0.9724


Now, that's quite a bit better!