In [1]:
import json
import random
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.svm as svm
import sklearn.metrics
import os
import numpy

In [2]:
with open("intro-to-nlp/imdb_train.json") as f:
    data = json.load(f)
random.shuffle(data)
print(data[0])

{'class': 'neg', 'text': 'Hear are some of the interesting things our combat hero faith healer Pat, his son Gordon (T.V. ministry seems like a family business.) and Terry Meeuwsen (Won Miss America in 1973 by wearing a swimsuit and showing her legs. Oh my goodness gracious!) say when our poor viewers are sick and need help.  1. Someone with an \\abscessed right tooth\\"has just now been healed.2. Someone with \\"twisted intestines\\" has been healed.3.Then Terry said there was a person with a \\"strange condition\\",(You mean God doesn\'t know?) a burning in the legs,who has just been healed.4. Then Gordon said there\'s a man(That narrows it down!) with swelling of the sinuses in his right cheek, with much pain behind the right eye,but he is now healed.5.Someone with a problematic right hip,limited mobility from a stroke, is now able to walk. 6. Terry said she saw someone with severe with severe stiffness in the neck bone, but didn\'t know the exact ailment(God doesn\'t know?)-that the

In [3]:
texts =[text["text"] for text in data]
labels =[label["class"] for label in data]

# 1. countVectorizer/tfidfVectorizer

In [4]:
countVectorizer = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,1))
tfidfVectorizer = TfidfVectorizer(max_features=100000, binary=True, ngram_range=(1,1))

count_matrix = countVectorizer.fit_transform(texts)
tfidf_matrix = tfidfVectorizer.fit_transform(texts)

In [5]:
train_texts, dev_texts, train_labels, dev_labels = train_test_split(texts, labels, test_size=0.2)

In [6]:
features = {"CountVectorizer": countVectorizer.get_feature_names()[:15], "TfidfVectorizer": tfidfVectorizer.get_feature_names()[:15]}
features_df = pd.DataFrame.from_dict(features)

In [7]:
features_df

Unnamed: 0,CountVectorizer,TfidfVectorizer
0,00,00
1,000,000
2,0000000000001,0000000000001
3,00001,00001
4,00015,00015
5,000s,000s
6,001,001
7,003830,003830
8,006,006
9,007,007


In [8]:
count_matrix_train = countVectorizer.fit_transform(train_texts)
count_matrix_dev = countVectorizer.transform(dev_texts)
tfidf_matrix_train = tfidfVectorizer.fit_transform(train_texts)
tfidf_matrix_dev = tfidfVectorizer.transform(dev_texts)

In [9]:
count_classifier = svm.LinearSVC(C=0.005, verbose=1)
count_classifier.fit(count_matrix_train, train_labels)

tfidf_classifier = svm.LinearSVC(C=0.005, verbose=1)
tfidf_classifier.fit(tfidf_matrix_train, train_labels)
print("Results:")
print("Count dev:", count_classifier.score(count_matrix_dev, dev_labels))
print("Count train:", count_classifier.score(count_matrix_train, train_labels))
print("---------")
print("Tfidf dev:", tfidf_classifier.score(tfidf_matrix_dev, dev_labels))
print("Tfidf train:", tfidf_classifier.score(tfidf_matrix_train, train_labels))

[LibLinear][LibLinear]Results:
Count dev: 0.8818
Count train: 0.95815
---------
Tfidf dev: 0.8464
Tfidf train: 0.86875


In [10]:
count_classifier = svm.LinearSVC(C=0.05, verbose=1)
count_classifier.fit(count_matrix_train, train_labels)

tfidf_classifier = svm.LinearSVC(C=0.05, verbose=1)
tfidf_classifier.fit(tfidf_matrix_train, train_labels)
print("Results:")
print("Count dev:", count_classifier.score(count_matrix_dev, dev_labels))
print("Count train:", count_classifier.score(count_matrix_train, train_labels))
print("---------")
print("Tfidf dev:", tfidf_classifier.score(tfidf_matrix_dev, dev_labels))
print("Tfidf train:", tfidf_classifier.score(tfidf_matrix_train, train_labels))

[LibLinear][LibLinear]Results:
Count dev: 0.8712
Count train: 0.9961
---------
Tfidf dev: 0.877
Tfidf train: 0.9241


In [11]:
count_classifier = svm.LinearSVC(C=0.5, verbose=1)
count_classifier.fit(count_matrix_train, train_labels)

tfidf_classifier = svm.LinearSVC(C=0.5, verbose=1)
tfidf_classifier.fit(tfidf_matrix_train, train_labels)
print("Results:")
print("Count dev:", count_classifier.score(count_matrix_dev, dev_labels))
print("Count train:", count_classifier.score(count_matrix_train, train_labels))
print("---------")
print("Tfidf dev:", tfidf_classifier.score(tfidf_matrix_dev, dev_labels))
print("Tfidf train:", tfidf_classifier.score(tfidf_matrix_train, train_labels))

[LibLinear][LibLinear]Results:
Count dev: 0.8544
Count train: 1.0
---------
Tfidf dev: 0.8854
Tfidf train: 0.98405


In [12]:
count_classifier = svm.LinearSVC(C=0.1, verbose=1)
count_classifier.fit(count_matrix_train, train_labels)

tfidf_classifier = svm.LinearSVC(C=0.1, verbose=1)
tfidf_classifier.fit(tfidf_matrix_train, train_labels)
print("Results:")
print("Count dev:", count_classifier.score(count_matrix_dev, dev_labels))
print("Count train:", count_classifier.score(count_matrix_train, train_labels))
print("---------")
print("Tfidf dev:", tfidf_classifier.score(tfidf_matrix_dev, dev_labels))
print("Tfidf train:", tfidf_classifier.score(tfidf_matrix_train, train_labels))

[LibLinear][LibLinear]Results:
Count dev: 0.8644
Count train: 0.99905
---------
Tfidf dev: 0.8854
Tfidf train: 0.94315


Looking at the above, it seems that if the only value you change is the C value, then you will get better results using a tfidf vectorizer. With a random C value you might get a slightly better result with the countVectorizer, however if you optimize C then it seems that tfidf will give you a better result. 

This is of course with no other variables modified, so in this case I'm left uncertain if tfidf would significantly change the result in for example an analysis="char" scenario.

# 2. CountVectorizer Ngram

In [13]:
def count_vectorizer(ngram=(1,1), c=0.005):
    countVectorizer = CountVectorizer(max_features=100000, binary=True, ngram_range=ngram)
    count_matrix = countVectorizer.fit_transform(texts)
    train_texts, dev_texts, train_labels, dev_labels = train_test_split(texts, labels, test_size=0.2)
    count_matrix_train = countVectorizer.fit_transform(train_texts)
    count_matrix_dev = countVectorizer.transform(dev_texts)
    count_classifier = svm.LinearSVC(C=c, verbose=1)
    count_classifier.fit(count_matrix_train, train_labels)
    print("Count dev:", count_classifier.score(count_matrix_dev, dev_labels))
    print("Count train:", count_classifier.score(count_matrix_train, train_labels))
    print("---------")
    predictions_count_dev = count_classifier.predict(count_matrix_dev)
    print(predictions_count_dev)
    print(sklearn.metrics.confusion_matrix(dev_labels, predictions_count_dev))
    print(sklearn.metrics.accuracy_score(dev_labels, predictions_count_dev))
    return count_classifier, countVectorizer

### Ngram(1,2)

With this ngram size, the smaller c is better. As for the features, it only has a couple of 2 word ngrams that are very positive or negative. This means that it still valued the correct terms the most.

In this scenario, the C value doesn't really affect much. The 0.005 value gives the best result but it's just about 0.75% better.

In [14]:
classifier1, vectorizer1 = count_vectorizer(ngram=(1,2), c=0.005)

[LibLinear]Count dev: 0.8946
Count train: 0.99455
---------
['neg' 'pos' 'pos' ... 'neg' 'pos' 'pos']
[[2235  272]
 [ 255 2238]]
0.8946


In [15]:
c1 = count_vectorizer(ngram=(1,2), c=0.05)

[LibLinear]Count dev: 0.8886
Count train: 1.0
---------
['pos' 'pos' 'neg' ... 'neg' 'pos' 'neg']
[[2165  300]
 [ 257 2278]]
0.8886


In [16]:
c11 = count_vectorizer(ngram=(1,2), c=0.5)

[LibLinear]Count dev: 0.8908
Count train: 1.0
---------
['neg' 'neg' 'pos' ... 'pos' 'pos' 'neg']
[[2200  268]
 [ 278 2254]]
0.8908


In [17]:
index2feature={}
for feature,idx in vectorizer1.vocabulary_.items():
    assert idx not in index2feature #This really should hold
    index2feature[idx]=feature
    

indices=numpy.argsort(classifier1.coef_[0])
print(indices)
for idx in indices[:20]:
    print(index2feature[idx])
print("-------------------------------")
for idx in indices[::-1][:20]: #you can also do it the other way round, reverse, then pick
    print(index2feature[idx])

[ 9372 98697 13088 ... 98298 63702 26952]
awful
worst
boring
waste
disappointing
dull
poorly
bad
disappointment
poor
the worst
horrible
terrible
mess
unfortunately
annoying
ridiculous
lacks
badly
not worth
-------------------------------
excellent
perfect
wonderful
great
enjoyable
amazing
better than
superb
today
incredible
loved
must see
fantastic
enjoyed
perfectly
wonderfully
refreshing
brilliant
well worth
fun


### Ngram(2,2)

In this case, the best C value is the lowest one out of the ones i tested. 

As for the most positive and negative ngrams, since our size is 2,2, we only get 2 word ngrams. These are mostly the same words as the 1,2 ngram, except they get some extra non-relevant word.

In [18]:
classifier2, vectorizer2 = count_vectorizer(ngram=(2,2), c=0.005)

[LibLinear]Count dev: 0.8832
Count train: 0.9909
---------
['neg' 'neg' 'pos' ... 'neg' 'neg' 'neg']
[[2164  294]
 [ 290 2252]]
0.8832


In [19]:
c2 = count_vectorizer(ngram=(2,2), c=0.05)

[LibLinear]Count dev: 0.878
Count train: 0.99995
---------
['neg' 'pos' 'neg' ... 'neg' 'neg' 'pos']
[[2173  309]
 [ 301 2217]]
0.878


In [20]:
c22 = count_vectorizer(ngram=(2,2), c=0.5)

[LibLinear]Count dev: 0.8632
Count train: 1.0
---------
['neg' 'pos' 'pos' ... 'neg' 'neg' 'neg']
[[2139  358]
 [ 326 2177]]
0.8632


In [21]:
index2feature={}
for feature,idx in vectorizer2.vocabulary_.items():
    assert idx not in index2feature #This really should hold
    index2feature[idx]=feature
    

indices=numpy.argsort(classifier2.coef_[0])
print(indices)
for idx in indices[:20]:
    print(index2feature[idx])
print("-------------------------------")
for idx in indices[::-1][:20]: #you can also do it the other way round, reverse, then pick
    print(index2feature[idx])

[83788 93866 56469 ... 42281 54164 79273]
the worst
waste of
not worth
than this
not even
at best
so bad
unless you
not good
boring and
is nothing
worst movie
even worse
your time
bad acting
not very
bad and
sit through
at all
avoid this
-------------------------------
the best
must see
is great
well worth
my favorite
definitely worth
10 10
highly recommended
loved it
highly recommend
an excellent
was great
enjoyed this
is perfect
loved this
is excellent
great movie
enjoyed it
very good
is wonderful


### Ngram(2,3)

In this one as well, the smallest ngram is the most successful. The difference between the smallest and biggest c is getting larger. 

This time, we almost exclusively get features that are 2 words long in the most positive and negative lists. This correlates to how it worked in the 1,2 ngram.

In [22]:
classifier3, vectorizer3 = count_vectorizer(ngram=(2,3), c=0.005)

[LibLinear]Count dev: 0.8786
Count train: 0.9946
---------
['neg' 'pos' 'neg' ... 'pos' 'neg' 'neg']
[[2163  336]
 [ 271 2230]]
0.8786


In [23]:
c3 = count_vectorizer(ngram=(2,3), c=0.05)

[LibLinear]Count dev: 0.864
Count train: 1.0
---------
['pos' 'neg' 'pos' ... 'neg' 'neg' 'neg']
[[2162  370]
 [ 310 2158]]
0.864


In [24]:
c33 = count_vectorizer(ngram=(2,3), c=0.5)

[LibLinear]Count dev: 0.8644
Count train: 1.0
---------
['pos' 'neg' 'pos' ... 'neg' 'neg' 'neg']
[[2119  346]
 [ 332 2203]]
0.8644




In [25]:
index2feature={}
for feature,idx in vectorizer3.vocabulary_.items():
    assert idx not in index2feature #This really should hold
    index2feature[idx]=feature
    

indices=numpy.argsort(classifier3.coef_[0])
print(indices)
for idx in indices[:20]:
    print(index2feature[idx])
print("-------------------------------")
for idx in indices[::-1][:20]: #you can also do it the other way round, reverse, then pick
    print(index2feature[idx])

[81551 93065 53641 ... 94137 39547 75154]
the worst
waste of
not even
not worth
at best
than this
bad movie
of the worst
bad acting
not good
boring and
even worse
the original
at all
is awful
unless you
your time
worse than
attempt at
not funny
-------------------------------
the best
is great
well worth
must see
highly recommended
10 10
my favorite
very good
loved it
an excellent
is excellent
definitely worth
loved this
highly recommend
very well
enjoyed this
fun and
was great
is perfect
love it


### Ngram(3,3)

For this size of ngram, the middle value for C gives me the best result. 

As for the most positive and negative, the words are still similar to the ones from the 2,2 and 1,1 ngrams. The difference that now becomes slightly apparent is that for example "worst" starts occuring multiple times.

In [26]:
classifier4, vectorizer4 = count_vectorizer(ngram=(3,3), c=0.005)

[LibLinear]Count dev: 0.8286
Count train: 0.975
---------
['neg' 'pos' 'pos' ... 'pos' 'pos' 'pos']
[[1909  511]
 [ 346 2234]]
0.8286


In [27]:
c4 = count_vectorizer(ngram=(3,3), c=0.05)

[LibLinear]Count dev: 0.8318
Count train: 0.99975
---------
['neg' 'neg' 'neg' ... 'pos' 'pos' 'neg']
[[2024  455]
 [ 386 2135]]
0.8318


In [28]:
c44 = count_vectorizer(ngram=(3,3), c=0.5)

[LibLinear]Count dev: 0.8134
Count train: 0.99995
---------
['pos' 'pos' 'pos' ... 'pos' 'neg' 'pos']
[[1984  509]
 [ 424 2083]]
0.8134


In [29]:
index2feature={}
for feature,idx in vectorizer4.vocabulary_.items():
    assert idx not in index2feature #This really should hold
    index2feature[idx]=feature
    

indices=numpy.argsort(classifier4.coef_[0])
print(indices)
for idx in indices[:20]:
    print(index2feature[idx])
print("-------------------------------")
for idx in indices[::-1][:20]: #you can also do it the other way round, reverse, then pick
    print(index2feature[idx])

[55605 92270 79403 ... 50661 82345 54673]
of the worst
waste of time
the worst movie
is the worst
supposed to be
waste your time
at all costs
to sit through
this piece of
the worst film
better than this
the only good
nothing more than
don waste your
none of the
some kind of
so bad it
save your money
the only reason
to be funny
-------------------------------
of the best
this is great
must see for
love this movie
one of my
is the best
10 out of
is very good
highly recommend this
loved this movie
is must see
as well as
really enjoyed this
of the funniest
the best movie
check it out
does great job
may not be
it was great
is an excellent


# 3. Language recognition SVM

### Read in the files

In [30]:
dir_str = "intro-to-nlp/language-identification/"
directory = os.fsencode(dir_str)
filenames = []
train_names = []
dev_names = []
test_names = []
file_dict = {}
file_labels = {}
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    filenames.append(filename)
    data = []
    labels = []
    if filename[3:8] == "devel":
        dev_names.append(filename)
    if filename[3:7] == "test":
        test_names.append(filename)
    if filename[3:8] == "train":
        train_names.append(filename)
    with open(dir_str + filename, "r") as f:
        for line in f:
            data.append(line.strip()) # Remove newline characters
            labels.append(filename[:2])
    f.close()
    random.shuffle(data)
    file_labels[filename] = labels
    file_dict[filename] = data

In [31]:
text_train = [] # contains all the 5 languages train_texts inputs
labels_train = []
text_dev = [] #contains all the 5 languages dev_texts inputs
labels_dev = []
text_test = [] #contains all the 5 languages test_texts inputs
labels_test = []
for name in train_names:
    text_train = text_train + file_dict[name]
    labels_train = labels_train + file_labels[name]
for name in dev_names:
    text_dev = text_dev + file_dict[name]
    labels_dev = labels_dev + file_labels[name]
for name in test_names:
    text_test = text_test + file_dict[name]
    labels_test = labels_test + file_labels[name]

In [32]:
def lang_classifier(vectorizer, trainText, trainLabel, devText, devLabel , c=0.005):
    """
    Takes a vectorizer and a pre-split dataset. C = 0.005 by default.
    Returns the the created model and the vectorizer
    """
    matrix_train = vectorizer.fit_transform(trainText)
    matrix_dev = vectorizer.transform(devText)
    language_classifier = svm.LinearSVC(C=c, verbose=1)
    language_classifier.fit(matrix_train, trainLabel)
    
    classifier_scores(language_classifier, matrix_train, matrix_dev, trainLabel, devLabel)
    return language_classifier, vectorizer

In [33]:
def classifier_scores(classifier, matrix_train, matrix_dev, labels_train, labels_dev):
    """
    Prints different values that a classifier creates
    The training score, dev score, a dev prediction, a confusion matrix on the dev predictions and the dev accuracy score
    """
    print('   ')
    print('Scores:')
    print("Count dev:", classifier.score(matrix_dev, labels_dev))
    print("Count train:", classifier.score(matrix_train, labels_train))
    print("---------")
    predictions_count_dev = classifier.predict(matrix_dev)
    print(predictions_count_dev)
    print(sklearn.metrics.confusion_matrix(labels_dev, predictions_count_dev))
    print("Dev accuracy: ", sklearn.metrics.accuracy_score(labels_dev, predictions_count_dev))
    print('   ')

In [34]:
def predict(model, input_text, input_labels):
    """Takes a svm model, and some input text and labels in either string or list format and performs an svm.predict.
    input_text is a string or list
    input_labels is a string or list
    Returns the predictions
    """
    if type(input_text) is list:
        matrix = model[1].transform(input_text)
        prediction = model[0].predict(matrix)
        print(sklearn.metrics.accuracy_score(input_labels, prediction))
    elif type(input_text) is str:
        matrix = model[1].transform([input_text])
        prediction = model[0].predict(matrix)
        print("Result: {res}, Expected: {exp}".format(res=prediction, exp=input_labels))
    return prediction

In [35]:
def list_incorrect_predictions(texts, prediction, labels):
    """Takes a set of texts that were predicted using a linearSVC, and checks the prediction against the label. 
    Return all incorrect predictions as a list[index, predicted_label, expected_label]"""
    index_list = []
    for row_index, (input_, prediction_, label_ )in enumerate(zip(texts, prediction, labels)):
        if prediction_ != label_:
            print('Row', row_index, 'has been classified as', prediction_, 'and should be ', label_)
            index_list.append([row_index, prediction_, label_])
    return index_list

In [36]:
def print_incorrects(list_, amount):
    rand_ints = []
    rand_list = []
    for i in range(0,amount):
        rand_ints.append(random.randint(0, len(list_) - 1))
    for i in rand_ints:
        rand_list.append(list_[i])
    return rand_list

In [37]:
countVectorizer11 = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,1))

In [38]:
langModel = lang_classifier(countVectorizer11, text_train, labels_train, text_dev, labels_dev, c=0.09)
prediction = predict(langModel, text_test, labels_test)
incorrects = list_incorrect_predictions(text_test, prediction, labels_test)

[LibLinear]   
Scores:
Count dev: 0.9324
Count train: 0.9988
---------
['en' 'en' 'en' ... 'pt' 'pt' 'et']
[[917   1  74   8   0]
 [  3 965  11   1  20]
 [  6   0 964  30   0]
 [  3   3  93 901   0]
 [  9   1  68   7 915]]
Dev accuracy:  0.9324
   
0.9366
Row 6 has been classified as et and should be  en
Row 37 has been classified as et and should be  en
Row 40 has been classified as et and should be  en
Row 49 has been classified as et and should be  en
Row 53 has been classified as et and should be  en
Row 70 has been classified as et and should be  en
Row 80 has been classified as et and should be  en
Row 98 has been classified as et and should be  en
Row 99 has been classified as et and should be  en
Row 109 has been classified as et and should be  en
Row 112 has been classified as et and should be  en
Row 116 has been classified as et and should be  en
Row 118 has been classified as et and should be  en
Row 126 has been classified as et and should be  en
Row 143 has been classifie

## 5. What changes with different vectorizer settings

I pick out some random sentences and copy paste them. Since I shuffle my data earlier on the row indexes change on refreshes, which means just using the index wouldn't necessarily show the examples I'll take a look at.

### Strip accents

This vectorizer strips ASCII accents with an ngram_range of 1,1

In [39]:
countVectorStrip = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,1), strip_accents="ascii")

In [40]:
langModel = lang_classifier(countVectorStrip, text_train, labels_train, text_dev, labels_dev, c=0.09)
prediction = predict(langModel, text_test, labels_test)
incorrects = list_incorrect_predictions(text_test, prediction, labels_test)

[LibLinear]   
Scores:
Count dev: 0.9314
Count train: 0.9978
---------
['en' 'en' 'en' ... 'pt' 'pt' 'et']
[[917   1  75   7   0]
 [  3 966  11   1  19]
 [  7   0 956  36   1]
 [  3   3  92 902   0]
 [ 11   3  64   6 916]]
Dev accuracy:  0.9314
   
0.9358
Row 6 has been classified as et and should be  en
Row 32 has been classified as et and should be  en
Row 37 has been classified as et and should be  en
Row 40 has been classified as et and should be  en
Row 49 has been classified as et and should be  en
Row 53 has been classified as et and should be  en
Row 70 has been classified as et and should be  en
Row 80 has been classified as et and should be  en
Row 98 has been classified as et and should be  en
Row 99 has been classified as et and should be  en
Row 109 has been classified as et and should be  en
Row 112 has been classified as et and should be  en
Row 116 has been classified as et and should be  en
Row 118 has been classified as et and should be  en
Row 126 has been classified

Row 3759 has been classified as et and should be  fi
Row 3770 has been classified as et and should be  fi
Row 3805 has been classified as et and should be  fi
Row 3810 has been classified as et and should be  fi
Row 3827 has been classified as et and should be  fi
Row 3844 has been classified as et and should be  fi
Row 3855 has been classified as et and should be  fi
Row 3858 has been classified as es and should be  fi
Row 3862 has been classified as et and should be  fi
Row 3882 has been classified as et and should be  fi
Row 3888 has been classified as et and should be  fi
Row 3895 has been classified as et and should be  fi
Row 3905 has been classified as et and should be  fi
Row 3913 has been classified as en and should be  fi
Row 3915 has been classified as et and should be  fi
Row 3929 has been classified as et and should be  fi
Row 3938 has been classified as et and should be  fi
Row 3941 has been classified as et and should be  fi
Row 3942 has been classified as en and should 

Stripping accents does barely anything. It shifts the values slightly, and it's downwards. Maybe worth considering but not in a situation like this with 5 different languages.

### Ngram 1,2

This vectorizer makes use of an ngram_range of 1,2

In [41]:
countVector12 = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,2))

In [42]:
langModel = lang_classifier(countVector12, text_train, labels_train, text_dev, labels_dev, c=0.09)
prediction = predict(langModel, text_test, labels_test)
incorrects = list_incorrect_predictions(text_test, prediction, labels_test)

[LibLinear]   
Scores:
Count dev: 0.9278
Count train: 0.9996
---------
['en' 'en' 'en' ... 'pt' 'pt' 'et']
[[918   1  78   3   0]
 [  4 962  11   2  21]
 [  7   0 961  32   0]
 [  4   3 108 885   0]
 [  9   0  72   6 913]]
Dev accuracy:  0.9278
   
0.931
Row 37 has been classified as et and should be  en
Row 40 has been classified as et and should be  en
Row 49 has been classified as et and should be  en
Row 53 has been classified as et and should be  en
Row 70 has been classified as et and should be  en
Row 80 has been classified as et and should be  en
Row 98 has been classified as et and should be  en
Row 99 has been classified as et and should be  en
Row 109 has been classified as et and should be  en
Row 112 has been classified as et and should be  en
Row 116 has been classified as et and should be  en
Row 118 has been classified as et and should be  en
Row 126 has been classified as et and should be  en
Row 143 has been classified as et and should be  en
Row 151 has been classifi

In [43]:
result = predict(langModel, "* Edward S. Miller, War Plan Orange: The U.S. Strategy to Defeat Japan, 1897-1945, U.S. Naval Institute Press, 1991, ISBN: 0870217593", "es")

Result: ['en'], Expected: es


The above set of text is from line 992 in the spanish test set. It's a reference, primarily in english, and as such is wrongly labled. This incorrect labeling is not unsurprising.

In [44]:
result = predict(langModel, "Natalie McVeigh Oakley, England", "en")

Result: ['et'], Expected: en


This one is a name, and that can be a good reason for the model to not predict it correctly.

### Swap from word to char

Ngram back to 1,1, set the analyzer to "char"

In [45]:
countVectorChar = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,1), analyzer="char")

In [46]:
langModel = lang_classifier(countVectorChar, text_train, labels_train, text_dev, labels_dev, c=0.09)
prediction = predict(langModel, text_test, labels_test)
incorrects = list_incorrect_predictions(text_test, prediction, labels_test)

[LibLinear]   
Scores:
Count dev: 0.8988
Count train: 0.9158
---------
['en' 'en' 'en' ... 'pt' 'pt' 'en']
[[917  17  18   9  39]
 [ 49 898   9   3  41]
 [ 27   3 899  67   4]
 [ 22   4  53 919   2]
 [ 27  97  10   5 861]]
Dev accuracy:  0.8988
   
0.8978
Row 0 has been classified as pt and should be  en
Row 9 has been classified as et and should be  en
Row 13 has been classified as et and should be  en
Row 17 has been classified as es and should be  en
Row 21 has been classified as et and should be  en
Row 26 has been classified as et and should be  en
Row 29 has been classified as pt and should be  en
Row 56 has been classified as pt and should be  en
Row 64 has been classified as et and should be  en
Row 66 has been classified as es and should be  en
Row 78 has been classified as et and should be  en
Row 97 has been classified as pt and should be  en
Row 136 has been classified as es and should be  en
Row 145 has been classified as pt and should be  en
Row 160 has been classified as

Row 3054 has been classified as et and should be  fi
Row 3062 has been classified as et and should be  fi
Row 3074 has been classified as et and should be  fi
Row 3081 has been classified as et and should be  fi
Row 3088 has been classified as et and should be  fi
Row 3097 has been classified as en and should be  fi
Row 3102 has been classified as et and should be  fi
Row 3125 has been classified as pt and should be  fi
Row 3127 has been classified as en and should be  fi
Row 3151 has been classified as et and should be  fi
Row 3157 has been classified as et and should be  fi
Row 3167 has been classified as et and should be  fi
Row 3186 has been classified as et and should be  fi
Row 3187 has been classified as en and should be  fi
Row 3215 has been classified as et and should be  fi
Row 3229 has been classified as en and should be  fi
Row 3243 has been classified as en and should be  fi
Row 3253 has been classified as es and should be  fi
Row 3264 has been classified as et and should 

In [47]:
result = predict(langModel, "83- Number of times Bush mentioned Saddam, Iraq, or regime(as in change) in his three State of the Union addresses.", "en")

Result: ['pt'], Expected: en


In [48]:
result = predict(langModel, "For banks incorporated in a non-approved jurisdiction, it may be possible for them to trade out of a branch located in an approved jurisdiction(i.e. U.S. and U.K. branches).", "en")

Result: ['fi'], Expected: en


### Char_wb

Ngram(1,1), analyzer="char_wb"

In [49]:
countVectorCharWb = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,1), analyzer="char_wb")

In [50]:
langModel = lang_classifier(countVectorCharWb, text_train, labels_train, text_dev, labels_dev, c=0.09)
prediction = predict(langModel, text_test, labels_test)
incorrects = list_incorrect_predictions(text_test, prediction, labels_test)

[LibLinear]   
Scores:
Count dev: 0.9
Count train: 0.9136
---------
['en' 'en' 'en' ... 'pt' 'pt' 'pt']
[[919  18  20   8  35]
 [ 48 899   8   3  42]
 [ 26   4 901  67   2]
 [ 19   4  53 921   3]
 [ 28  96  10   6 860]]
Dev accuracy:  0.9
   
0.8976
Row 0 has been classified as pt and should be  en
Row 13 has been classified as et and should be  en
Row 17 has been classified as es and should be  en
Row 21 has been classified as et and should be  en
Row 26 has been classified as fi and should be  en
Row 29 has been classified as et and should be  en
Row 56 has been classified as pt and should be  en
Row 64 has been classified as et and should be  en
Row 66 has been classified as es and should be  en
Row 97 has been classified as pt and should be  en
Row 118 has been classified as pt and should be  en
Row 136 has been classified as es and should be  en
Row 145 has been classified as pt and should be  en
Row 160 has been classified as et and should be  en
Row 162 has been classified as pt

Row 3403 has been classified as en and should be  fi
Row 3425 has been classified as es and should be  fi
Row 3430 has been classified as en and should be  fi
Row 3448 has been classified as et and should be  fi
Row 3451 has been classified as et and should be  fi
Row 3476 has been classified as et and should be  fi
Row 3500 has been classified as en and should be  fi
Row 3516 has been classified as en and should be  fi
Row 3524 has been classified as pt and should be  fi
Row 3539 has been classified as en and should be  fi
Row 3540 has been classified as pt and should be  fi
Row 3543 has been classified as et and should be  fi
Row 3555 has been classified as et and should be  fi
Row 3569 has been classified as pt and should be  fi
Row 3586 has been classified as et and should be  fi
Row 3610 has been classified as es and should be  fi
Row 3645 has been classified as en and should be  fi
Row 3648 has been classified as et and should be  fi
Row 3664 has been classified as en and should 

Here since we didn't actually change the ngram size, which means the difference between char_wb and char is very minor. As seen by the confusion matrix, compared to the one from the ngran(1,1, analyzer=char version, there are individual parts of the texts that change prediction, however most of them are the same.

### Ngram 1,2, char analyzer

ngram(1,2), analyzer = "char"

In [51]:
countVector12Char = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,2), analyzer="char")

In [52]:
langModel = lang_classifier(countVector12Char, text_train, labels_train, text_dev, labels_dev, c=0.09)
prediction = predict(langModel, text_test, labels_test)
incorrects = list_incorrect_predictions(text_test, prediction, labels_test)

[LibLinear]   
Scores:
Count dev: 0.9712
Count train: 0.9992
---------
['en' 'en' 'en' ... 'pt' 'pt' 'pt']
[[978   4   7   1  10]
 [  6 977   1   0  16]
 [ 17   0 968  13   2]
 [ 12   1  15 971   1]
 [ 20  13   5   0 962]]
Dev accuracy:  0.9712
   
0.9724
Row 37 has been classified as et and should be  en
Row 74 has been classified as et and should be  en
Row 109 has been classified as es and should be  en
Row 145 has been classified as pt and should be  en
Row 162 has been classified as es and should be  en
Row 295 has been classified as es and should be  en
Row 308 has been classified as pt and should be  en
Row 326 has been classified as fi and should be  en
Row 337 has been classified as es and should be  en
Row 405 has been classified as et and should be  en
Row 408 has been classified as et and should be  en
Row 490 has been classified as et and should be  en
Row 530 has been classified as et and should be  en
Row 547 has been classified as et and should be  en
Row 578 has been c

In [53]:
result = predict(langModel, "Rhonda L Denton", "en")

Result: ['es'], Expected: en


In [54]:
result = predict(langModel, "Ernie Simien", "en")

Result: ['es'], Expected: en


In [55]:
result = predict(langModel, "Mur!", "fi")

Result: ['en'], Expected: fi


All of these are short sentences, and the first 2 are names. This seems to lean towards being spanish for whatever reason. But a short sentence means that you can get slightly different combinations of features that normal, and this is even more true when it comes to names. 

### Ngram 1.2. char_wb

This vectorizer strips ASCII accents with an ngram(1,1)

In [56]:
countVector12CharWb = CountVectorizer(max_features=100000, binary=True, ngram_range=(1,2), analyzer="char_wb")

In [57]:
langModel = lang_classifier(countVector12CharWb, text_train, labels_train, text_dev, labels_dev, c=0.09)
prediction = predict(langModel, text_test, labels_test)
incorrects = list_incorrect_predictions(text_test, prediction, labels_test)

[LibLinear]   
Scores:
Count dev: 0.9742
Count train: 0.999
---------
['en' 'en' 'en' ... 'pt' 'pt' 'pt']
[[981   2   8   2   7]
 [  5 979   2   0  14]
 [ 13   0 976  10   1]
 [ 11   1  12 975   1]
 [ 21  12   6   1 960]]
Dev accuracy:  0.9742
   
0.9698
Row 4 has been classified as pt and should be  en
Row 37 has been classified as et and should be  en
Row 77 has been classified as et and should be  en
Row 80 has been classified as pt and should be  en
Row 145 has been classified as pt and should be  en
Row 151 has been classified as fi and should be  en
Row 162 has been classified as es and should be  en
Row 295 has been classified as es and should be  en
Row 308 has been classified as pt and should be  en
Row 319 has been classified as et and should be  en
Row 326 has been classified as fi and should be  en
Row 337 has been classified as es and should be  en
Row 338 has been classified as pt and should be  en
Row 408 has been classified as et and should be  en
Row 451 has been class

In [58]:
result = predict(langModel, "John Balance from Coil.", "en")


Result: ['pt'], Expected: en


In [59]:
result = predict(langModel, "E.g.", "en")


Result: ['et'], Expected: en


In [60]:
result = predict(langModel, "Ursula K. Le Guin explora la sexualidad transespecies en The Left Hand of Darkness( 1969);", "es")

Result: ['en'], Expected: es


Here, the first example is again a name which gives it a poor weighting. The second example is just too short to be given a meaningful weighting and is essentially random unless it's a term that is extremely frequent.

As for the last one, it's heavy on names and there seems to be what is a book title or similar. This leads to a sentence where only a couple words are actually the actual language which leads to an incorrect prediction.

### Results:

When changing from word to char or char_wb, and only changing that, there is a decrease in accuracy. 
However, if you change the analyzer to char or char_wb while also increasing the ngram range to 1,2 you get a significant increase in accuracy. 

Removing the accents did not change much.

The best result definitely comes from looking at the breakdown of specific characters, with an ngram range that is bigger than (1,1)