In [1]:
import json
import random
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import sklearn.svm as svm
import sklearn.metrics
import os

In [2]:
# Load the IMDB training reviews: a list of dicts with "class" and "text" keys.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding.
with open("intro-to-nlp/imdb_train.json", encoding="utf-8") as f:
    data = json.load(f)
# Shuffle with a dedicated, seeded generator so the example order is the same
# on every run (the seed value itself is arbitrary).
random.Random(0).shuffle(data)
print(data[0])

{'class': 'neg', 'text': 'Lordi was a major hype and revelation in 2007 because they won the Eurovision Song Contest with a (not-so-heavy) metal song called \\Hard Rock Hallelujah\\" and appeared on stage dressed like hideous monsters. But, let\'s face it, their victory most likely had very little to do with their great musical talents. The Eurovision contest gradually turned into one big political circus over the years and Lordi probably just won because their song finally brought a little change and \x96 even more importantly - because their whole act sort of ingeniously spoofed the whole annual event. The absolute last thing Lordi\'s first (and hopefully last) horror film brings is change and ingenuity. \\"Dark Floors\\", based on an idea of the lead singer and starring the rest of the band in supportive roles, is a truly unimaginative and hopeless accumulation of clichés. The immense budget (\\"Dark Floors\\" supposedly is the most expensive Finnish film ever) definitely assures gr

In [3]:
# Split every review dict into its body and its sentiment label; both lists
# keep the order of `data`, so texts[i] belongs to labels[i].
texts = [example["text"] for example in data]
labels = [example["class"] for example in data]

# 1. CountVectorizer / TfidfVectorizer

In [4]:
# Binary unigram features with the vocabulary capped at 100,000 entries.
# Each vectorizer is fitted on the full corpus here.
countVectorizer = CountVectorizer(ngram_range=(1, 1), binary=True, max_features=100000)
count_matrix = countVectorizer.fit_transform(texts)

# Same settings for the tf-idf weighted variant.
tfidfVectorizer = TfidfVectorizer(ngram_range=(1, 1), binary=True, max_features=100000)
tfidf_matrix = tfidfVectorizer.fit_transform(texts)

In [5]:
def _leading_feature_names(vectorizer, limit=15):
    """Return the first `limit` vocabulary entries of a fitted vectorizer.

    `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of `get_feature_names_out()`. The new method is used when
    it exists; otherwise the old one is called so older installations keep
    working.

    Args:
        vectorizer: a fitted CountVectorizer / TfidfVectorizer.
        limit: number of leading feature names to return.

    Returns:
        A list of plain `str` feature names.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    # The new API returns a NumPy array; normalise to plain strings.
    return [str(name) for name in getter()[:limit]]


# Put the first vocabulary entries of both vectorizers side by side.
features = {
    "CountVectorizer": _leading_feature_names(countVectorizer),
    "TfidfVectorizer": _leading_feature_names(tfidfVectorizer),
}
features_df = pd.DataFrame.from_dict(features)

In [6]:
# Display the leading vocabulary entries of both vectorizers side by side.
features_df

Unnamed: 0,CountVectorizer,TfidfVectorizer
0,00,00
1,000,000
2,0000000000001,0000000000001
3,00001,00001
4,00015,00015
5,000s,000s
6,001,001
7,003830,003830
8,006,006
9,007,007


In [7]:
# Hold out 20% of the reviews as a development set. A fixed random_state makes
# the split repeatable for a given ordering of `texts` (the seed value itself
# is arbitrary).
train_texts, dev_texts, train_labels, dev_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=0
)

In [8]:
# Re-fit both vectorizers on the training texts only ...
count_matrix_train = countVectorizer.fit_transform(train_texts)
tfidf_matrix_train = tfidfVectorizer.fit_transform(train_texts)

# ... then map the dev texts onto those fitted vocabularies.
count_matrix_dev = countVectorizer.transform(dev_texts)
tfidf_matrix_dev = tfidfVectorizer.transform(dev_texts)

In [9]:
# Print the train and dev matrix shapes, one per line, for each vectorizer.
print(count_matrix_train.shape, count_matrix_dev.shape, sep="\n")
print("--")
print(tfidf_matrix_train.shape, tfidf_matrix_dev.shape, sep="\n")

(20000, 68358)
(5000, 68358)
--
(20000, 68358)
(5000, 68358)


In [10]:
# One linear SVM per feature representation; each uses its own C value.
count_classifier = svm.LinearSVC(C=0.005, verbose=1)
tfidf_classifier = svm.LinearSVC(C=0.05, verbose=1)

# Train on the count features first, then on the tf-idf features. The last
# call is left as a bare expression so the cell still displays the estimator.
count_classifier.fit(count_matrix_train, train_labels)
tfidf_classifier.fit(tfidf_matrix_train, train_labels)

[LibLinear][LibLinear]

LinearSVC(C=0.05, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=1)

In [11]:
# Compare dev and train accuracy for both models.
# Count-based features:
print("Count dev:", count_classifier.score(count_matrix_dev, dev_labels))
print("Count train:", count_classifier.score(count_matrix_train, train_labels))
print("---------")
# Tf-idf features:
print("Tfidf dev:", tfidf_classifier.score(tfidf_matrix_dev, dev_labels))
print("Tfidf train:", tfidf_classifier.score(tfidf_matrix_train, train_labels))

Count dev: 0.8886
Count train: 0.95675
---------
Tfidf dev: 0.8904
Tfidf train: 0.92155


In [12]:
# Dev-set evaluation of the count-feature model: predicted labels, confusion
# matrix and overall accuracy.
predictions_count_dev = count_classifier.predict(count_matrix_dev)
print(predictions_count_dev)
print(sklearn.metrics.confusion_matrix(y_true=dev_labels, y_pred=predictions_count_dev))
print(sklearn.metrics.accuracy_score(y_true=dev_labels, y_pred=predictions_count_dev))

['neg' 'neg' 'pos' ... 'neg' 'neg' 'neg']
[[2199  310]
 [ 247 2244]]
0.8886


In [13]:
# Dev-set evaluation of the tf-idf model: predicted labels, confusion matrix
# and overall accuracy.
predictions_tfidf_dev = tfidf_classifier.predict(tfidf_matrix_dev)
print(predictions_tfidf_dev)
print(sklearn.metrics.confusion_matrix(y_true=dev_labels, y_pred=predictions_tfidf_dev))
print(sklearn.metrics.accuracy_score(y_true=dev_labels, y_pred=predictions_tfidf_dev))

['neg' 'neg' 'pos' ... 'neg' 'neg' 'neg']
[[2187  322]
 [ 226 2265]]
0.8904


# 2. CountVectorizer with n-grams

In [14]:
def count_vectorizer(ngram=(1,1), c=0.005, random_state=None):
    """Train and evaluate a linear SVM on binary n-gram count features.

    Splits the notebook-level `texts` / `labels` 80/20 into train and dev
    sets, fits a `CountVectorizer` on the training texts only, trains a
    `LinearSVC`, and prints, in order: dev accuracy, train accuracy, a
    separator, the dev predictions, the dev confusion matrix and the dev
    accuracy again.

    Args:
        ngram: `(min_n, max_n)` tuple forwarded as `ngram_range`.
        c: value forwarded as the `C` parameter of `LinearSVC`.
        random_state: forwarded to `train_test_split`. The default `None`
            keeps the split unseeded; pass an int for a repeatable split.

    Returns:
        None. All results are printed.
    """
    train_texts, dev_texts, train_labels, dev_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=random_state
    )

    # Fit the vocabulary on the training split only. The previous extra fit on
    # the full corpus was discarded immediately and only cost time.
    countVectorizer = CountVectorizer(max_features=100000, binary=True, ngram_range=ngram)
    count_matrix_train = countVectorizer.fit_transform(train_texts)
    count_matrix_dev = countVectorizer.transform(dev_texts)

    count_classifier = svm.LinearSVC(C=c, verbose=1)
    count_classifier.fit(count_matrix_train, train_labels)

    # Predict on dev once and reuse the result for every dev metric; the
    # classifier's score() is the accuracy of its predictions.
    predictions_count_dev = count_classifier.predict(count_matrix_dev)
    dev_accuracy = sklearn.metrics.accuracy_score(dev_labels, predictions_count_dev)

    print("Count dev:", dev_accuracy)
    print("Count train:", count_classifier.score(count_matrix_train, train_labels))
    print("---------")
    print(predictions_count_dev)
    print(sklearn.metrics.confusion_matrix(dev_labels, predictions_count_dev))
    print(dev_accuracy)

In [15]:
# Unigrams and bigrams combined.
count_vectorizer(ngram=(1,2), c=0.005)

[LibLinear]Count dev: 0.8922
Count train: 0.99485
---------
['pos' 'pos' 'neg' ... 'neg' 'pos' 'pos']
[[2245  283]
 [ 256 2216]]
0.8922


In [16]:
# Bigrams only.
count_vectorizer(ngram=(2,2), c=0.005)

[LibLinear]Count dev: 0.8718
Count train: 0.9918
---------
['neg' 'pos' 'neg' ... 'pos' 'pos' 'pos']
[[2155  328]
 [ 313 2204]]
0.8718


In [17]:
# Bigrams and trigrams combined.
count_vectorizer(ngram=(2,3), c=0.005)

[LibLinear]Count dev: 0.8746
Count train: 0.9949
---------
['neg' 'neg' 'neg' ... 'neg' 'neg' 'neg']
[[2124  330]
 [ 297 2249]]
0.8746


In [18]:
# Trigrams only.
count_vectorizer(ngram=(3,3), c=0.005)

[LibLinear]Count dev: 0.8256
Count train: 0.97515
---------
['pos' 'pos' 'neg' ... 'neg' 'neg' 'pos']
[[1984  518]
 [ 354 2144]]
0.8256


# 3. Language identification with an SVM

In [48]:
# Read every file in the language-identification directory into
# file_dict[filename] -> list of whitespace-stripped lines.
dir_str = "intro-to-nlp/language-identification/"
directory = os.fsencode(dir_str)
filenames = []
file_dict = {}
# os.listdir returns entries in arbitrary order; sort for a stable file order.
for file in sorted(os.listdir(directory)):
    filename = os.fsdecode(file)
    filenames.append(filename)
    # The corpus covers several languages with non-ASCII characters, so decode
    # explicitly rather than relying on the platform default.
    # NOTE(review): assumes the files are UTF-8 encoded; confirm.
    with open(os.path.join(dir_str, filename), "r", encoding="utf-8") as f:
        # strip() drops leading/trailing whitespace, including the newline.
        # NOTE: this rebinds `data`, which earlier held the IMDB reviews.
        data = [line.strip() for line in f]
    # No explicit close needed: the `with` block has already closed the file.
    file_dict[filename] = data

In [49]:
# List the names of the files that were read into file_dict.
print(filenames)

['en_devel.txt', 'en_test.txt', 'en_train.txt', 'es_devel.txt', 'es_test.txt', 'es_train.txt', 'et_devel.txt', 'et_test.txt', 'et_train.txt', 'fi_devel.txt', 'fi_test.txt', 'fi_train.txt', 'pt_devel.txt', 'pt_test.txt', 'pt_train.txt']


In [50]:
# Preview only the first few English dev sentences; displaying the whole list
# floods the cell output.
file_dict['en_devel.txt'][:10]

['Let me know if you have any questions.',
 'Obudu cattle ranch.',
 'Used as a help for horses that are quite strong pullers while hacking, hunting and doing cross country.',
 'See below.',
 'Its projected demand, boosted by a huge rise in car ownership as well as the need to find alternatives to polluting coal for electricity generation, has contributed to the surge in the price of oil this year.',
 'Imagine the tension melting away as you continue breathing rhythmically and naturally.',
 'Regarding those rumors about wolves living in Yellowstone prior to the official reintroduction?',
 'You can somewhat reduce your vulnerability by preventive and defensive measures and by strict border controls but not eliminate it and definitely not win the war in a defensive way.',
 'You are in a state of peace and relaxation.',
 'If you have a pet store near buy some wheat,pigeon corn or even mixed bird seed will do,but do not feed bread.',
 "I SAY LISTEN: I'm at 17th and LOCUST, do you deliver th