# Import bibliotek

In [1]:
import re
from os import listdir
from os.path import isdir, isfile, join

import ipywidgets as widgets
import numpy as np
import pandas as pd
from ipywidgets import interact, interactive, fixed, interact_manual
# NOTE: scipy.misc.imread was removed in SciPy 1.2, so this import needs an older SciPy.
from scipy.misc import imread
from sklearn import svm, mixture
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix

# Wczytanie danych

In [2]:
def is_valid_image_name(image_name):
    """Check a file name against the "<letter>_<number>.jpg" convention.

    The letter must be one of i, o, p or z and the number must have at least
    one digit.

    Returns the regex match object when the name conforms, otherwise None,
    so the result can be used directly as a truth value.
    """
    name_pattern = re.compile(r'^[iopz]_\d+\.jpg$')
    return name_pattern.match(image_name)

In [3]:
data_path = "Dane"
wrong_filenames_path = "wrong_filenames.txt"
wrong_sizes_path = "wrong_sizes.txt"

# Every sub-directory of data_path is treated as one language. Plain files that
# sit next to those directories (e.g. .DS_Store) are skipped, because calling
# listdir on them below would raise NotADirectoryError.
languages = [f for f in listdir(data_path) if isdir(join(data_path, f))]
data = []
# Rejected image paths are logged to two text files, one path per line.
with open(wrong_filenames_path, "w") as wrong_filenames, \
        open(wrong_sizes_path, "w") as wrong_sizes:
    for language in languages:
        language_path = join(data_path, language)
        image_names = [f for f in listdir(language_path) if isfile(join(language_path, f))]
        for image_name in image_names:
            image_path = join(language_path, image_name)
            if not is_valid_image_name(image_name):
                wrong_filenames.write(image_path + "\n")
                continue
            image = imread(image_path)
            # Only the first two dimensions are compared against 15x15.
            if image.shape[:2] != (15, 15):
                wrong_sizes.write(image_path + "\n")
                continue
            # Valid names look like "<letter>_<index>.jpg"; the index is kept as a string.
            letter, index = image_name.split("_")
            index = index.split(".")[0]
            data.append([language, letter, index, image])

In [4]:
# Report how many rows were collected, then wrap them in a labelled DataFrame.
column_names = ["Language", "Letter", "Index", "Image"]
rows_cnt = len(data)
print("Data size: {}".format(rows_cnt))
# The rows are first converted to an object-dtype array and then handed to pandas.
# NOTE(review): presumably dtype="object" keeps each image array as a single cell - confirm.
data = pd.DataFrame(data=np.array(data, dtype="object"), columns=column_names)

Data size: 1631


In [5]:
# Preview only the metadata columns of the first rows; the Image column holds the
# raw pixel arrays, and printing the whole frame floods the notebook output.
data.drop(columns=["Image"]).head(30)

       Language Letter Index  \
0     ormiański      o     4   
1     ormiański      p    22   
2     ormiański      z    37   
3     ormiański      z    25   
4     ormiański      z     8   
5     ormiański      o    13   
6     ormiański      p    26   
7     ormiański      o    37   
8     ormiański      z    20   
9     ormiański      p    20   
10    ormiański      z    15   
11    ormiański      o    48   
12    ormiański      i    31   
13    ormiański      o    30   
14    ormiański      z     1   
15    ormiański      o    39   
16    ormiański      i    48   
17    ormiański      i    33   
18    ormiański      i     4   
19    ormiański      z    40   
20    ormiański      p    27   
21    ormiański      i     1   
22    ormiański      i    12   
23    ormiański      i     2   
24    ormiański      i    17   
25    ormiański      p    35   
26    ormiański      p     9   
27    ormiański      z    24   
28    ormiański      i    51   
29    ormiański      o    22   
...     

In [6]:
def classify(C, gamma, X, y, train_fraction=0.8):
    """Fit an SVM on a random subset of the rows and print its performance.

    Each row gets an independent uniform draw from [0, 1); rows whose draw is
    at most ``train_fraction`` are used for training and the rest for testing,
    so the split sizes vary from call to call. The global NumPy random state is
    used, so call ``np.random.seed`` beforehand for a reproducible split.

    Parameters:
        C: value passed to ``svm.SVC`` as ``C``.
        gamma: value passed to ``svm.SVC`` as ``gamma``.
        X: array of feature rows, indexable with a boolean mask.
        y: array of labels, one per row of ``X``.
        train_fraction: expected share of rows used for training (default 0.8).

    Prints, in order: the training score, the training confusion matrix, the
    test score and the test confusion matrix. Returns None.
    """
    rows_cnt = len(y)
    draws = np.random.uniform(0, 1, rows_cnt)
    is_train = draws <= train_fraction
    X_train, y_train = X[is_train], y[is_train]
    X_test, y_test = X[~is_train], y[~is_train]
    # A single classifier built from the caller's hyper-parameters.
    clf = svm.SVC(C=C, gamma=gamma)
    clf.fit(X_train, y_train)
    print(clf.score(X_train, y_train))
    print(confusion_matrix(y_train, clf.predict(X_train)))
    print(clf.score(X_test, y_test))
    print(confusion_matrix(y_test, clf.predict(X_test)))

In [7]:
# Text box + slider pair for the SVM parameter C. BoundedFloatText is used for the
# text box because plain FloatText has no min/max traits, so its bounds were ignored.
c_text = widgets.BoundedFloatText(
    min=0.000001,
    value=2.5,
    description='C:',
    disabled=False,
    step=0.001,
    max=10.0
)
c_slider = widgets.FloatSlider(
    min=0.000001,
    description='C:',
    step=0.001,
    max=10.0
)
# Same pair for gamma.
gamma_text = widgets.BoundedFloatText(
    min=0.000001,
    value=2.5,
    description='gamma:',
    step=0.001,
    disabled=False,
    max=10.0
)
gamma_slider = widgets.FloatSlider(
    min=0.000001,
    description='gamma:',
    step=0.001,
    max=10.0
)
# Keep each text box and its slider in sync in the browser.
clink = widgets.jslink((c_text, 'value'), (c_slider, 'value'))
gammalink = widgets.jslink((gamma_text, 'value'), (gamma_slider, 'value'))
ui = widgets.HBox([widgets.VBox([c_text, c_slider]), widgets.VBox([gamma_text, gamma_slider])])
# Flatten every image into one feature row; -1 infers the pixel count
# (225 for the 15x15 images accepted by the loading cell).
X = data["Image"].values
X = np.concatenate(X).reshape(X.shape[0], -1)
y = data["Letter"].values
# Re-run classify on the full data set whenever C or gamma changes.
cl = lambda C, gamma: classify(C, gamma, X, y)
out = widgets.interactive_output(cl, {'C': c_text, 'gamma': gamma_text})
display(ui, out)

A Jupyter Widget

A Jupyter Widget

In [8]:
def classify_with_pca(C, gamma, X, y, n):
    """Reduce X to n principal components and pass the result to classify.

    Parameters:
        C, gamma: forwarded unchanged to classify.
        X: feature rows; used both to fit the PCA and as the data to project.
        y: labels, forwarded unchanged to classify.
        n: number of components given to PCA as n_components.

    NOTE(review): the PCA is fitted on all of X before classify performs its
    own train/test split, so the held-out rows influence the projection.
    """
    reducer = PCA(n_components=n)
    reducer.fit(X)
    classify(C, gamma, reducer.transform(X), y)

In [9]:
# Text box + slider pair for the SVM parameter C. BoundedFloatText is used for the
# text box because plain FloatText has no min/max traits, so its bounds were ignored.
c_text = widgets.BoundedFloatText(
    min=0.000001,
    value=0.6,
    description='C:',
    disabled=False,
    step=0.001,
    max=10.0
)
c_slider = widgets.FloatSlider(
    min=0.000001,
    description='C:',
    step=0.001,
    max=10.0
)
# Same pair for gamma.
gamma_text = widgets.BoundedFloatText(
    min=0.000001,
    value=0.3,
    description='gamma:',
    disabled=False,
    step=0.001,
    max=10.0
)
gamma_slider = widgets.FloatSlider(
    min=0.000001,
    description='gamma:',
    step=0.001,
    max=10.0
)
# Number of principal components forwarded to classify_with_pca.
n = widgets.IntText(
    value=5,
    description='n:',
    disabled=False,
)
# Keep each text box and its slider in sync in the browser.
clink = widgets.jslink((c_text, 'value'), (c_slider, 'value'))
gammalink = widgets.jslink((gamma_text, 'value'), (gamma_slider, 'value'))
ui = widgets.HBox([widgets.VBox([c_text, c_slider]), widgets.VBox([gamma_text, gamma_slider])])
# Re-run the PCA + SVM evaluation whenever C, gamma or n changes.
cl_pca = lambda C, gamma, n: classify_with_pca(C, gamma, X, y, n)
out = widgets.interactive_output(cl_pca, {'C': c_text, 'gamma': gamma_text, 'n' : n})
display(ui, n, out)

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

In [18]:
def cross_val_classify(C, gamma, n, data, languages):
    """Leave-one-language-out evaluation of a PCA + SVM letter classifier.

    For each entry of ``languages`` the rows of that language form the test
    fold and all other rows form the training fold. The PCA is fitted on the
    training fold only and then applied to both folds, so the held-out
    language does not influence the projection.

    Parameters:
        C: value passed to ``svm.SVC`` as ``C``.
        gamma: value passed to ``svm.SVC`` as ``gamma``.
        n: number of components given to ``PCA`` as ``n_components``.
        data: DataFrame with "Image", "Letter" and "Language" columns; every
            image must have the same number of pixels.
        languages: iterable of language names to hold out, one fold each.

    For every fold prints the language, the training score and confusion
    matrix, then the test score and confusion matrix. A language that has no
    rows in ``data``, or that covers all rows, is reported and skipped because
    one of the two folds would be empty. Returns None.
    """
    images = data["Image"].values
    # Flatten every image into one feature row; -1 infers the pixel count.
    X = np.concatenate(images).reshape(images.shape[0], -1)
    y = data["Letter"].values
    # Plain NumPy array so the per-fold masks are NumPy boolean arrays as well.
    row_languages = data["Language"].values
    for language in languages:
        is_test = row_languages == language
        if not is_test.any() or is_test.all():
            print(language)
            print("skipped: training or test fold would be empty")
            continue
        is_train = ~is_test
        y_train = y[is_train]
        y_test = y[is_test]
        # Fit the projection on the training rows only to avoid leaking test data.
        pca = PCA(n_components=n)
        pca.fit(X[is_train])
        X_train = pca.transform(X[is_train])
        X_test = pca.transform(X[is_test])
        clf = svm.SVC(C=C, gamma=gamma)
        clf.fit(X_train, y_train)
        print(language)
        print(clf.score(X_train, y_train))
        print(confusion_matrix(y_train, clf.predict(X_train)))
        print(clf.score(X_test, y_test))
        print(confusion_matrix(y_test, clf.predict(X_test)))

In [19]:
# Text box + slider pair for the SVM parameter C. BoundedFloatText is used for the
# text box because plain FloatText has no min/max traits, so its bounds were ignored.
c_text = widgets.BoundedFloatText(
    min=0.000001,
    value=2.61,
    description='C:',
    disabled=False,
    step=0.001,
    max=10.0
)
c_slider = widgets.FloatSlider(
    min=0.000001,
    description='C:',
    step=0.001,
    max=10.0
)
# Same pair for gamma; it starts at the lower bound.
gamma_text = widgets.BoundedFloatText(
    min=0.000001,
    value=0.000001,
    description='gamma:',
    disabled=False,
    step=0.001,
    max=10.0
)
gamma_slider = widgets.FloatSlider(
    min=0.000001,
    description='gamma:',
    step=0.001,
    max=10.0
)
# Number of principal components forwarded to cross_val_classify.
n = widgets.IntText(
    value=7,
    description='n:',
    disabled=False,
)
# Keep each text box and its slider in sync in the browser.
clink = widgets.jslink((c_text, 'value'), (c_slider, 'value'))
gammalink = widgets.jslink((gamma_text, 'value'), (gamma_slider, 'value'))
ui = widgets.HBox([widgets.VBox([c_text, c_slider]), widgets.VBox([gamma_text, gamma_slider])])
# Re-run the per-language evaluation whenever C, gamma or n changes.
cl_cross_val_pca = lambda C, gamma, n: cross_val_classify(C, gamma, n, data, languages)
out = widgets.interactive_output(cl_cross_val_pca, {'C': c_text, 'gamma': gamma_text, 'n' : n})
display(ui, n, out)

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget