In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Boston housing table: space-separated values read without a header row,
# so the column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
cols = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'TGT',
]
boston = pd.read_csv(
    url,
    names=cols,
    header=None,
    index_col=False,
    sep=' ',
    skipinitialspace=True,
)
boston.head()

# 1a

In [None]:
def pca(df, r):
    """Principal component analysis of the standardised columns of ``df`` via SVD.

    Every column is centred on its mean and divided by its sample standard
    deviation before the singular value decomposition is taken.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; rows are observations, columns are variables.
    r : int
        Number of leading components to return.

    Returns
    -------
    pc : list of numpy.ndarray
        The first ``r`` principal directions (columns of V), one entry per
        component, each with one value per column of ``df``.
    ai : list of numpy.ndarray
        The matching columns of U * d, i.e. the coordinates of every
        standardised row along each returned direction.
    eigenvalues : numpy.ndarray
        Squared singular values divided by ``n - 1`` for *all* components
        (not only the first ``r``).
    means : numpy.ndarray
        Column means of ``df``.

    Raises
    ------
    IndexError
        If ``r`` exceeds the number of components produced by the SVD.
    """
    col_means = df.mean(axis=0)
    centered = df - col_means

    # A constant column has a standard deviation of 0; dividing 0 by 0 would
    # put NaN into the matrix and break the SVD. Dividing by 1 instead leaves
    # such a column at all zeros, so it simply contributes no variance.
    scale = centered.std(axis=0).replace(0, 1)
    normalized = centered / scale

    matrix = normalized.values

    u, d, vt = np.linalg.svd(matrix, full_matrices=False)
    v = vt.transpose()

    n = df.shape[0]
    ud = u * d
    pc = [v[:, ri] for ri in range(r)]
    ai = [ud[:, ri] for ri in range(r)]

    # Singular values are squared because the eigenvalues are variances.
    return pc, ai, np.power(d, 2) / (n - 1), col_means.values

# 1b

In [None]:
# Full PCA of the Boston table, keeping as many components as there are columns.
# pc: principal components, ai: projections, qsd: eigenvalues, means: column means
pc, ai, qsd, means = pca(boston, r=len(boston.columns))

In [None]:
# Eigenvalue table with one row per component:
#   EW      - eigenvalue
#   EXP_VAR - share of the total variance explained by the component
#   KUM_VAR - running (cumulative) sum of EXP_VAR
# Built in one step from the whole eigenvalue array: DataFrame.append was
# removed in pandas 2.0, and the row count now follows len(qsd) instead of a
# hard-coded 14.
sumv = np.sum(qsd)
expvar = qsd / sumv
table = pd.DataFrame({"EW": qsd, "EXP_VAR": expvar, "KUM_VAR": np.cumsum(expvar)})
table

### Wie viele Dimensionen können Sie weglassen, wenn Sie 10%, 5% und 1% Fehler bei der Dimensionsreduktion zulassen?
- 10% : 6
- 5%  : 4
- 1%  : 1

# 1c

In [None]:
# Standardise the Boston variables: centre each column, then divide by its
# sample standard deviation.
centered = boston - boston.mean(axis=0)
normalized = centered / centered.std(axis=0)
variables = normalized.values

# Row k of `matrix` holds the Pearson correlation between the k-th projection
# in `ai` and each standardised variable; only the first three projections
# are considered.
matrix = np.array([
    [np.corrcoef(ai[k], variables[:, col])[0, 1] for col in range(variables.shape[1])]
    for k in range(3)
])

# Transposed for display: one row per variable, one column per component.
pd.DataFrame.from_records(matrix.transpose())

Bis auf die vierte Variable werden alle Variablen gut von der ersten Hauptkomponente dargestellt, da die Zahlen recht hoch sind. Diese Variable wird allerdings von der zweiten Hauptkomponente gut dargestellt. Mit den ersten drei Hauptkomponenten werden alle Variablen gut dargestellt.

# 1d

In [None]:
# Scatter plot of the observations in the plane of the first two projections.
# NOTE(review): `ai` must still hold the projections of the Boston data here;
# re-run the Boston PCA cell first if `ai` has since been overwritten.
df = pd.DataFrame.from_records(ai[0:2]).transpose()

# Blue: TGT at or above its median; red: TGT below its median.
above_median = boston['TGT'] >= boston['TGT'].median()
plt.scatter(df[0], df[1], color=np.where(above_median, 'b', 'r'))
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('Boston housing: TGT >= median (blue) vs. TGT < median (red)')
# The original read `plt.show` without parentheses, which never calls the function.
plt.show()

Die beiden neuen Variablen eignen sich nur bedingt, da sich die beiden Datenwolken überschneiden!

# 2a

In [None]:
import tarfile
import skimage.io as io
import skimage.transform as tsf
from scipy.spatial import distance

In [None]:
# Scan the archive and collect every directory that holds at least 70 JPEG
# files. The archive is left open on purpose: the image-loading cell iterates
# over `tar` again.
tar = tarfile.open("D:/HTWG/MSI/Semester1/Machine Learning/Aufgaben/lfw-funneled.tgz")
popular = []

# Initialise the running state before the loop so the cell does not depend on
# an `i` left over from an earlier cell (or fail with a NameError on a fresh
# kernel) when the first archive member is not a directory.
person = None
i = 0

# Assumes each directory entry precedes the files it contains - TODO confirm
# for this archive.
for tarinfo in tar:
    if tarinfo.isdir():
        # New directory: restart the image count for it.
        person = tarinfo.name
        i = 0
    elif tarinfo.isreg() and tarinfo.name.endswith("jpg"):
        i += 1
    if person is not None and i >= 70 and person not in popular:
        popular.append(person)
        print(person)

In [None]:
# Load the images of the popular people and split them: the first image met
# for each person goes to the test set, every following one to the training set.
p_name = ""      # person of the previously processed image
t_imgs = []      # flattened test images
imgs = []        # flattened training images
names = []       # person name for each entry of `imgs`
t_names = []     # person name for each entry of `t_imgs`

for tarinfo in tar:
    if not (tarinfo.isreg() and tarinfo.name.endswith("jpg")):
        continue

    # Split the member path once; the first two parts form the directory name
    # as it was stored in `popular`.
    parts = tarinfo.name.split("/")
    if len(parts) < 2 or parts[0] + "/" + parts[1] not in popular:
        continue
    person_name = parts[1]

    # The images are read from the extracted copy on disk, not from the archive.
    # Crop rows/columns 75..174 and shrink by 0.32 (100 * 0.32 = 32 pixels per
    # side, assuming the source images are at least 175x175 - TODO confirm).
    img = io.imread("D:/HTWG/MSI/Semester1/Machine Learning/Aufgaben/" + tarinfo.name, as_gray=True)[75:175, 75:175]
    # `multichannel=False` was dropped: the keyword is deprecated/removed in
    # current scikit-image, and a 2-D grayscale image is treated as
    # single-channel by default anyway.
    img = tsf.rescale(img, 0.32, anti_aliasing=False)
    img = img.flatten()

    if person_name != p_name:
        # First image of a new person.
        t_imgs.append(img)
        t_names.append(person_name)
    else:
        imgs.append(img)
        names.append(person_name)
    p_name = person_name

In [None]:
# Sizes of the test image list, the training image list and the training label list.
for collection in (t_imgs, imgs, names):
    print(len(collection))

# 2c

In [None]:
# Put the flattened images into tables: one row per image, one column per pixel.
imgs_data, t_imgs_data = pd.DataFrame(imgs), pd.DataFrame(t_imgs)

# PCA of the training images, keeping the first 150 components.
# pc: principal components, ai: projections, qsd: eigenvalues, means: column means
pc, ai, qsd, means = pca(imgs_data, r=150)
pc

In [None]:
# Eigenvalue spectrum: the first 150 entries of `qsd`, drawn as red circle markers.
plt.plot(qsd[:150], 'ro')

In [None]:
# Turn the first 12 principal components back into 32x32 images ("eigenfaces")
# and display them as a gallery.
eg = [np.resize(component, (32, 32)) for component in pc[:12]]

test = io.imshow_collection(eg, cmap='gray')

# 2d

In [None]:
# Centre the test images on the per-pixel mean of the training images.
centered_test = t_imgs_data.sub(imgs_data.mean(axis=0), axis="columns")

In [None]:
# Classify every test face by its nearest training face in eigenface space.
#
# The original measured the distance between a test image and the eigenfaces
# themselves and then used the index of the closest eigenface as an index
# into `names`, which belongs to the training images. The comparison has to
# happen between the *projections* of test and training images.

# Columns of `basis` are the principal directions; rows of `train_coords` are
# the coordinates of the training images along those directions.
basis = np.column_stack(pc)
train_coords = np.column_stack(ai)

# pca() divides each centred pixel by the training standard deviation, so the
# centred test images need the same scaling before they are projected.
# Zero-variance pixels are divided by 1 to avoid 0/0.
pixel_std = imgs_data.std(axis=0).replace(0, 1)
test_coords = np.dot((centered_test / pixel_std).values, basis)

# distances[i, j]: Euclidean distance between test image i and training image j.
distances = distance.cdist(test_coords, train_coords, metric="euclidean")

# Printed as "<true name>/<name of the nearest training image>".
for i, nearest in enumerate(distances.argmin(axis=1)):
    print(t_names[i] + "/" + names[nearest])

In [None]:
# Nearest-neighbour classification of the test faces in eigenface space, with
# a per-image report.
#
# Fixes relative to the original cell:
#   * `centered_test[n]` selected pixel column n instead of test image n, so
#     the subtraction could not even broadcast;
#   * the distance was taken to the eigenfaces instead of to the projected
#     training images;
#   * the true label was read from `names` (training labels) instead of
#     `t_names`;
#   * the number of test images was hard-coded to 7.

# Columns of `basis` are the principal directions; rows of `train_coords` are
# the coordinates of the training images along those directions.
basis = np.column_stack(pc)
train_coords = np.column_stack(ai)

# Apply the same per-pixel scaling that pca() applied to the training images
# (zero-variance pixels are divided by 1), then project onto the basis.
pixel_std = imgs_data.std(axis=0).replace(0, 1)
test_coords = np.dot((centered_test / pixel_std).values, basis)

for n, coords in enumerate(test_coords):
    # Distance from this test image to every training image.
    dist = np.linalg.norm(train_coords - coords, axis=1)
    idx = int(np.argmin(dist))

    print("Test image " + str(n + 1) + " (label/classified): " + t_names[n] + "/" + names[idx] + ".")
