In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
%matplotlib inline

# Notebook-wide pandas display settings: show every column and up to 1000 rows.
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 1000)):
    pd.set_option(option_name, option_value)

import sklearn.metrics as metrics

In [None]:
# When True, `merged` is rebuilt from the individual per-place CSV files;
# when False, the pre-merged "../datasets/several_places.csv" is loaded instead.
FLAG_MERGE = False

In [None]:
if FLAG_MERGE:
    # (csv path, numeric place id) for every dataset that goes into the merge.
    places = [("../datasets/2019-1 BrasiliaBlocoC.csv", 1),
    ("../datasets/2019-1 BrasiliaBlocoK.csv", 2),
    ("../datasets/2019-1 Canoinhas.csv", 3),
    ("../datasets/2019-1 Continente.csv", 4),
    ("../datasets/2019-1 Fpolis.csv", 5),
    ("../datasets/2019-1 Lages.csv", 6),
    ("../datasets/2019-1 Reitoria.csv", 7)]

    # Collect the per-place frames and concatenate once: growing a DataFrame
    # with pd.concat inside the loop copies all previous rows on every
    # iteration, and concatenating onto an empty DataFrame is deprecated in
    # recent pandas releases.
    frames = []
    for csv_path, place_id in places:
        raw = pd.read_csv(csv_path, sep=',')
        raw = raw.drop('momento', axis=1)
        raw['place'] = place_id  # tag every row with the place it came from
        frames.append(raw)
    merged = pd.concat(frames, ignore_index=True)

    # Discard incomplete rows, then keep only rows whose 'pa' value is below 250.
    merged = merged.dropna()
    merged = merged[merged['pa'] < 250]
else:
    merged = pd.read_csv("../datasets/several_places.csv", sep=',')

In [None]:
def plot_cm(y_true, y_pred, figsize=(10,10)):
    """Draw an annotated confusion-matrix heatmap.

    Rows are the actual classes and columns the predicted ones. Both axes use
    the sorted unique values of ``y_true``, so predictions whose value never
    occurs in ``y_true`` do not appear in the matrix. Diagonal cells are
    annotated with the row percentage and ``count/row total``; off-diagonal
    cells show the row percentage and count, and are left blank when the
    count is zero.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    figsize : tuple, default (10, 10)
        Figure size forwarded to ``plt.subplots``.
    """
    labels = np.unique(y_true)
    cm = metrics.confusion_matrix(y_true, y_pred, labels=labels)
    # 1-D row totals, so that indexing yields a scalar for the '%d' format
    # (formatting a 1-element array with '%d' is deprecated in NumPy >= 1.25).
    row_totals = cm.sum(axis=1)
    cm_perc = cm / row_totals[:, np.newaxis].astype(float) * 100

    # Object dtype instead of a fixed-width string array: the latter silently
    # truncates annotations longer than the width derived from the int dtype.
    annot = np.full(cm.shape, '', dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, row_totals[i])
            elif c != 0:
                annot[i, j] = '%.1f%%\n%d' % (p, c)

    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)

    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cm, cmap= "YlGnBu", annot=annot, fmt='', ax=ax, linewidths=.5)

# EDA and preprocessing

In [None]:
# Summary statistics (pandas `describe`) of the merged dataset.
merged.describe()

In [None]:
# Features: everything except the label ('uid') and the 'momento' column.
# errors='ignore' because the FLAG_MERGE path has already dropped 'momento',
# so a strict drop would raise KeyError there.
X = merged.drop(columns=['uid', 'momento'], errors='ignore')
y = merged['uid']

# Min-max normalization to [0, 1], column by column.
feature_min = X.min()
feature_range = X.max() - feature_min
# A constant column has zero range; dividing by 1 instead maps it to 0.0
# rather than producing NaN, which PCA and K-means would reject.
normalized = (X - feature_min) / feature_range.replace(0, 1)

In [None]:
# Two side-by-side scatter plots sharing the 'p3' axis, coloured by label.
# The very low alpha keeps dense regions readable.
label_style = dict(c=y, cmap='rainbow', alpha=0.01)
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(10, 6))
ax1.scatter(X['fi_3'], X['p3'], **label_style)
ax2.scatter(X['voltage'], X['p3'], **label_style)

In [None]:
# Peek at the last few label values.
y.tail()

# PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on the normalized features and project them.
pca = PCA(n_components=2)
pca.fit(normalized)
x_pca = pca.transform(normalized)

# Scatter of the two principal components, coloured by the true label.
first_pc = x_pca[:, 0]
second_pc = x_pca[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(first_pc, second_pc, c=y, cmap='plasma', alpha=0.1)
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')

# K-means

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit K-means for k = 1..9 and record the inertia of each fit.
sse = {}
for k in range(1, 10):
    # Fixed random_state and explicit n_init keep the curve reproducible across
    # runs and scikit-learn versions (the n_init default differs by version).
    model = KMeans(n_clusters=k, max_iter=1000, n_init=10, random_state=42).fit(normalized)
    # Inertia: sum of squared distances of samples to their closest cluster center.
    sse[k] = model.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

In [None]:
# Final K-means model with 4 clusters. Seeded (and with an explicit n_init) so
# the cluster ids and every figure derived from them are stable between runs.
model1 = KMeans(n_clusters=4, n_init=10, random_state=42)
model1.fit(normalized)

In [None]:
# K-means cluster ids are arbitrary integers and carry no relation to the
# values in `y`, so comparing them directly is meaningless. Relabel every
# cluster with the most frequent true label among its members, then score.
cluster_ids = model1.labels_
majority_label = (
    pd.Series(y.to_numpy())
    .groupby(cluster_ids)
    .agg(lambda members: members.mode().iloc[0])
)
y_pred_kmeans = majority_label.loc[cluster_ids].to_numpy()

plot_cm(y, y_pred_kmeans)
print(metrics.classification_report(y, y_pred_kmeans))

In [None]:
# Side-by-side PCA projections: true labels (left) vs. K-means clusters (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
first_pc, second_pc = x_pca[:, 0], x_pca[:, 1]

axes[0].scatter(first_pc, second_pc, c=y, cmap='plasma', alpha=0.1)
axes[0].set(xlabel='First principal component',
            ylabel='Second Principal Component',
            title="Cor = câmpus (original)")

axes[1].scatter(first_pc, second_pc, c=model1.labels_, cmap='plasma', alpha=0.1)
axes[1].set(xlabel='First principal component',
            ylabel='Second Principal Component')
axes[1].set_title("Cor = resultado do clustering")

# OPTICS

In [None]:
from sklearn.cluster import OPTICS

In [None]:
# Density-based clustering of the normalized features with OPTICS.
# NOTE(review): per the scikit-learn docs, min_cluster_size appears to be used
# only with cluster_method='xi', and no eps is given for the 'dbscan'
# extraction here - confirm these settings are intended.
model2 = OPTICS(cluster_method='dbscan', min_cluster_size=30)
model2.fit(normalized)

In [None]:
# Two scatter plots sharing the 'pa' axis, coloured by OPTICS cluster label.
cluster_style = dict(c=model2.labels_, cmap='rainbow', alpha=0.01)
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(10, 6))
ax1.scatter(X['temp_celsius'], X['pa'], **cluster_style)
ax2.scatter(X['pressao'], X['pa'], **cluster_style)


In [None]:

# Number of samples assigned to each OPTICS label.
unique, counts = np.unique(model2.labels_, return_counts=True)
{label: size for label, size in zip(unique, counts)}

In [None]:
# Summary statistics of the full dataset for each OPTICS label.
for cluster in unique:
    print(merged[model2.labels_ == cluster].describe())

# For each OPTICS label, collect the summary statistics of the true labels
# (`y`) of the samples assigned to it.
res = []
for cluster in unique:
    y_hat = y[model2.labels_ == cluster]
    res.append(pd.DataFrame(y_hat).describe().values)