In [None]:
import kneed
import numpy as np
import pandas as pd
import sklearn.metrics
import sklearn.preprocessing
from matplotlib import pyplot as plt
from sklearn import cluster

In [None]:
# Read the raw per-country indicator table from disk and display it.
data = pd.read_csv(filepath_or_buffer='data/Country-data.csv')
data

In [None]:
# Print column dtypes and non-null counts for the raw table.
data.info()

In [None]:
# Count missing values per column (isna is the canonical name of isnull).
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

In [None]:
# Keep only the countries whose GDP per capita lies strictly below the 90th percentile.
gdpp_cutoff = data["gdpp"].quantile(0.9)
gdp_data = data.loc[data["gdpp"] < gdpp_cutoff]
gdp_data

In [None]:
# Per-indicator flag: columns marked False are flipped to (1 - value) after scaling.
# NOTE(review): presumably this makes "larger" mean "better" for every indicator -
# confirm the flags (e.g. 'total_fer') and that leaving 'income' out is intentional.
columns = {
    'child_mort': False,
    'exports': True,
    'health': True,
    'imports': True,
    'inflation': False,
    'life_expec': True,
    'total_fer': True,
    'gdpp': True
}

# Min-max scale every numeric column to [0, 1]; the country name is not a feature.
gdp_without_country = gdp_data.drop(columns=["country"])
scaler = sklearn.preprocessing.MinMaxScaler()
gdp_data_normalized = pd.DataFrame(
    scaler.fit_transform(gdp_without_country),
    columns=gdp_without_country.columns,
    index=gdp_without_country.index,
)

# Flip the flagged columns with a single vectorised subtraction per column.
for column, keep_direction in columns.items():
    if keep_direction:
        continue
    gdp_data_normalized[column] = 1 - gdp_data_normalized[column]

gdp_data_normalized

In [None]:
# The three scaled indicators that every clustering step below works on.
feature_columns = ["income", "gdpp", "child_mort"]
income_gdpp = gdp_data_normalized.loc[:, feature_columns]
income_gdpp

In [None]:
# Distance between consecutive points after sorting by (income, gdpp); the sorted
# curve of these gaps is used to eyeball a sensible DBSCAN eps.
income_gdpp_sort = income_gdpp.sort_values(by=["income", "gdpp"])

# Euclidean norm of the row-to-next-row difference, computed for all rows at once.
# This replaces growing the frame with DataFrame.append, which was removed in
# pandas 2.0 and was quadratic in the number of rows.
neighbour_gaps = np.linalg.norm(np.diff(income_gdpp_sort.to_numpy(), axis=0), axis=1)

# 'index' stays a string label: matplotlib then treats the x axis as categorical
# and draws the points in the (distance-sorted) order in which they are passed.
df_distance = pd.DataFrame({
    "index": [str(i) for i in range(len(neighbour_gaps))],
    "distance": neighbour_gaps,
})
df_distance = df_distance.sort_values(by=["distance"])
plt.scatter(df_distance["index"], df_distance["distance"])
plt.show()

In [None]:
# Zoom in on the small gaps: keep neighbour distances below 0.25 and re-plot them.
small_gap_mask = df_distance['distance'] < 0.25
df_distance2 = df_distance.loc[small_gap_mask].sort_values(by="distance")
plt.scatter(x=df_distance2["index"], y=df_distance2["distance"])
plt.show()

In [None]:
# Sweep DBSCAN's eps over 0.05, 0.10, ..., 0.25 and report the silhouette score of each run.
# The candidates are stored in eps_values rather than in a variable called `range`:
# rebinding the builtin broke every later range(1, 11) call and made this cell
# fail when run a second time.
eps_values = [0.05 + step * 0.05 for step in range(5)]
for eps in eps_values:
    db = sklearn.cluster.DBSCAN(eps=eps, min_samples=5).fit(income_gdpp)
    labels = db.labels_
    print(set(labels))

    # DBSCAN labels noise as -1, which silhouette_score counts as one more cluster.
    # The score is only defined for 2 .. n_samples - 1 distinct labels, so skip
    # degenerate runs instead of letting silhouette_score raise ValueError.
    n_labels = len(set(labels))
    if n_labels < 2 or n_labels >= len(labels):
        print(f"For eps={eps} silhouette score is undefined ({n_labels} distinct label(s))")
        continue

    silhoutte_avg = sklearn.metrics.silhouette_score(income_gdpp, labels)
    print(f"For eps={eps} average silhouette score is {silhoutte_avg}")

In [None]:
# Final DBSCAN labelling, shown on every pairwise projection of the three features.
# NOTE(review): min_samples=20 here differs from the min_samples=5 used in the
# eps sweep, so the silhouette scores printed there do not describe this model - confirm.
data_dbscaning = sklearn.cluster.DBSCAN(eps=0.2, min_samples=20).fit_predict(income_gdpp)

# One labelled figure per feature pair, so the three plots can be told apart.
for x_col, y_col in [("income", "gdpp"), ("child_mort", "gdpp"), ("income", "child_mort")]:
    plt.scatter(income_gdpp[x_col], income_gdpp[y_col], c=data_dbscaning,
                s=50, cmap='viridis')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f"DBSCAN clusters: {x_col} vs {y_col}")
    plt.show()

In [None]:
# K-Means settings shared by every fit in the elbow search.
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)

# Fit K-Means for k = 1..10 and record the within-cluster sum of squared errors.
cluster_counts = range(1, 11)
SSE_values = []
for k in cluster_counts:
    kmeans = cluster.KMeans(n_clusters=k, **kmeans_kwargs).fit(income_gdpp)
    SSE_values += [kmeans.inertia_]

# Elbow curve: SSE as a function of the number of clusters.
plt.style.use("fivethirtyeight")
plt.plot(cluster_counts, SSE_values)
plt.xticks(cluster_counts)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# Locate the elbow of the decreasing, convex SSE curve to pick the cluster count.
cluster_counts = range(1, 11)
kl = kneed.KneeLocator(
    cluster_counts,
    SSE_values,
    direction="decreasing",
    curve="convex",
)
clusters_num = kl.elbow
clusters_num

In [None]:
# Final K-Means fit with the cluster count chosen by the elbow search.
# Reusing kmeans_kwargs (init, n_init, max_iter, random_state) gives this model the
# same configuration as the ones scored for the SSE curve, and the fixed seed makes
# the labels reproducible across runs.
gdp_kmeans_labels = cluster.KMeans(n_clusters=clusters_num, **kmeans_kwargs).fit_predict(income_gdpp)

# One labelled figure per feature pair, so the three plots can be told apart.
for x_col, y_col in [("income", "gdpp"), ("child_mort", "gdpp"), ("income", "child_mort")]:
    plt.scatter(income_gdpp[x_col], income_gdpp[y_col], c=gdp_kmeans_labels,
                s=50, cmap='viridis')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f"K-Means clusters: {x_col} vs {y_col}")
    plt.show()

In [None]:
# Spectral clustering on a nearest-neighbours affinity graph, with k-means used to
# assign labels in the embedding. random_state is fixed (same seed as the K-Means
# elbow search) because the label assignment is otherwise randomly initialised and
# the result would change from run to run.
gdp_spectral_labels = cluster.SpectralClustering(
    n_clusters=clusters_num,
    affinity="nearest_neighbors",
    assign_labels="kmeans",
    random_state=42,
).fit_predict(income_gdpp)

# One labelled figure per feature pair, so the three plots can be told apart.
for x_col, y_col in [("income", "gdpp"), ("child_mort", "gdpp"), ("income", "child_mort")]:
    plt.scatter(income_gdpp[x_col], income_gdpp[y_col], c=gdp_spectral_labels,
                s=50, cmap='viridis')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f"Spectral clusters: {x_col} vs {y_col}")
    plt.show()

In [None]:
# Mini-batch K-Means with the cluster count chosen by the elbow search.
# random_state is fixed (same seed as the K-Means elbow search) so that the random
# initialisation and batch sampling give the same labels on every run.
gdp_mini_kmeans_labels = cluster.MiniBatchKMeans(
    n_clusters=clusters_num,
    random_state=42,
).fit_predict(income_gdpp)

# One labelled figure per feature pair, so the three plots can be told apart.
for x_col, y_col in [("income", "gdpp"), ("child_mort", "gdpp"), ("income", "child_mort")]:
    plt.scatter(income_gdpp[x_col], income_gdpp[y_col], c=gdp_mini_kmeans_labels,
                s=50, cmap='viridis')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f"Mini-batch K-Means clusters: {x_col} vs {y_col}")
    plt.show()