In [20]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

from scipy.stats import zscore
from kneed import KneeLocator
from feature_engine import transformation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Synthetic data set: 1000 samples with 60 features each, drawn around 5
# blob centres. `true` holds the generating blob label of every sample.
data, true = make_blobs(
    random_state=1,        # fixed seed: the same sample on every run
    n_samples=1000,
    n_features=60,
    centers=5,
    center_box=(0, 5),     # bounding box the centres are drawn from
    cluster_std=4,
    shuffle=True,
)

# Wrap the raw array in a DataFrame (default integer row and column labels).
df = pd.DataFrame(data)

def transform_column(col=pd.Series()) -> pd.Series:
    """Shift a column so that its smallest value becomes zero.

    Args:
        col: Series of numeric values to shift.

    Returns:
        A new series with ``col.min()`` subtracted from every element. The
        index and name of ``col`` are kept and ``col`` itself is not modified.
    """
    # Vectorised subtraction instead of a per-element lambda; this also avoids
    # rebinding the built-in name `min`.
    return col - col.min()

# Bring the data into the format we want: round every value to a whole number,
# then shift each column so that its minimum is zero.
df = df.round(0).apply(transform_column)

# Removing outliers
# A row is kept only if every one of its column-wise z-scores is within 3 in
# absolute value. The mask is positional, so the very same mask identifies the
# entries of `true` that belong to the dropped rows.
org_df = df.copy()
keep_mask = np.asarray((np.abs(zscore(df)) <= 3).all(axis=1))
df = df[keep_mask]

# Positions of the dropped rows, read straight from the mask. Recovering them
# through an outer merge of `org_df` and `df` is not reliable: the merge result
# is re-sorted by key and gets a fresh index, so its index does not give the
# original row positions and the wrong labels would be deleted.
temp = np.where(~keep_mask)[0]
true = np.delete(true, temp)

# The two preprocessing steps, applied one after the other.
transformer = transformation.YeoJohnsonTransformer()
scaler = StandardScaler()

# Step 1: Yeo-Johnson transformation for each column.
power_transformed = transformer.fit_transform(df)

# Step 2: standard scaling for all values. The scaler's output is wrapped in
# a fresh DataFrame, so row and column labels restart as default integers.
standardised = scaler.fit_transform(power_transformed)
df = pd.DataFrame(standardised)

# K-means settings. `random_state` pins the centroid initialisation so that
# the cluster labels, and everything derived from them, are the same on every
# run; without it each run numbers (and may shape) the clusters differently
# even though the input data itself is seeded.
kmeans_kwargs = {
    "init": "k-means++",
    "n_init": 10,
    "max_iter": 300,
    "n_clusters": 5,
    "random_state": 1,
}

# Doing the clustering
kmeans = KMeans(**kmeans_kwargs)
res = kmeans.fit(df).labels_        # cluster label of every row of df
centers = kmeans.cluster_centers_   # one centre per cluster

# Per-cluster column averages: clust_tag_avgs[i] is a Series holding the mean
# of every column of df over the rows assigned to cluster i.
clust_cnt = kmeans_kwargs['n_clusters']
tot_usr_count = len(res)  # total number of clustered rows (loop-invariant)
clust_tag_avgs = []
for i in range(clust_cnt):
    idxs = np.where(res == i)[0]   # positional indices of the rows in cluster i
    usr_count = len(idxs)
    # If we use the original df (raw counts) it becomes skewed
    # Vectorised division of the column sums by the cluster size.
    tag_avgs = df.iloc[idxs].sum() / usr_count
    clust_tag_avgs.append(tag_avgs)

# Turn each cluster's column averages into weights: shift them so the smallest
# average becomes 0, then divide by the total so the weights sum to 1.
# Local names are used on purpose: assigning to `min` / `sum` at cell level
# would rebind the built-ins for the rest of the kernel session and break any
# later `min(...)` / `sum(...)` call.
clust_tag_weights = []
for i in range(clust_cnt):
    shifted_avgs = clust_tag_avgs[i] - clust_tag_avgs[i].min()
    # NOTE: if all averages of a cluster are equal the total is 0 and the
    # resulting weights are NaN.
    clust_tag_weights.append(shifted_avgs / shifted_avgs.sum())

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from feature_engine.transformation import YeoJohnsonTransformer
from scipy.stats import zscore

import pickle as pkl
import pandas as pd
import numpy as np

import json

# Preprocessing stage: Yeo-Johnson transformation followed by standard scaling.
preprocessor = Pipeline(
    [
        ("transformer", YeoJohnsonTransformer()),
        ("scaler", StandardScaler())
    ]
)

# Clustering stage. It is wrapped in its own pipeline, so the fitted estimator
# is reached as pipeline['clusterer']['kmeans'].
kmeans = Pipeline(
    [
        ("kmeans",
            KMeans(
                init="k-means++",
                n_init=10,
                max_iter=300,
                n_clusters=5,
                # Pin the centroid initialisation so that the pickled model
                # and the cluster summaries derived from it are reproducible
                # from one run to the next.
                random_state=1,
            )
        )
    ]
)

# Full model: preprocess, then cluster.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("clusterer", kmeans)
    ]
)

# Synthetic data set: 1000 samples with 61 features each, drawn around 5
# blob centres. `true` holds the generating blob label of every sample.
data, true = make_blobs(
    random_state=1,        # fixed seed: the same sample on every run
    n_samples=1000,
    n_features=61,
    centers=5,
    center_box=(0, 5),     # bounding box the centres are drawn from
    cluster_std=4,
    shuffle=True,
)

# Wrap the raw array in a DataFrame (default integer row and column labels).
df = pd.DataFrame(data)

def transform_column(col=pd.Series()) -> pd.Series:
    """Shift a column so that its smallest value becomes zero.

    Args:
        col: Series of numeric values to shift.

    Returns:
        A new series with ``col.min()`` subtracted from every element. The
        index and name of ``col`` are kept and ``col`` itself is not modified.
    """
    # Vectorised subtraction instead of a per-element lambda; this also avoids
    # rebinding the built-in name `min`.
    return col - col.min()

# Bring the data into the format we want: round every value to a whole number,
# then shift each column so that its minimum is zero.
df = df.round(0).apply(transform_column)

# Removing outliers: a row survives only if every one of its column-wise
# z-scores is within 3 in absolute value.
abs_z = np.abs(zscore(df))
within_limit = (abs_z <= 3).all(axis=1)
df = df[within_limit]

# Prepare it. Here df is a DataFrame of users against their total tag counts.
pipeline.fit(df)

# Save it to a file. The context manager closes the handle even if pickling
# raises, so a failed dump cannot leave the file open.
with open('model.pkl', "wb") as file:
    pkl.dump(pipeline, file)

# Get important additional data
kmeans_step = pipeline['clusterer']['kmeans']
labels = kmeans_step.labels_   # cluster label of every row of df

# Average in the space the clusterer was fitted on. `pipeline.fit(df)` does
# not modify `df`, so `df` still holds the raw counts here; the fitted
# preprocessor is applied explicitly to get the transformed values.
# NOTE(review): this follows the comment inside the loop; confirm that
# consumers of data.pkl expect averages of the transformed values.
df_scaled = pd.DataFrame(pipeline['preprocessor'].transform(df))

tot_usr_count = len(labels)  # total number of clustered rows (loop-invariant)
clust_tag_avgs = []
# Take the cluster count from the fitted estimator instead of repeating it.
for i in range(kmeans_step.n_clusters):
    idxs = np.where(labels == i)[0]   # positional indices of the rows in cluster i
    usr_count = len(idxs)
    # If we use the original df (raw counts) it becomes skewed
    tag_avgs = df_scaled.iloc[idxs].sum() / usr_count
    clust_tag_avgs.append(tag_avgs)

# Turn each cluster's column averages into weights: shift them so the smallest
# average becomes 0, then divide by the total so the weights sum to 1.
# Local names are used on purpose: assigning to `min` / `sum` at cell level
# would rebind the built-ins for the rest of the kernel session and break any
# later `min(...)` / `sum(...)` call.
clust_tag_weights = []
# Iterate over the averages themselves rather than a hard-coded cluster count.
for cluster_avgs in clust_tag_avgs:
    shifted_avgs = cluster_avgs - cluster_avgs.min()
    # NOTE: if all averages of a cluster are equal the total is 0 and the
    # resulting weights are NaN.
    clust_tag_weights.append(shifted_avgs / shifted_avgs.sum())

# Per-cluster summaries saved alongside the model. Each value is a list with
# one Series per cluster; the dict is pickled as-is.
clust_data = {
    'cluster_avgs_': clust_tag_avgs,
    'cluster_weights_': clust_tag_weights
}

# Save important additional data. The context manager closes the handle even
# if pickling raises.
with open('data.pkl', 'wb') as file:
    pkl.dump(clust_data, file)
