# Tworzymy model do klastrowania danych z ankiety powitalnej

In [1]:
import pandas as pd
from pycaret.clustering import setup, create_model, assign_model, plot_model, save_model, load_model, predict_model

In [15]:
df = pd.read_csv('welcome_survey_simple_v2.csv', sep=';')
df.head()

Unnamed: 0,age,edu_level,fav_animals,fav_place,gender
0,<18,Podstawowe,Brak ulubionych,,Kobieta
1,25-34,Średnie,Psy,Nad wodą,Mężczyzna
2,45-54,Wyższe,Psy,W lesie,Mężczyzna
3,35-44,Średnie,Koty,W górach,Mężczyzna
4,35-44,Wyższe,Psy,Nad wodą,Mężczyzna


In [16]:
s = setup(df, session_id=100)
s

Unnamed: 0,Description,Value
0,Session id,100
1,Original data shape,"(229, 5)"
2,Transformed data shape,"(229, 21)"
3,Categorical features,5
4,Rows with missing values,13.1%
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


<pycaret.clustering.oop.ClusteringExperiment at 0x16fcddf0510>

In [17]:
s.dataset.head()

Unnamed: 0,age,edu_level,fav_animals,fav_place,gender
0,<18,Podstawowe,Brak ulubionych,,Kobieta
1,25-34,Średnie,Psy,Nad wodą,Mężczyzna
2,45-54,Wyższe,Psy,W lesie,Mężczyzna
3,35-44,Średnie,Koty,W górach,Mężczyzna
4,35-44,Wyższe,Psy,Nad wodą,Mężczyzna


In [18]:
s.dataset_transformed.head()

Unnamed: 0,age_<18,age_25-34,age_45-54,age_35-44,age_18-24,age_>=65,age_55-64,age_unknown,edu_level_Podstawowe,edu_level_Średnie,...,fav_animals_Brak ulubionych,fav_animals_Psy,fav_animals_Koty,fav_animals_Inne,fav_animals_Koty i Psy,fav_place_Nad wodą,fav_place_W lesie,fav_place_W górach,fav_place_Inne,gender
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [19]:
kmeans = create_model('kmeans', num_clusters=8)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2206,26.7592,1.7903,0,0,0


In [20]:
df_with_clusters = assign_model(kmeans)
df_with_clusters

Unnamed: 0,age,edu_level,fav_animals,fav_place,gender,Cluster
0,<18,Podstawowe,Brak ulubionych,,Kobieta,Cluster 0
1,25-34,Średnie,Psy,Nad wodą,Mężczyzna,Cluster 0
2,45-54,Wyższe,Psy,W lesie,Mężczyzna,Cluster 1
3,35-44,Średnie,Koty,W górach,Mężczyzna,Cluster 5
4,35-44,Wyższe,Psy,Nad wodą,Mężczyzna,Cluster 4
...,...,...,...,...,...,...
224,35-44,Wyższe,Koty,Inne,Kobieta,Cluster 6
225,45-54,Wyższe,Inne,W lesie,Mężczyzna,Cluster 1
226,25-34,Wyższe,Psy,W górach,Mężczyzna,Cluster 2
227,35-44,Wyższe,Brak ulubionych,W górach,Mężczyzna,Cluster 6


In [21]:
df_with_clusters["Cluster"].value_counts()

Cluster 4    41
Cluster 1    38
Cluster 3    32
Cluster 2    30
Cluster 5    28
Cluster 6    28
Cluster 0    22
Cluster 7    10
Name: Cluster, dtype: int64

In [22]:
plot_model(kmeans, plot='cluster')

In [23]:
save_model(kmeans, 'welcome_survey_clustering_pipeline_v2', verbose=False)

(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=[], transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['age', 'edu_level', 'fav_animals',
                                              'fav_place', 'gender'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('ordinal_encoding',
                  TransformerWrapper(include=['gender'],
                                     transfo...
                                                                mapping=[{'col': 'gender',
                                                                          'data_type': dtype('O'),
                                                                          'mapping': Kobieta      0
 Mężczyzna    1
 NaN         -1
 dtype: int64}]))),
                 ('onehot_encoding',
                  TransformerWrappe

In [24]:
kmeans_pipeline = load_model('welcome_survey_clustering_pipeline_v2')
kmeans_pipeline

Transformation Pipeline and Model Successfully Loaded


In [25]:
predict_df = pd.DataFrame([
    {
        "age": "45-54",  # '<18', '25-34', '45-54', '35-44', '18-24', '>=65', '55-64', 'unknown'
        "edu_level": 'Średnie',  # 'Podstawowe', 'Średnie', 'Wyższe'
        "fav_animals": 'Brak ulubionych',  # 'Brak ulubionych', 'Psy', 'Koty', 'Inne', 'Koty i Psy'
        "fav_place": 'W lesie',  # 'Nad wodą', 'W lesie', 'W górach', 'Inne'
        "gender": 'Kobieta',  # 'Mężczyzna', 'Kobieta'
    }
])

In [26]:
predict_with_clusters_df = predict_model(kmeans_pipeline, data=predict_df)
predict_with_clusters_df["Cluster"]

0    Cluster 5
Name: Cluster, dtype: object