In [183]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Best feature combination

### We have 13 relevant columns in our dataframe that we will test

In [184]:
df = pd.read_csv("./results/df_to_play.csv")

df.head()

Unnamed: 0,Age,Interests,Shower,WashingMachine,DishWasher,EcoMode,WaterRecycling,GeneralRecycling,Meat,Gasoline,Pet,Garden,Estimation
0,Old,0,1,1,1,1,1,0,1,1,No,No,No
1,Old,0,0,0,1,0,1,1,1,1,No,Yes,No
2,Old,2,0,0,2,2,1,1,0,1,No,No,No
3,Young,0,1,0,2,0,1,1,1,1,No,No,No
4,Old,0,1,0,2,1,0,0,1,0,No,Yes,Yes


### We want to get all the possible dataframe combinations having at least 5 features.

In [185]:
from itertools import combinations

# Minimum number of columns in each combination
min_columns = 5

# Get all combinations of column names with at least min_columns
column_combinations = []
for r in range(min_columns, len(df.columns) + 1):
    column_combinations.extend(combinations(df.columns, r))

# Create new DataFrames for each column combination
new_dataframes = []
for columns in column_combinations:
    new_df = df[list(columns)]
    new_dataframes.append(new_df)

# Number of possible dataframes
len(new_dataframes)


# We end up with 7099 different dataframes !!!

7099

In [186]:
new_dataframes[0]

Unnamed: 0,Age,Interests,Shower,WashingMachine,DishWasher
0,Old,0,1,1,1
1,Old,0,0,0,1
2,Old,2,0,0,2
3,Young,0,1,0,2
4,Old,0,1,0,2
...,...,...,...,...,...
1174,Old,2,0,2,1
1175,Young,1,2,0,1
1176,Young,2,1,1,2
1177,Old,2,2,1,2


In [187]:
new_dataframes[7098]

Unnamed: 0,Age,Interests,Shower,WashingMachine,DishWasher,EcoMode,WaterRecycling,GeneralRecycling,Meat,Gasoline,Pet,Garden,Estimation
0,Old,0,1,1,1,1,1,0,1,1,No,No,No
1,Old,0,0,0,1,0,1,1,1,1,No,Yes,No
2,Old,2,0,0,2,2,1,1,0,1,No,No,No
3,Young,0,1,0,2,0,1,1,1,1,No,No,No
4,Old,0,1,0,2,1,0,0,1,0,No,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174,Old,2,0,2,1,2,1,2,0,1,No,No,Yes
1175,Young,1,2,0,1,1,0,1,1,1,Yes,No,Yes
1176,Young,2,1,1,2,1,1,0,1,1,No,No,Yes
1177,Old,2,2,1,2,1,1,1,1,0,No,No,Yes


### Check number of combinations

In [188]:
#  C(n, r) = n! / (r! * (n-r)!)
# "n" represents the total number of items
# "r" represents the number of items taken at a time.

import math

def combinations(n,r):

    return (math.factorial(n)) / ((math.factorial(r))*(math.factorial(n-r)))

print(combinations(4,3))
print(combinations(4,2))

4.0
6.0


### Lets one-hot encode every categorical column ( Age , Estimation , Pet , Garden )

In [189]:
one_hot_encoded_dataframes = []


# Loop through each DataFrame
for df in new_dataframes:


    if "Age" not in df.columns and "Estimation" not in df.columns and "Pet" not in df.columns and "Garden" not in df.columns:

        one_hot_encoded_dataframes.append(df)
    else:

        # 4 combinations
        if "Age" in df.columns and "Estimation" in df.columns and "Pet" in df.columns and "Garden" in df.columns:
            # One-hot encode "Age" column
            df = pd.get_dummies(df, columns=["Age","Estimation","Pet","Garden"])
            one_hot_encoded_dataframes.append(df)

        # 3 combinations
        elif "Age" in df.columns and "Estimation" in df.columns and "Pet" in df.columns:
            # One-hot encode "Estimation" column
            df = pd.get_dummies(df, columns=["Age","Estimation","Pet"])
            one_hot_encoded_dataframes.append(df)

        elif "Age" in df.columns and "Estimation" in df.columns and "Garden" in df.columns:
            # One-hot encode "Estimation" column
            df = pd.get_dummies(df, columns=["Age","Estimation","Garden"])
            one_hot_encoded_dataframes.append(df)

        elif "Estimation" in df.columns and "Pet" in df.columns and "Garden" in df.columns:
            # One-hot encode "Estimation" column
            df = pd.get_dummies(df, columns=["Estimation","Pet","Garden"])
            one_hot_encoded_dataframes.append(df)
        
        elif "Pet" in df.columns and "Garden" in df.columns and "Age" in df.columns:
            # One-hot encode "Pet" column
            df = pd.get_dummies(df, columns=["Pet","Garden","Age"])
            one_hot_encoded_dataframes.append(df)


        # 2 combinations
        elif "Age" in df.columns and "Estimation" in df.columns:
            # One-hot encode "Age" column
            df = pd.get_dummies(df, columns=["Age","Estimation"])
            one_hot_encoded_dataframes.append(df) 
    
        elif "Age" in df.columns and "Pet" in df.columns:
            # One-hot encode "Estimation" column
            df = pd.get_dummies(df, columns=["Age","Pet"])
            one_hot_encoded_dataframes.append(df) 

        elif "Age" in df.columns and "Garden" in df.columns:
            # One-hot encode "Estimation" column
            df = pd.get_dummies(df, columns=["Age","Garden"])
            one_hot_encoded_dataframes.append(df) 

        elif "Estimation" in df.columns and "Pet" in df.columns:
            # One-hot encode "Estimation" column
            df = pd.get_dummies(df, columns=["Estimation","Pet"])
            one_hot_encoded_dataframes.append(df) 

        elif "Estimation" in df.columns and "Garden" in df.columns:
            # One-hot encode "Estimation" column
            df = pd.get_dummies(df, columns=["Estimation","Garden"])
            one_hot_encoded_dataframes.append(df) 

        elif "Pet" in df.columns and "Garden" in df.columns:
            # One-hot encode "Estimation" column
            df = pd.get_dummies(df, columns=["Pet","Garden"])
            one_hot_encoded_dataframes.append(df) 

        # 1 combination
        elif "Age" in df.columns:
            # One-hot encode "Age" column
            df = pd.get_dummies(df, columns=["Age"])
            one_hot_encoded_dataframes.append(df) 
    
        elif "Estimation" in df.columns:
            # One-hot encode "Estimation" column
            df = pd.get_dummies(df, columns=["Estimation"])
            one_hot_encoded_dataframes.append(df) 

        elif "Pet" in df.columns:
            # One-hot encode "Pet" column
            df = pd.get_dummies(df, columns=["Pet"])
            one_hot_encoded_dataframes.append(df) 
        elif "Garden" in df.columns:
            # One-hot encode "Garden" column
            df = pd.get_dummies(df, columns=["Garden"])
            one_hot_encoded_dataframes.append(df)


# Number of possible dataframes
len(one_hot_encoded_dataframes)


7099

In [190]:
new_dataframes[0]

Unnamed: 0,Age,Interests,Shower,WashingMachine,DishWasher
0,Old,0,1,1,1
1,Old,0,0,0,1
2,Old,2,0,0,2
3,Young,0,1,0,2
4,Old,0,1,0,2
...,...,...,...,...,...
1174,Old,2,0,2,1
1175,Young,1,2,0,1
1176,Young,2,1,1,2
1177,Old,2,2,1,2


In [191]:
one_hot_encoded_dataframes[0]

Unnamed: 0,Interests,Shower,WashingMachine,DishWasher,Age_Old,Age_Young
0,0,1,1,1,True,False
1,0,0,0,1,True,False
2,2,0,0,2,True,False
3,0,1,0,2,False,True
4,0,1,0,2,True,False
...,...,...,...,...,...,...
1174,2,0,2,1,True,False
1175,1,2,0,1,False,True
1176,2,1,1,2,False,True
1177,2,2,1,2,True,False


In [192]:
# Transform column type for k-means
for i in one_hot_encoded_dataframes:
    column_name = i.select_dtypes(include=['uint8',"object"]).columns
    i[column_name] = i[column_name].astype('int64')



### We are going to get inertia and silhouette scores of all the 7,099 dataframe combinations and append it to a new one to get the columns with the best scores.

**After 71 minutes the process is complete!**

In [193]:
import pandas as pd
from sklearn.cluster import KMeans

dfs = []

for iteration, i in enumerate(one_hot_encoded_dataframes, 1):
    k_values = range(2, 10)

    kmeans_per_k = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(i) for k in range(2, 10)]
    inertias = [model.inertia_ for model in kmeans_per_k]
    silhouette_scores = [silhouette_score(i, model.labels_) for model in kmeans_per_k]

    # Create a dataframe to store the k-values and inertia values
    data = {'Iteration': [iteration] * len(k_values), 'K-Value': k_values, 'Inertia': inertias, 'Silhouette': silhouette_scores}
    df_inertias = pd.DataFrame(data)
    dfs.append(df_inertias)

# Concatenate all the dataframes
df_combined = pd.concat(dfs, ignore_index=True)
df_combined

Unnamed: 0,Iteration,K-Value,Inertia,Silhouette
0,1,2,2219.561492,0.305796
1,1,3,1823.620850,0.277740
2,1,4,1553.773016,0.262619
3,1,5,1404.895459,0.276512
4,1,6,1309.783840,0.263626
...,...,...,...,...
56787,7099,5,4296.454532,0.101761
56788,7099,6,4128.056788,0.108583
56789,7099,7,3998.785864,0.105336
56790,7099,8,3880.837500,0.103280


In [194]:
df_combined.to_csv(r'./results/column_selection.csv', index=False)

In [196]:
all = pd.read_csv("./results/column_selection.csv")

all

Unnamed: 0,Iteration,K-Value,Inertia,Silhouette
0,1,2,2219.561492,0.305796
1,1,3,1823.620850,0.277740
2,1,4,1553.773016,0.262619
3,1,5,1404.895459,0.276512
4,1,6,1309.783840,0.263626
...,...,...,...,...
56787,7099,5,4296.454532,0.101761
56788,7099,6,4128.056788,0.108583
56789,7099,7,3998.785864,0.105336
56790,7099,8,3880.837500,0.103280


In [216]:
# Assuming you have a DataFrame named df
df_sorted = all.sort_values(by='Silhouette',ascending=False)

filtered_df = df_sorted.loc[df_sorted['K-Value'] == 2]
filtered_df

Unnamed: 0,Iteration,K-Value,Inertia,Silhouette
6320,791,2,1143.148272,0.432246
6488,812,2,1251.715181,0.426601
1160,146,2,1362.117168,0.414630
6496,813,2,1330.230645,0.414256
6336,793,2,1324.956472,0.408250
...,...,...,...,...
55984,6999,2,3628.314176,0.124680
53400,6676,2,3772.601428,0.122222
56016,7003,2,3883.372723,0.116194
56000,7001,2,3988.074476,0.115681


In [217]:
one_hot_encoded_dataframes[792]

Unnamed: 0,Interests,WaterRecycling,GeneralRecycling,Meat,Garden_No,Garden_Yes
0,0,1,0,1,True,False
1,0,1,1,1,False,True
2,2,1,1,0,True,False
3,0,1,1,1,True,False
4,0,0,0,1,False,True
...,...,...,...,...,...,...
1174,2,1,2,0,True,False
1175,1,0,1,1,True,False
1176,2,1,0,1,True,False
1177,2,1,1,1,True,False


In [210]:
df_sorted

Unnamed: 0,Iteration,K-Value,Inertia,Silhouette
10295,1287,9,287.049506,0.705906
3943,493,9,290.643187,0.704877
3927,491,9,294.023652,0.679837
3951,494,9,395.899346,0.663830
10231,1279,9,265.322195,0.662635
...,...,...,...,...
56131,7017,5,3906.768738,0.100331
56791,7099,9,3782.089533,0.099755
56783,7098,9,3338.597721,0.099585
56550,7069,8,3091.324906,0.099274


In [215]:


# df_sorted.to_excel(r'./results/column_selection_excel.xlsx',sheet_name='home', index=False)
filtered_df = df_sorted.loc[df_sorted['K-Value'] == 2]
filtered_df

Unnamed: 0,Iteration,K-Value,Inertia,Silhouette
6320,791,2,1143.148272,0.432246
6488,812,2,1251.715181,0.426601
1160,146,2,1362.117168,0.414630
6496,813,2,1330.230645,0.414256
6336,793,2,1324.956472,0.408250
...,...,...,...,...
55984,6999,2,3628.314176,0.124680
53400,6676,2,3772.601428,0.122222
56016,7003,2,3883.372723,0.116194
56000,7001,2,3988.074476,0.115681
