In [None]:
!pip install fuzzy-c-means

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Read the preprocessed dataset into a DataFrame (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer="../data/preprocessed_data.csv")

In [3]:
from skfuzzy import cmeans

In [4]:
def cluster_data(df, features=None, k_range=range(2, 11), max_samples=10000, verbose=True,
                 random_state=None):
    """Cluster ``df`` with Fuzzy C-Means, choosing k by silhouette score on a sample.

    A random sample of at most ``max_samples`` rows is clustered once for every
    candidate k. Each fit is converted to hard labels (arg-max membership) and
    scored with the silhouette coefficient. The fit with the highest score
    labels the sample, and every row outside the sample is assigned with
    ``cmeans_predict`` using the *same* cluster centers, so cluster ids mean
    the same thing across the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Columns used for clustering must be numeric.
    features : sequence of str, optional
        Columns to cluster on. ``None`` (default) uses every column of ``df``.
    k_range : iterable of int
        Candidate numbers of clusters.
    max_samples : int
        Upper bound on the number of rows used to fit and score the models.
    verbose : bool
        When true, print one progress line per candidate k and the chosen k.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` and to skfuzzy's ``seed``
        argument. ``None`` keeps the run non-deterministic.

    Returns
    -------
    df_clustered : pandas.DataFrame
        Copy of ``df`` with an added ``'fuzzy_cluster'`` column.
    best_k : int
        Candidate k with the highest silhouette score (first one on ties).
    scores : list of float
        Silhouette score per candidate, in ``k_range`` order. A fit whose hard
        labels collapse to fewer than two clusters is scored ``-1.0``.

    Raises
    ------
    ValueError
        If ``df`` has no rows or ``k_range`` is empty.
    """
    # Function-scope import: only `cmeans` is imported at file level.
    from skfuzzy.cluster import cmeans_predict

    # FCM settings shared by fitting and prediction so both use the same model.
    fuzziness, tolerance, max_iter = 2, 0.005, 1000

    # Materialise k_range so generators and other non-indexable iterables work.
    candidate_ks = list(k_range)
    if not candidate_ks:
        raise ValueError("k_range must contain at least one candidate k")
    if len(df) == 0:
        raise ValueError("df must contain at least one row")

    feature_frame = df if features is None else df[list(features)]

    # Sample of Data
    sample_size = min(len(df), max_samples)
    samples = feature_frame.sample(n=sample_size, random_state=random_state)
    sample_matrix = samples.values.T  # cmeans is called with features on axis 0

    # Search for best K, keeping every fit so the winner is never re-trained
    scores = []
    fits = {}
    for k in candidate_ks:
        cntr, u, _, _, _, _, fpc = cmeans(
            sample_matrix, k, fuzziness,
            error=tolerance, maxiter=max_iter, init=None, seed=random_state,
        )

        # Hard labels: the cluster with the highest membership for each point
        labels = np.argmax(u, axis=0)
        fits[k] = (cntr, labels)

        # silhouette_score needs between 2 and n_samples - 1 distinct labels;
        # a collapsed fit gets the worst possible score instead of raising.
        n_found = len(np.unique(labels))
        if 2 <= n_found < sample_size:
            score = silhouette_score(samples, labels)
        else:
            score = -1.0
        scores.append(score)

        if verbose:
            print(f"k={k}: silhouette={score:.4f}, fpc={fpc:.4f}, clusters found={n_found}")

    # Find optimal k (first maximum wins on ties)
    best_k = candidate_ks[scores.index(max(scores))]
    best_centers, best_labels = fits[best_k]
    if verbose:
        print(f"best k = {best_k}")

    df_clustered = df.copy()

    # Add fuzzy cluster column for the sampled rows
    df_clustered.loc[samples.index, 'fuzzy_cluster'] = best_labels

    if len(df_clustered) > sample_size:
        # Rows that were not part of the sample
        remaining_indices = df_clustered.index.difference(samples.index)
        remaining_data = feature_frame.loc[remaining_indices]

        # Memberships w.r.t. the trained centers. Fitting a fresh model here
        # would produce cluster ids unrelated to the ones assigned above.
        remaining_u = cmeans_predict(
            remaining_data.values.T, best_centers, fuzziness,
            error=tolerance, maxiter=max_iter, seed=random_state,
        )[0]
        df_clustered.loc[remaining_indices, 'fuzzy_cluster'] = np.argmax(remaining_u, axis=0)

    return df_clustered, best_k, scores

In [5]:
# Separate the target labels from the columns that are fed to the clustering step
Y = data['Category']
X = data.drop('Category', axis=1)

# Returns the labelled frame, the selected number of clusters, and the per-k silhouette scores
df_clustered, best_k, scores = cluster_data(X)

In [8]:
# Re-attach the target labels (aligned on the index), then list the resulting columns
df_clustered = df_clustered.assign(Category=Y)
df_clustered.columns

Index(['DayOfWeek', 'PdDistrict', 'X', 'Y', 'Day', 'Month', 'Year', 'Hour',
       'Minute', 'Block', 'fuzzy_cluster', 'Category'],
      dtype='object')

In [10]:
# Sanity check: (rows, columns) of the clustered frame
df_clustered.shape

(878049, 12)

In [9]:
# Persist the clustered frame to CSV, leaving the DataFrame index out of the file
output_path = "../data/Fuzzy_data.csv"
df_clustered.to_csv(output_path, index=False)