In [1]:
pip install fuzzywuzzy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import re
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from fuzzywuzzy import fuzz
from nltk.corpus import wordnet
from collections import defaultdict

In [3]:
df = pd.read_csv('mdc-1.csv')
df.tail(10) #display last 10 rows in data

Unnamed: 0,id,activity_id,name
3426,3192802,MDC1-10606,COLO-2 Cell-4 UPS PNL01 Compass issues Prevent...
3427,3194335,MDC1.C1.Cx3020.1,Additional Go-Back loadbanking (ASCO)
3428,3192808,MDC1-10607,6.1-98 Admin/COLO-1 ASCO EPMS Connectivity Issues
3429,3192809,MDC1-10608,6.1-88 Admin/ COLO-1 MER/ADMIN UPS L3 Testing...
3430,3192810,MDC1-10609,6.1-106 COLO-1 Cell-2 UPS01 Power module failure
3431,3192812,MDC1-10610,COLO-2 Cell-1 Open Issues preventing Yellow ta...
3432,3192818,MDC1-10611,COLO-2 Cell-2 Open Issues preventing Yellow ta...
3433,3192813,MDC1-10612,COLO-2 Cell-3 Open Issues preventing Yellow ta...
3434,3192814,MDC1-10613,COLO-2 Cell-4 Open Issues preventing Yellow ta...
3435,3192816,MDC1-10614,COLO-3 Delay in SE uploading QC Checkilst


In [4]:
df.head(10)

Unnamed: 0,id,activity_id,name
0,3191380,MDC1-UP#13,MDC1 Master Update #13- DD 8.31.2021Submitted
1,3191381,MDC1-UP#13.1,Executive Summary / Milestones
2,3192817,MDC1-UP#13.5,Construction
3,3191388,MDC1-UP#13.2,Construction Summary
4,3191673,MDC1-UP#13.4,Preconstruction
5,3191599,MDC1-UP#13.3,Preliminary Milestones
6,3192002,MDC1-UP#13.6,DELAY
7,3194559,MDC1-UP#13.7,VOID ACTIVITIES
8,3191393,MDC1-UP#13.8,CONTRACTUAL MILESTONES - DIV 1 SPECS
9,3192819,MDC1-UP#13.5.1,Site


In [5]:
df = df.drop(['id'], axis=1)
df.head()

Unnamed: 0,activity_id,name
0,MDC1-UP#13,MDC1 Master Update #13- DD 8.31.2021Submitted
1,MDC1-UP#13.1,Executive Summary / Milestones
2,MDC1-UP#13.5,Construction
3,MDC1-UP#13.2,Construction Summary
4,MDC1-UP#13.4,Preconstruction


In [6]:
def cluster_activities(data, threshold=70):
    """
    Clusters similar activities together using fuzzy string matching.

    Parameters:
        data (list): A list of strings representing the data to be clustered.
        threshold (int, optional): The minimum similarity ratio for two activities
            to be considered a match. Defaults to 80.

    Returns:
        A dictionary where each key is a cluster number and the corresponding value
        is a list of data items in that cluster.
    """
    clusters = defaultdict(list)
    cluster_num = 1

    for i, activity in enumerate(data):
        # Check if the activity is already in a cluster
        if any(activity in cluster for cluster in clusters.values()):
            continue

        # Create a new cluster for the activity
        clusters[cluster_num].append(activity)

        # Compare the activity to every other activity to find matches
        for j in range(i+1, len(data)):
            ratio = fuzz.ratio(activity, data[j])
            if ratio >= threshold:
                # Add the matching activity to the same cluster as the original activity
                clusters[cluster_num].append(data[j])

        # Check if the cluster meets the criteria for inclusion
        if len(clusters[cluster_num]) == 1 or len(set(clusters[cluster_num])) == 1:
            # If the cluster only has one item or all items are identical, remove it
            del clusters[cluster_num]
        else:
            # Increment the cluster number if the cluster is valid
            cluster_num += 1

    return clusters


In [7]:
data = df["name"]

clusters = cluster_activities(data)

# Print the non-trivial clusters
for cluster_num, cluster in clusters.items():
    if len(cluster) > 1:
        print(f"Cluster#{cluster_num}:")
        for item in cluster:
            print(item)
        print()


Cluster#1:
Construction
Construction Summary
Preconstruction

Cluster#2:
Electrical Procurement
Mechanical Procurement
Electrical OFCI Procurement

Cluster#3:
General
Generator
Generator

Cluster#4:
Admin/COLO1
Admin/ COLO 1
Admin/COLO 1
Admin/ COLO-1
Admin/COLO-1
IST Admin/COLO 1
TCO Admin/COLO 1

Cluster#5:
COLO 2
COLO 3
COLO 4
COLO-2
COLO 2
COLO 3
COLO 4
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO 2
COLO 3
COLO 4
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
COLO-2
IST COLO 2
TCO COLO 2
TCO COLO 2

Cluster#6:
Core & Shell Procurement
Wire Mesh Procurement

Cluster#7:
Update #1
Update #2
Update #3
Update#4
Update#6
Update#7
Update#8
Update#10
Update#12

Cluster#8:
ADMIN/ COLO-1
ADMIN/COLO-1

Cluster#9:
COLO-3
COLO-4
COLO-1
COLO 3
COLO-3 Cx
COLO-2
COLO-3
COLO-4
COLO-1
COLO-2
COLO-3
COLO-4
COLO-1
COLO-2
COLO-3
COLO-4
COLO-1
COLO-2
COLO-3
COLO-4
COLO-1
COLO-2
COLO-3
COLO

In [11]:
# Apply the clustering function to the name column and create a dictionary of clusters
clusters = cluster_activities(df["name"])

# Create a new column in the DataFrame called "cluster" and set the default value to -1
df["cluster"] = -1

# Loop over each cluster number and cluster in the clusters dictionary
for cluster_num, cluster in clusters.items():
    # Loop over each item in the cluster and set the "cluster" column to the cluster number
    for item in cluster:
        df.loc[df["name"] == item, "cluster"] = f"Cluster #{cluster_num}"


In [12]:
# Replace -1 values in the cluster column with NaN
df["cluster"].replace(-1, np.nan, inplace=True)

# Replace cluster numbers with cluster labels
df["cluster"].replace(regex={r'^Cluster #(\d+)$': r'Cluster \1'}, inplace=True)

In [13]:
df.head(20)

Unnamed: 0,activity_id,name,cluster
0,MDC1-UP#13,MDC1 Master Update #13- DD 8.31.2021Submitted,
1,MDC1-UP#13.1,Executive Summary / Milestones,
2,MDC1-UP#13.5,Construction,Cluster 1
3,MDC1-UP#13.2,Construction Summary,Cluster 1
4,MDC1-UP#13.4,Preconstruction,Cluster 1
5,MDC1-UP#13.3,Preliminary Milestones,
6,MDC1-UP#13.6,DELAY,
7,MDC1-UP#13.7,VOID ACTIVITIES,
8,MDC1-UP#13.8,CONTRACTUAL MILESTONES - DIV 1 SPECS,
9,MDC1-UP#13.5.1,Site,


In [14]:
df.to_csv("fuzzy_clusters.csv")