In [1]:
import pandas as pd
pd.set_option('display.max_rows', 300)
import rapidfuzz
from joblib import Parallel, delayed
from tqdm import tqdm
import re
import awswrangler as wr


In [2]:
def pre_match_to_similar_text(text,
                              list_of_target_text,
                              similarity_threshold) -> str:
    
    result = rapidfuzz.process.extract(text,
                                       list_of_target_text,
                                       scorer=rapidfuzz.fuzz.token_set_ratio,
                                       limit=1,
                                       score_cutoff=similarity_threshold)

    return result[0][0] if (len(result) > 0) else 'none'


def pre_grouping_text(df,
                      column_string,
                      percent_threshold,
                      similarity_threshold,
                      new_column_name):
 

    if new_column_name is None:
        new_column_name = column_string

    threshold_to_be_major = percent_threshold / 100 * len(df[column_string].unique())


    t = df[column_string].value_counts().to_frame()
    major_group_list = df[df[column_string].isin(t[t[column_string] > threshold_to_be_major].index)][column_string].unique()
    minor_group_list = df[~df[column_string].isin(major_group_list)][column_string].tolist()

    grouping_dictionary = {major_group_list[i]: major_group_list[i] for i in range(len(major_group_list))}

    result = Parallel(n_jobs=10)(delayed(pre_match_to_similar_text)(
        minor_group, major_group_list, similarity_threshold) for minor_group in tqdm(minor_group_list))
    temp = dict(zip(minor_group_list, result))

    grouping_dictionary.update(temp)
    df[new_column_name] = df[column_string].map(grouping_dictionary)

    return df


def pre_remove_position(text: str) -> str:
  
    for direction in ['lh', 'rh', 'right', 'left', 'upper']:
        text = text.lower()
        text = re.sub(direction, '', text)

    return text.strip()

In [4]:
wr.config.s3_endpoint_url = "http://192.168.1.4:8333"
bucket = "scope_part"

partDf = wr.s3.read_parquet(f"s3://{bucket}/")
partDf

Unnamed: 0,CaseID,Description,fVAL,DamageCond,CoType
0,12657878,bonnet,550.0,bent/dented,I
1,12657878,bumper front cover,350.0,creased/torn,I
2,12657878,bumper front lower grille,110.0,bracket torn,I
3,12657878,bumper reinforcement front,250.0,bent/dented,I
4,12657878,fog lamp front rh,150.0,bracket snapped,I
...,...,...,...,...,...
12453795,12908158,fuse / relay box,300.0,broken,I
12453796,12908158,airbag rotary coupling,500.0,activated,I
12453797,12908162,windscreen glass - front,309.2,,I
12453798,12908163,tint film,280.0,,I


In [5]:
wr.config.s3_endpoint_url = "http://192.168.1.4:8333"
bucket2 = "scope_case"

caseDf = wr.s3.read_parquet(f"s3://{bucket2}/", columns=["CaseID"])

In [6]:
partDf = partDf.merge(caseDf, on="CaseID")

In [7]:
# partDf["Description"].value_counts().head(100).reset_index()

In [8]:
fuzzyMatchDf = pre_grouping_text(partDf,
                         column_string='Description',
                         percent_threshold=3,
                         similarity_threshold=60,
                         new_column_name='FuzzyMatchDesc')

100%|██████████| 2831751/2831751 [00:58<00:00, 48010.00it/s]


In [9]:
fuzzyMatchDf["FuzzyMatchDesc"].value_counts().to_frame()["FuzzyMatchDesc"].sum()

7235548

In [10]:
fuzzyMatchDf["FuzzyMatchDesc"].value_counts().to_frame().head(50)["FuzzyMatchDesc"].sum()

5584984

In [11]:
mappingDf = fuzzyMatchDf["FuzzyMatchDesc"].value_counts().reset_index().rename(columns={"index" : "part", "FuzzyMatchDesc" : "count"})

In [12]:
fuzzyMatchDf[fuzzyMatchDf["FuzzyMatchDesc"] == "none"]["Description"].value_counts().reset_index().head(20)

Unnamed: 0,index,Description
0,tint film,8702
1,oil cooler gearbox,7576
2,bodyside rh,5909
3,sealant,5497
4,pas oil cooler,5175
5,bodyside lh,5006
6,pas pump,4505
7,horn,4337
8,side step lh,4255
9,fuse box,4183


In [13]:
mappingDf.to_csv("../../data/tmp/mapping.csv")

In [89]:
semanticGroupDf = pd.read_csv("../../data/tmp/complete_mapping.csv")

In [95]:
semanticGroupDf["lvl_2_desc"] = semanticGroupDf["lvl_2_desc"].str.strip()
semanticGroupDf["lvl_3_desc"] = semanticGroupDf["lvl_3_desc"].str.strip()

lvl2LabelDistribDf = semanticGroupDf.groupby('lvl_2_desc')["count"].sum().reset_index()
lvl3LabelDistribDf = semanticGroupDf.groupby('lvl_3_desc')["count"].sum().reset_index()


In [96]:
len(lvl2LabelDistribDf)

36

In [97]:
len(lvl3LabelDistribDf)

27

In [98]:
import plotly.express as px
px.bar(lvl2LabelDistribDf, x="lvl_2_desc", y="count")

In [99]:
px.bar(lvl3LabelDistribDf, x="lvl_3_desc", y="count")