In [30]:
import pandas as pd
pd.set_option('display.max_rows', 300)
import rapidfuzz
from joblib import Parallel, delayed
from tqdm import tqdm
import re
import awswrangler as wr
import boto3

In [31]:
def pre_match_to_similar_text(text,
                              list_of_target_text,
                              similarity_threshold) -> str:
    """Return the candidate most similar to ``text``, or ``'none'``.

    Candidates are scored with rapidfuzz's ``token_set_ratio``; only the
    single best candidate scoring at least ``similarity_threshold`` is
    requested.

    Args:
        text: The string to look up.
        list_of_target_text: Candidate strings to match against.
        similarity_threshold: Minimum score, passed as ``score_cutoff``.

    Returns:
        The first field of the best match, or the literal string ``'none'``
        when rapidfuzz returns no match.
    """
    best_matches = rapidfuzz.process.extract(
        text,
        list_of_target_text,
        scorer=rapidfuzz.fuzz.token_set_ratio,
        limit=1,
        score_cutoff=similarity_threshold,
    )

    # An empty result means nothing reached the cutoff.
    if len(best_matches) == 0:
        return 'none'

    top_match = best_matches[0]
    return top_match[0]


def pre_grouping_text(df,
                      column_string,
                      percent_threshold,
                      similarity_threshold,
                      new_column_name=None,
                      n_jobs=10):
    """Collapse the values of a text column onto its frequent ("major") values.

    A value is "major" when its row count exceeds ``percent_threshold`` percent
    of the number of distinct values in the column. Major values map to
    themselves. Every other ("minor") value is mapped to whatever
    ``pre_match_to_similar_text`` returns for it against the major values.

    Args:
        df: DataFrame holding ``column_string``. It is modified in place.
        column_string: Name of the column to group.
        percent_threshold: Percentage (0-100) of the distinct-value count that
            a value's row count must exceed to be treated as major.
        similarity_threshold: Minimum similarity score forwarded to
            ``pre_match_to_similar_text``.
        new_column_name: Column that receives the grouped value. ``None``
            (the default) overwrites ``column_string``.
        n_jobs: Number of joblib workers used for the fuzzy matching.

    Returns:
        The same ``df`` object, with ``new_column_name`` added or overwritten.
    """
    if new_column_name is None:
        new_column_name = column_string

    column = df[column_string]
    threshold_to_be_major = percent_threshold / 100 * len(column.unique())

    # Filter the counts Series directly. Going through `.to_frame()` and then
    # selecting `[column_string]` only works on pandas < 2.0; later versions
    # name that column "count".
    counts = column.value_counts()
    is_major = column.isin(counts[counts > threshold_to_be_major].index)

    # `.unique()` keeps first-appearance order, i.e. the same candidate order
    # the matcher was given before.
    major_group_list = column[is_major].unique()

    # Match each distinct minor value once. Matching row by row repeats the
    # same work for every duplicate and yields the same mapping.
    minor_group_list = column[~is_major].drop_duplicates().tolist()

    grouping_dictionary = {major_group: major_group for major_group in major_group_list}

    matches = Parallel(n_jobs=n_jobs)(
        delayed(pre_match_to_similar_text)(minor_group, major_group_list, similarity_threshold)
        for minor_group in tqdm(minor_group_list))
    grouping_dictionary.update(zip(minor_group_list, matches))

    df[new_column_name] = column.map(grouping_dictionary)

    return df


def pre_remove_position(text: str) -> str:
    """Lower-case ``text`` and remove standalone position words.

    The words removed are ``lh``, ``rh``, ``right``, ``left`` and ``upper``.
    They are matched on word boundaries, so words that merely contain one of
    them (e.g. "overhaul", "upright") are left intact. Whitespace directly in
    front of a removed word is removed with it, so no double spaces are left.

    Args:
        text: The description to clean.

    Returns:
        The lower-cased text without position words, stripped of leading and
        trailing whitespace.
    """
    position_pattern = r'\s*\b(?:lh|rh|right|left|upper)\b'
    return re.sub(position_pattern, '', text.lower()).strip()

In [32]:
# Point awswrangler at a custom S3 endpoint and read everything under the
# `scope_part` bucket into one DataFrame, then display it.
# NOTE(review): this cell uses host 192.168.1.4 while the later cells use
# 192.168.1.7 — confirm the two hosts are intentional.
wr.config.s3_endpoint_url = "http://192.168.1.4:8333"
bucket = "scope_part"

partDf = wr.s3.read_parquet(f"s3://{bucket}/")
partDf

  8%|▊         | 217817/2831751 [00:20<00:31, 82419.70it/s]

Unnamed: 0,CaseID,Description,fVAL,DamageCond,CoType
0,12657878,bonnet,550.0,bent/dented,I
1,12657878,bumper front cover,350.0,creased/torn,I
2,12657878,bumper front lower grille,110.0,bracket torn,I
3,12657878,bumper reinforcement front,250.0,bent/dented,I
4,12657878,fog lamp front rh,150.0,bracket snapped,I
...,...,...,...,...,...
12453795,12908158,fuse / relay box,300.0,broken,I
12453796,12908158,airbag rotary coupling,500.0,activated,I
12453797,12908162,windscreen glass - front,309.2,,I
12453798,12908163,tint film,280.0,,I


In [33]:
# Switch the awswrangler endpoint (this stays in effect for later wr.s3 calls)
# and load only the CaseID column from the `scope_case` bucket.
wr.config.s3_endpoint_url = "http://192.168.1.7:8333"
bucket2 = "scope_case"

caseDf = wr.s3.read_parquet(f"s3://{bucket2}/", columns=["CaseID"])

In [34]:
# Default (inner) merge on CaseID: keeps only part rows whose CaseID is in caseDf.
# NOTE(review): partDf is rebound here, so the unfiltered frame is gone after this
# cell. Repeated CaseIDs in caseDf would multiply part rows — confirm CaseID is unique there.
partDf = partDf.merge(caseDf, on="CaseID")

In [35]:
# partDf["Description"].value_counts().head(100).reset_index()

In [36]:
# Group infrequent Description values under frequent ones; the result goes into the
# new FuzzyMatchDesc column. pre_grouping_text adds the column to the frame it is
# given and returns that same frame, so fuzzyMatchDf and partDf are the same object.
fuzzyMatchDf = pre_grouping_text(partDf,
                         column_string='Description',
                         percent_threshold=3,
                         similarity_threshold=60,
                         new_column_name='FuzzyMatchDesc')



In [None]:
# Total number of rows carrying a FuzzyMatchDesc label. The counts Series is summed
# directly: going through `.to_frame()["FuzzyMatchDesc"]` only works on pandas < 2.0,
# because newer versions name the counts column "count".
fuzzyMatchDf["FuzzyMatchDesc"].value_counts().sum()

7235548

In [None]:
# Rows covered by the 50 most frequent FuzzyMatchDesc labels (value_counts sorts by
# descending count). Summed on the Series so the cell does not depend on how
# `.to_frame()` names the counts column, which differs across pandas versions.
fuzzyMatchDf["FuzzyMatchDesc"].value_counts().head(50).sum()

5584984

In [None]:
# Two-column table: `part` (the FuzzyMatchDesc label) and `count` (its row count).
# The columns are named explicitly instead of renaming "index"/"FuzzyMatchDesc".
# Those names only exist on pandas < 2.0; on newer versions the old rename would
# produce two columns both called "count".
mappingDf = (
    fuzzyMatchDf["FuzzyMatchDesc"]
    .value_counts()
    .rename_axis("part")
    .reset_index(name="count")
)

In [None]:
# Most frequent original descriptions that received the fallback label "none".
(
    fuzzyMatchDf.loc[fuzzyMatchDf["FuzzyMatchDesc"] == "none", "Description"]
    .value_counts()
    .reset_index()
    .head(20)
)

Unnamed: 0,index,Description
0,tint film,8702
1,oil cooler gearbox,7576
2,bodyside rh,5909
3,sealant,5497
4,pas oil cooler,5175
5,bodyside lh,5006
6,pas pump,4505
7,horn,4337
8,side step lh,4255
9,fuse box,4183


In [None]:
# Write the part/count table to disk. The DataFrame index is written as well
# (to_csv default), so the file gets an unnamed leading column.
mappingDf.to_csv("../../data/tmp/mapping.csv")

In [None]:
# Load the mapping with lvl_1/lvl_2/lvl_3 descriptions. This is a different file
# from the mapping.csv written above.
# NOTE(review): absolute, user-specific path, while the export above uses a relative
# one — consider a shared, configurable data directory.
semanticGroupDf = pd.read_csv("/home/alextay96/Desktop/new_workspace/DLDataPipeline/data/tmp/complete_view_mapping.csv")

In [None]:
# Display the loaded mapping table.
semanticGroupDf

Unnamed: 0.1,Unnamed: 0,lvl_1_desc,count,lvl_2_desc,lvl_3_desc,front_view,front_view_left
0,0,none,496523,misc,misc,1,1.0
1,1,bumper front cover,353946,bumper front,bumper front,1,1.0
2,2,headlamp assy rh,281629,headlamp rh,headlamp rh,1,0.0
3,3,headlamp assy lh,279970,headlamp lh,headlamp lh,1,1.0
4,4,radiator,245848,radiator,engine,1,1.0
5,5,bonnet,233463,bonnet,bonnet,1,1.0
6,6,bumper rear cover,187056,bumper rear,bumper rear,0,0.0
7,7,air conditioning condenser,174417,air conditioning,engine,1,1.0
8,8,front bumper,149934,bumper front,bumper front,1,1.0
9,9,fender front rh,148560,fender front rh,fender front rh,0,0.0


In [None]:
# Trim surrounding whitespace from the level-2 and level-3 labels so that labels
# differing only by padding fall into the same group below.
for _level_column in ("lvl_2_desc", "lvl_3_desc"):
    semanticGroupDf[_level_column] = semanticGroupDf[_level_column].str.strip()

# Sum of `count` per label, one table per level.
lvl2LabelDistribDf = semanticGroupDf.groupby("lvl_2_desc", as_index=False)["count"].sum()
lvl3LabelDistribDf = semanticGroupDf.groupby("lvl_3_desc", as_index=False)["count"].sum()


In [None]:
# Number of distinct lvl_2_desc labels (the grouped table has one row per label).
len(lvl2LabelDistribDf)

36

In [None]:
# Number of distinct lvl_3_desc labels (the grouped table has one row per label).
len(lvl3LabelDistribDf)

28

In [None]:
# Bar chart of the summed part count per lvl_2_desc label.
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
import plotly.express as px
px.bar(lvl2LabelDistribDf, x="lvl_2_desc", y="count")

In [None]:
# Bar chart of the summed part count per lvl_3_desc label.
px.bar(lvl3LabelDistribDf, x="lvl_3_desc", y="count")

In [None]:
# Attach the mapping columns (lvl_2_desc, lvl_3_desc, view flags, ...) to every part
# row by matching FuzzyMatchDesc against lvl_1_desc. This is the default inner join,
# so part rows whose label has no lvl_1_desc entry are dropped.
partlistLabelDf = fuzzyMatchDf.merge(
    semanticGroupDf,
    left_on="FuzzyMatchDesc",
    right_on="lvl_1_desc",
)
partlistLabelDf

Unnamed: 0.1,CaseID,Description,fVAL,DamageCond,CoType,FuzzyMatchDesc,Unnamed: 0,lvl_1_desc,count,lvl_2_desc,lvl_3_desc,front_view,front_view_left
0,12657878,bonnet,550.00,bent/dented,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
1,12657888,bonnet,510.44,bent,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
2,12657890,bonnet,504.94,dented,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
3,12657900,bonnet,450.00,crumpled,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
4,12657903,bonnet,280.00,bent,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7235543,12606480,rear panel inner trim,250.00,broken,I,rear panel inner trim,117,rear panel inner trim,12963,rear panel,rear panel,0,
7235544,12606779,firewall rear trim,225.00,,I,rear panel inner trim,117,rear panel inner trim,12963,rear panel,rear panel,0,
7235545,12607452,inner carrier rh,384.00,holder torn,I,rear panel inner trim,117,rear panel inner trim,12963,rear panel,rear panel,0,
7235546,12607489,taillamp inner lh n rh,691.00,broken,I,rear panel inner trim,117,rear panel inner trim,12963,rear panel,rear panel,0,


In [None]:

# boto3 client for the output store. It is only referenced by the commented-out
# bucket creation below; the awswrangler write does not receive it.
cli = boto3.client(
    "s3",
    endpoint_url="http://192.168.1.7:8333",
    aws_access_key_id="",
    aws_secret_access_key="",
)
outputBucketName = "partlist_label"
# cli.create_bucket(Bucket=outputBucketName)

# Write the labelled part list as a parquet dataset, overwriting the bucket contents.
wr.s3.to_parquet(
    df=partlistLabelDf,
    path=f"s3://{outputBucketName}/",
    dataset=True,
    mode="overwrite",
)


{'paths': ['s3://partlist_label/faf7115c09794db8ac1520c440af1f36.snappy.parquet'],
 'partitions_values': {}}

In [None]:

# Read the dataset back from the output bucket and display it, to check the write.
wr.s3.read_parquet(
    path=f"s3://{outputBucketName}/"
)

Unnamed: 0,CaseID,Description,fVAL,DamageCond,CoType,FuzzyMatchDesc,Unnamed:_0,lvl_1_desc,count,lvl_2_desc,lvl_3_desc,front_view,front_view_left
0,12657878,bonnet,550.00,bent/dented,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
1,12657888,bonnet,510.44,bent,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
2,12657890,bonnet,504.94,dented,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
3,12657900,bonnet,450.00,crumpled,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
4,12657903,bonnet,280.00,bent,I,bonnet,5,bonnet,233463,bonnet,bonnet,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7235543,12606480,rear panel inner trim,250.00,broken,I,rear panel inner trim,117,rear panel inner trim,12963,rear panel,rear panel,0,
7235544,12606779,firewall rear trim,225.00,,I,rear panel inner trim,117,rear panel inner trim,12963,rear panel,rear panel,0,
7235545,12607452,inner carrier rh,384.00,holder torn,I,rear panel inner trim,117,rear panel inner trim,12963,rear panel,rear panel,0,
7235546,12607489,taillamp inner lh n rh,691.00,broken,I,rear panel inner trim,117,rear panel inner trim,12963,rear panel,rear panel,0,
