In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import scipy
import gc
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import DistanceMetric
# from sklearn.neighbors import ja
from sklearn.metrics import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Display configuration: wide console output, no row/column truncation.
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# None (not -1) is the supported way to disable cell-width truncation; a
# negative value is deprecated since pandas 1.0 and emits a warning.
pd.set_option('display.max_colwidth', None)

  pd.set_option('max_colwidth', -1)


In [3]:
import re


class TextTransform(object):
    """Rule-based normaliser for short product titles.

    ``transform`` lower-cases the text, removes HTML-like tags, replaces every
    character listed in ``filters`` with ``split``, separates digits glued to a
    unit or connector word (``100ml`` -> ``100 ml``, ``3in1`` -> ``3 in 1``) and
    collapses whitespace. ``transform_morpho`` is a separate pass that replaces
    alphanumeric codes with placeholder tokens; ``transform`` does not call it.
    """

    def __init__(self, split=" ", filters="!'#$%&()*+,.:;<=>?@[\\]/^_`{|}~\t\n", phrase_model=None, morpho=True,remove_digit=False,
                 remove_short=False):
        """
        :param split: replacement for every filtered character. It must be a
            single character because it is expanded to the length of
            ``filters`` and handed to ``str.maketrans``.
        :param filters: characters to replace with ``split``. The default set
            contains neither the hyphen nor the double quote.
        :param phrase_model: optional object applied as ``phrase_model[tokens]``
            and expected to yield tokens (presumably a gensim phraser - confirm).
        :param morpho: stored on the instance; ``transform`` does not read it.
        :param remove_digit: drop digit-led tokens and bare digit runs.
        :param remove_short: drop tokens of length 1.
        """
        self.split = split
        self.filters = filters
        self.translate_map = str.maketrans(filters, split * len(filters))
        # Placeholder tokens emitted by transform_morpho (space padded so they
        # never fuse with neighbouring text).
        self.ALPHADIGIT = " alphadigit ".upper()
        self.DIGITALPHA = " digitalpha ".upper()
        self.DIGIT = " digit ".upper()
        # A digit run immediately followed by a unit; longer alternatives come
        # first so "pcs" wins over "pc" and "ml" over "l".
        self.QUANTITY_FORMAT = r"(\d+)(ml|l|pcs|kg|mg|pack|inc|pc)"
        self.QUANTITY = " QUANTITY ".upper()
        self.ALPHA_ALPHADIGIT = " alpha-alphadigit ".upper()
        self.phraser = phrase_model
        self.morpho = morpho
        self.remove_short = remove_short
        self.remove_digit = remove_digit

    def transform(self, text, remove_short=False):
        """Return the normalised form of ``text``.

        Steps, in order: lower-case; replace ``<...>`` tags with a space;
        replace filtered characters with ``split`` and strip; split
        digit+unit and digit+connector+digit groups; optionally remove digits
        and one-character tokens; optionally apply the phrase model; remove
        double quotes; collapse whitespace and strip.

        :param text: raw string.
        :param remove_short: per-call switch for dropping one-character
            tokens; it is OR-ed with the instance-level ``remove_short``.
        :return: cleaned string without leading or trailing whitespace.
        """
        text = text.lower()
        # Remove html tags (non-greedy so each tag is matched separately).
        text = re.sub(r"<.*?>", " ", text)

        # Replace the filtered punctuation; '-' and '"' are not in the default set.
        text = text.translate(self.translate_map).strip()
        text = re.sub(self.QUANTITY_FORMAT, r"\g<1> \g<2>", text)
        text = re.sub(r"(\d)(in|for|with|by)(\d)", r"\g<1> \g<2> \g<3>", text)

        if self.remove_digit:
            text = re.sub(r"[0-9]+[a-zA-Z]+", '', text)
            text = re.sub(r'[0-9]+', ' ', text)
        # Honour both the constructor flag and the per-call argument; the
        # argument used to be accepted but ignored.
        if self.remove_short or remove_short:
            text = " ".join(self.tokenizer(text))

        if self.phraser:
            text = " ".join(self.phraser[text.split()])

        # Double quotes are not part of `filters`, so drop them explicitly.
        text = text.replace('"', '')
        # Collapse whitespace runs; strip because removing quotes or digits at
        # the edges can leave a leading/trailing space behind.
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def transform_morpho(self, text):
        """Replace alphanumeric patterns in ``text`` with placeholder tokens.

        Substitutions run in a fixed order, so earlier (more specific)
        patterns take precedence over the later, broader ones.

        :param text: raw string; it is lower-cased first.
        :return: string with matches replaced by the upper-case placeholders.
        """
        text = text.lower()
        text = re.sub(r"[a-zA-Z]{1,6}[-][0-9a-zA-Z]{2,18}", self.ALPHA_ALPHADIGIT, text)
        text = re.sub(r"[a-zA-Z]+[0-9]+", self.ALPHADIGIT, text)
        text = re.sub(r"[0-9]{1,5}[.][0-9a-zA-Z]+", self.DIGITALPHA, text)
        text = re.sub(r"[0-9]+[a-zA-Z]+", self.DIGITALPHA, text)
        text = re.sub(r'[0-9]+', self.DIGIT, text)
        return text

    def tokenizer(self, text):
        """Split on whitespace and keep only tokens longer than one character.

        :param text: string to tokenise.
        :return: list of tokens.
        """
        return [word for word in text.split() if len(word) > 1]

# if __name__ == '__main__':
#     tt = TextTransform(morpho=True)
#     text = "MÁY_ÉP TRÁI_CÂY PANASONIC MJ-DJ01SRA"
#     print(tt.transform(text))


In [4]:
# Load the training listings and order them by label group (ascending, the
# sort_values default) so postings of the same product sit next to each other.
df = pd.read_csv("train.csv").sort_values("label_group")

In [5]:
# Ground truth: for every row, the space-joined posting ids that share its
# label_group (the row's own id included).
tmp = df.groupby(['label_group'])['posting_id'].unique().to_dict()
df['matches'] = df['label_group'].map(tmp).apply(' '.join)

In [6]:
# Sanity-check the frame after adding the space-joined `matches` column.
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches
3874,train_1646767365,1d7aadc7503b2b4539cc9a5fe41979dd.jpg,e925873ed09cd08f,Sarung celana wadimor original 100% dewasa dan anak hitam dan putih polos,258047,train_1646767365 train_1528423085 train_398181303
31859,train_1528423085,eec692257e74fcbc6cb63cb76d0f20e7.jpg,ea97861c926a71e3,WARNA RANDOM ACAK Sarung Celana Wadimor MURAH Celana Sarung WADIMOR,258047,train_1646767365 train_1528423085 train_398181303
6738,train_398181303,3301b8aaccea93d1098995ffbc537335.jpg,e9b5833e929e909c,SARUNG CELANA WADIMOR DEWASA HITAM POLOS SARCEL,258047,train_1646767365 train_1528423085 train_398181303
7613,train_2865605743,3977f4e7a47c73568c5e9fcb61723b4b.jpg,bfc3cc1cc636c14c,Wall Sticker / WallSticker - Submarine Measure - SK9222,297977,train_2865605743 train_1382500866
12367,train_1382500866,5d075d7eaa258052ab125c75c06293d6.jpg,838436c07dff19e4,RELIZA WALL STICKER PENGUKUR TINGGI BADAN JERAPAH STIKER DINDING XL8337,297977,train_2865605743 train_1382500866


In [7]:
# `morpho` is only stored on the instance: the morpho step inside transform()
# is commented out, so this flag does not change the cleaned text.
tt = TextTransform(morpho=False)

In [8]:
# Normalised titles. NOTE(review): the TF-IDF cells in this notebook are fitted
# on "title", not on this column - confirm that is intended.
df["clean_txt"] = df["title"].map(tt.transform)

In [9]:
# Compare raw and cleaned titles side by side for the first 100 rows.
df[["title","clean_txt","label_group"]].head(100)

Unnamed: 0,title,clean_txt,label_group
3874,Sarung celana wadimor original 100% dewasa dan anak hitam dan putih polos,sarung celana wadimor original 100 dewasa dan anak hitam dan putih polos,258047
31859,WARNA RANDOM ACAK Sarung Celana Wadimor MURAH Celana Sarung WADIMOR,warna random acak sarung celana wadimor murah celana sarung wadimor,258047
6738,SARUNG CELANA WADIMOR DEWASA HITAM POLOS SARCEL,sarung celana wadimor dewasa hitam polos sarcel,258047
7613,Wall Sticker / WallSticker - Submarine Measure - SK9222,wall sticker wallsticker - submarine measure - sk9222,297977
12367,RELIZA WALL STICKER PENGUKUR TINGGI BADAN JERAPAH STIKER DINDING XL8337,reliza wall sticker pengukur tinggi badan jerapah stiker dinding xl8337,297977
15610,LVN COLLAGEN - ORIGINAL TERMURAH - LVN STROBERI - GARANSI UANG KEMBALI,lvn collagen - original termurah - lvn stroberi - garansi uang kembali,645628
32085,LVN COLLAGEN LVN STROBERI ORIGINAL 100% 1BOX ISI 10 SACHET,lvn collagen lvn stroberi original 100 1box isi 10 sachet,645628
27972,LVN COLLAGEN / STROBERI,lvn collagen stroberi,645628
11716,GROSIR LVN COLLAGEN / COLAGEN STROBERI PEMUTIH KULIT 1 BOX ISI 10 SACHET,grosir lvn collagen colagen stroberi pemutih kulit 1 box isi 10 sachet,645628
15195,TERMURAH LVN COLLAGEN STROBERI 1 BOX 10 SACHET,termurah lvn collagen stroberi 1 box 10 sachet,645628


In [10]:
def jaccard_metric(s1, s2):
    """Jaccard similarity between the whitespace-token sets of two strings.

    :param s1: first string.
    :param s2: second string.
    :return: ``|A & B| / |A | B|`` as a float in [0, 1]. When neither string
        contains a token the union is empty; by the usual convention the
        similarity of two empty sets is 1.0 (previously this case raised
        ``ZeroDivisionError``).
    """
    s1_tokens = set(s1.split())
    s2_tokens = set(s2.split())
    all_tokens = s1_tokens | s2_tokens
    if not all_tokens:
        return 1.0
    return len(s1_tokens & s2_tokens) / len(all_tokens)
def jaccard_similarities(mat):
    """Pairwise Jaccard similarity between the columns of a sparse matrix.

    NOTE(review): assumes ``mat`` is a scipy.sparse *matrix* (csr/csc) holding
    0/1 values, so that ``mat.T * mat`` is a matrix product whose entries are
    intersection sizes - confirm against the caller; no call is visible in
    this section of the notebook.

    :param mat: sparse matrix; each column is treated as one set.
    :return: sparse matrix with the same stored pattern as ``mat.T * mat``;
        every stored value is divided by (size_a + size_b - stored value).
    """
    mat = mat.astype("float64")
    # Number of stored entries per column, used as each column's set size.
    cols_sum = mat.getnnz(axis=0)
    ab = mat.T * mat

    # For each stored entry of ab: the set size taken from the entry's position
    # along axis 0 counts (cols_sum[k] repeated once per stored entry).
    aa = np.repeat(cols_sum, ab.getnnz(axis=0))
    # For each stored entry of ab: the set size looked up through ab.indices.
    # presumably valid for either orientation because ab is symmetric - verify.
    bb = cols_sum[ab.indices]

    similarities = ab.copy()
    # In-place on the copy's data array: intersection / (|A| + |B| - intersection).
    similarities.data /= (aa + bb - ab.data)

    return similarities
# def jaccard_dist(mat):
#     return  - jaccard_similarities(mat)

In [11]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two aligned Series of space-separated id strings.

    Each string is split on whitespace and treated as a set; the score for a
    row is ``2 * |true & pred| / (|true| + |pred|)``.

    :param y_true: Series of ground-truth id strings.
    :param y_pred: Series of predicted id strings, paired positionally.
    :return: numpy array with one score per row (a row where both sets are
        empty divides zero by zero and yields nan).
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    n_common = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    n_true = np.array([len(t) for t in true_sets])
    n_pred = np.array([len(p) for p in pred_sets])
    return 2 * n_common / (n_pred + n_true)

In [13]:
def get_prediction(dist, indices):
    """Sweep distance thresholds over kNN results and report the mean F1.

    For every threshold in 0.1, 0.2, ... 0.9, each row's prediction is the set
    of neighbours whose distance is <= the threshold; the mean row-wise F1
    against ``df["matches"]`` is printed and recorded.

    Reads the module-level ``df`` (needs ``posting_id`` and ``matches``) and
    ``f1_score``; ``df`` itself is not modified.

    :param dist: per-row neighbour distances, as returned by ``kneighbors``.
    :param indices: per-row neighbour positions into ``df``, same shape.
    :return: DataFrame with one row per threshold and columns ``threshold``
        and ``f1`` (the sweep used to be accumulated and then thrown away).
    """
    df_raw = df.copy(deep=True)
    # Take the ids from the frame itself: the function used to read a global
    # array that is only created in a later cell, which fails on a fresh run.
    post_ids = df_raw["posting_id"].to_numpy()
    records = []
    for thres in np.arange(0.1, 1.0, 0.1):
        pred_match = [
            " ".join(post_ids[indices[i][dist[i] <= thres]])
            for i in tqdm(range(len(dist)))
        ]
        df_raw["pred_match"] = pred_match
        df_raw["f1"] = f1_score(df_raw["matches"], df_raw["pred_match"])
        score = df_raw["f1"].mean()
        records.append({"threshold": thres, "f1": score})
        print("F1 score: %.2f with threshold %.2f"%(score, thres))
    return pd.DataFrame(records, columns=["threshold", "f1"])

In [14]:
def get_prediction_best(dist, indices, thres=0.9):
    """Build kNN-based predictions at a single distance threshold.

    NOTE(review): a second function with this same name, taking a similarity
    matrix instead, is defined further down in this notebook; whichever cell
    ran last is the one bound to the name.

    Reads the module-level ``df`` (needs ``posting_id`` and ``matches``) and
    ``f1_score``; ``df`` itself is not modified.

    :param dist: per-row neighbour distances, as returned by ``kneighbors``.
    :param indices: per-row neighbour positions into ``df``, same shape.
    :param thres: neighbours with distance <= ``thres`` are predicted as
        matches; defaults to the previously hard-coded 0.9.
    :return: copy of ``df`` with ``pred_match`` and ``f1`` columns added.
    """
    df_raw = df.copy(deep=True)
    # Take the ids from the frame itself: the function used to read a global
    # array that is only created in a later cell, which fails on a fresh run.
    post_ids = df_raw["posting_id"].to_numpy()
    df_raw["pred_match"] = [
        " ".join(post_ids[indices[i][dist[i] <= thres]])
        for i in tqdm(range(len(dist)))
    ]
    df_raw["f1"] = f1_score(df_raw["matches"], df_raw["pred_match"])
    score = df_raw["f1"].mean()
    print("F1 score: %.4f with threshold %.2f"%(score, thres))

    return df_raw

In [37]:
# TF-IDF over the raw titles, then the 50 nearest neighbours of every row
# (no metric is passed, so the estimator's default applies).
cf = TfidfVectorizer()
cf.fit(df["title"].tolist())
X = cf.transform(df["title"].tolist())
nn = NearestNeighbors(n_neighbors=50, n_jobs=-1,)
nn.fit(X)

dist, indices = nn.kneighbors(X)
get_prediction(dist, indices)

100%|██████████| 34250/34250 [00:00<00:00, 155711.67it/s]
 50%|█████     | 17233/34250 [00:00<00:00, 172324.41it/s]

F1 score: 0.49 with threshold 0.10


100%|██████████| 34250/34250 [00:00<00:00, 173341.35it/s]
100%|██████████| 34250/34250 [00:00<00:00, 177809.66it/s]

F1 score: 0.49 with threshold 0.20



100%|██████████| 34250/34250 [00:00<00:00, 171673.18it/s]

F1 score: 0.50 with threshold 0.30



 48%|████▊     | 16339/34250 [00:00<00:00, 163385.48it/s]

F1 score: 0.51 with threshold 0.40


100%|██████████| 34250/34250 [00:00<00:00, 165333.62it/s]
 51%|█████     | 17444/34250 [00:00<00:00, 174439.75it/s]

F1 score: 0.53 with threshold 0.50


100%|██████████| 34250/34250 [00:00<00:00, 166329.06it/s]
 99%|█████████▉| 34047/34250 [00:00<00:00, 169063.08it/s]

F1 score: 0.56 with threshold 0.60


100%|██████████| 34250/34250 [00:00<00:00, 169277.82it/s]
 98%|█████████▊| 33491/34250 [00:00<00:00, 167062.25it/s]

F1 score: 0.59 with threshold 0.70


100%|██████████| 34250/34250 [00:00<00:00, 166021.30it/s]
 96%|█████████▌| 32943/34250 [00:00<00:00, 161786.09it/s]

F1 score: 0.62 with threshold 0.80


100%|██████████| 34250/34250 [00:00<00:00, 163351.32it/s]


F1 score: 0.65 with threshold 0.90


In [17]:
# NOTE(review): this renders the entire frame. The output recorded under this
# cell (a progress bar and an F1 line at threshold 0.90) looks like it came
# from an earlier version of the cell - confirm and re-run.
df

100%|██████████| 34250/34250 [00:00<00:00, 180237.55it/s]


F1 score: 0.6475 with threshold 0.90


In [69]:
# Inspect the last 100 predictions without the image columns. The cell that
# builds df_pred already drops them, so ignore columns that are absent
# instead of raising KeyError.
df_pred.drop(["image", "image_phash"], axis=1, errors="ignore").tail(100)

Unnamed: 0,posting_id,title,label_group,matches,clean_txt,lang,pred_match,f1
7177,train_3792988510,FA52 FOCALLURE Full Coverage Concealer Liquid concealer,16448490,train_766724052 train_3130680059 train_2711527717 train_3999530505 train_3792988510 train_3472877513 train_630250743 train_649711084,fa52 focallure full coverage concealer liquid concealer,en,train_649711084 train_3792988510 train_3130680059 train_3999530505 train_2711527717 train_3472877513,0.857143
13635,train_649711084,FOCALLURE Full Coverage Concealer Liquid concealer fa52,16448490,train_766724052 train_3130680059 train_2711527717 train_3999530505 train_3792988510 train_3472877513 train_630250743 train_649711084,focallure full coverage concealer liquid concealer fa52,en,train_649711084 train_3792988510 train_3130680059 train_3999530505 train_2711527717 train_3472877513,0.857143
10753,train_3130680059,FA52 Focallure full coverage liquid concealer,16448490,train_766724052 train_3130680059 train_2711527717 train_3999530505 train_3792988510 train_3472877513 train_630250743 train_649711084,fa52 focallure full coverage liquid concealer,en,train_3130680059 train_649711084 train_3792988510 train_3999530505 train_2711527717,0.769231
6013,train_2711527717,Official Distributor Focallure Full Coverage Liquid Concealer Original FA52 Concealer Liquid Ori,16448490,train_766724052 train_3130680059 train_2711527717 train_3999530505 train_3792988510 train_3472877513 train_630250743 train_649711084,official distributor focallure full coverage liquid concealer original fa52 concealer liquid ori,en,train_2711527717 train_3792988510 train_649711084 train_3130680059 train_3999530505,0.769231
7178,train_3999530505,Fa52 Focallure Concealer Cair Full Coverage,16448490,train_766724052 train_3130680059 train_2711527717 train_3999530505 train_3792988510 train_3472877513 train_630250743 train_649711084,fa52 focallure concealer cair full coverage,en,train_3999530505 train_3130680059 train_649711084 train_3792988510 train_2711527717,0.769231
31716,train_3472877513,READY STOCK Focallure Liquid Concealer,16448490,train_766724052 train_3130680059 train_2711527717 train_3999530505 train_3792988510 train_3472877513 train_630250743 train_649711084,ready stock focallure liquid concealer,zh,train_3472877513 train_766724052 train_3792988510 train_649711084,0.666667
30790,train_766724052,FOCALLURE Long Lasting Liquid Concealer,16448490,train_766724052 train_3130680059 train_2711527717 train_3999530505 train_3792988510 train_3472877513 train_630250743 train_649711084,focallure long lasting liquid concealer,en,train_766724052 train_4041939824 train_3472877513,0.363636
2054,train_630250743,FOCALLURE CONCEALER LIQUID / CAIR FA-52 /Rajamarket,16448490,train_766724052 train_3130680059 train_2711527717 train_3999530505 train_3792988510 train_3472877513 train_630250743 train_649711084,focallure concealer liquid cair fa-52 rajamarket,en,train_630250743,0.222222
14863,train_1127938265,3in1 baju tidur wanita/piyam 3in1,15978237,train_1127938265 train_3444022543,3 in 1 baju tidur wanita piyam 3 in 1,hu,train_1127938265,0.666667
25260,train_3444022543,PIYAMA 3IN1 EMBOS FIT XL . XXL / PANJAMAS 3 IN 1 CP,15978237,train_1127938265 train_3444022543,piyama 3 in 1 embos fit xl xxl panjamas 3 in 1 cp,da,train_3444022543,0.666667


In [72]:
def compare_cosine(X1, X2):
    """Distances from the first row of ``X1`` to every row of ``X2``.

    NOTE(review): despite the name, the value comes from
    ``euclidean_distances``, not from a cosine measure - confirm which metric
    is intended.

    :param X1: feature rows; only the first row's distances are returned.
    :param X2: feature rows to compare against.
    :return: first row of the pairwise distance matrix.
    """
    pairwise = euclidean_distances(X1, X2)
    return pairwise[0]

In [96]:
def retrieve_title(posting_id):
    """Look up the title of a posting in the module-level ``df``.

    :param posting_id: value to match against the ``posting_id`` column.
    :return: title of the first matching row.
    :raises IndexError: when no row has that posting id.
    """
    is_match = df["posting_id"] == posting_id
    titles = df[is_match]["title"].values
    return titles[0]

In [97]:
# Preview the working frame.
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,clean_txt,lang
3874,train_1646767365,1d7aadc7503b2b4539cc9a5fe41979dd.jpg,e925873ed09cd08f,Sarung celana wadimor original 100% dewasa dan anak hitam dan putih polos,258047,train_1646767365 train_1528423085 train_398181303,sarung celana wadimor original 100 dewasa dan anak hitam dan putih polos,ms
31859,train_1528423085,eec692257e74fcbc6cb63cb76d0f20e7.jpg,ea97861c926a71e3,WARNA RANDOM ACAK Sarung Celana Wadimor MURAH Celana Sarung WADIMOR,258047,train_1646767365 train_1528423085 train_398181303,warna random acak sarung celana wadimor murah celana sarung wadimor,de
6738,train_398181303,3301b8aaccea93d1098995ffbc537335.jpg,e9b5833e929e909c,SARUNG CELANA WADIMOR DEWASA HITAM POLOS SARCEL,258047,train_1646767365 train_1528423085 train_398181303,sarung celana wadimor dewasa hitam polos sarcel,de
7613,train_2865605743,3977f4e7a47c73568c5e9fcb61723b4b.jpg,bfc3cc1cc636c14c,Wall Sticker / WallSticker - Submarine Measure - SK9222,297977,train_2865605743 train_1382500866,wall sticker wallsticker - submarine measure - sk9222,en
12367,train_1382500866,5d075d7eaa258052ab125c75c06293d6.jpg,838436c07dff19e4,RELIZA WALL STICKER PENGUKUR TINGGI BADAN JERAPAH STIKER DINDING XL8337,297977,train_2865605743 train_1382500866,reliza wall sticker pengukur tinggi badan jerapah stiker dinding xl8337,tr


In [100]:
# posting_id -> title lookup; if an id repeats, the last row wins.
posting_titles = dict(zip(df["posting_id"], df["title"]))

In [63]:
# TF-IDF over the raw titles with binary term counts.
cf = TfidfVectorizer(binary=True)
cf.fit(df["title"].tolist())
X = cf.transform(df["title"].tolist())

# All-pairs similarity of the rows of X with themselves.
# NOTE(review): the result is a dense n x n array - for the ~34k rows seen in
# the progress bars that is roughly 9 GB at float64; confirm it fits in memory.
X_cosine = cosine_similarity(X)

In [64]:
# Number of distinct terms learned by the fitted vectorizer `cf`.
len(cf.vocabulary_)

25069

In [65]:

# Posting ids as a 1-D object array in df row order, so they can be selected
# with integer index arrays.
post_ids_arr = np.array(df["posting_id"].tolist(), dtype=object)
    
def get_prediction_with_similarity(X_similarity):
    """Sweep similarity thresholds and report the mean F1 for each.

    For every threshold produced by ``np.arange(0.5, 0.7, 0.01)``, row ``j``
    predicts all postings whose similarity to ``j`` is >= the threshold; the
    mean row-wise F1 against ``df["matches"]`` is printed and recorded.

    Reads the module-level ``df`` (needs ``posting_id`` and ``matches``) and
    ``f1_score``; ``df`` itself is not modified.

    :param X_similarity: square, indexable similarity matrix whose row/column
        order matches the rows of ``df``.
    :return: DataFrame with one row per threshold and columns ``threshold``
        and ``f1`` (the sweep used to be accumulated and then thrown away).
    """
    df_raw = df.copy(deep=True)
    # Resolve ids from the frame copy so predictions always line up with the
    # rows being scored.
    post_ids = df_raw["posting_id"].to_numpy()
    records = []
    for thres in np.arange(0.5, 0.7, 0.01):
        pred_match = [
            " ".join(post_ids[np.where(X_similarity[j] >= thres)[0]])
            for j in tqdm(range(len(X_similarity)))
        ]

        df_raw["pred_match"] = pred_match
        df_raw["f1"] = f1_score(df_raw["matches"], df_raw["pred_match"])
        score = df_raw["f1"].mean()
        records.append({"threshold": thres, "f1": score})

        print("F1 score: %.4f with threshold %.2f"%(score, thres))
    return pd.DataFrame(records, columns=["threshold", "f1"])

# Run the threshold sweep on the cosine-similarity matrix (one full pass over
# all rows per threshold).
get_prediction_with_similarity(X_cosine)

100%|██████████| 34250/34250 [00:08<00:00, 3881.11it/s]
  1%|          | 406/34250 [00:00<00:08, 4056.51it/s]

F1 score: 0.6591 with threshold 0.50


100%|██████████| 34250/34250 [00:08<00:00, 3941.39it/s]
  2%|▏         | 825/34250 [00:00<00:08, 4148.99it/s]

F1 score: 0.6606 with threshold 0.51


100%|██████████| 34250/34250 [00:08<00:00, 4139.03it/s]
  2%|▏         | 767/34250 [00:00<00:08, 3848.61it/s]

F1 score: 0.6608 with threshold 0.52


100%|██████████| 34250/34250 [00:08<00:00, 4149.74it/s]
  2%|▏         | 803/34250 [00:00<00:08, 4007.09it/s]

F1 score: 0.6615 with threshold 0.53


100%|██████████| 34250/34250 [00:08<00:00, 4160.23it/s]
  2%|▏         | 797/34250 [00:00<00:08, 4002.16it/s]

F1 score: 0.6610 with threshold 0.54


100%|██████████| 34250/34250 [00:08<00:00, 4086.52it/s]
  3%|▎         | 881/34250 [00:00<00:07, 4467.94it/s]

F1 score: 0.6610 with threshold 0.55


100%|██████████| 34250/34250 [00:08<00:00, 4213.95it/s]
  1%|▏         | 475/34250 [00:00<00:07, 4717.68it/s]

F1 score: 0.6599 with threshold 0.56


100%|██████████| 34250/34250 [00:08<00:00, 4212.58it/s]
  3%|▎         | 890/34250 [00:00<00:07, 4489.19it/s]

F1 score: 0.6580 with threshold 0.57


100%|██████████| 34250/34250 [00:08<00:00, 4201.24it/s]
  3%|▎         | 876/34250 [00:00<00:07, 4400.96it/s]

F1 score: 0.6559 with threshold 0.58


100%|█████████▉| 34220/34250 [00:08<00:00, 3894.94it/s]


KeyboardInterrupt: 

In [66]:
def get_prediction_best(X_similarity, thres=0.53):
    """Build similarity-based predictions at a single threshold.

    NOTE(review): an earlier cell in this notebook defines a function with the
    same name that takes ``(dist, indices)``; this definition replaces it once
    the cell has run.

    Reads the module-level ``df`` (needs ``posting_id`` and ``matches``) and
    ``f1_score``; ``df`` itself is not modified.

    :param X_similarity: square, indexable similarity matrix whose row/column
        order matches the rows of ``df``.
    :param thres: postings with similarity >= ``thres`` are predicted as
        matches; defaults to the previously hard-coded 0.53.
    :return: copy of ``df`` with ``pred_match`` and ``f1`` columns added.
    """
    df_raw = df.copy(deep=True)
    # Resolve ids from the frame copy so predictions always line up with the
    # rows being scored.
    post_ids = df_raw["posting_id"].to_numpy()
    df_raw["pred_match"] = [
        " ".join(post_ids[np.where(X_similarity[j] >= thres)[0]])
        for j in tqdm(range(len(X_similarity)))
    ]
    df_raw["f1"] = f1_score(df_raw["matches"], df_raw["pred_match"])
    score = df_raw["f1"].mean()

    print("F1 score: %.4f with threshold %.2f"%(score, thres))

    return df_raw

In [67]:
# Predictions at the chosen threshold, ordered so that within every label
# group the lowest-F1 rows come first; image columns are dropped for display.
df_pred = (
    get_prediction_best(X_cosine)
    .sort_values(by=["label_group", "f1"], ascending=True)
    .drop(["image", "image_phash"], axis=1)
)

100%|██████████| 34250/34250 [00:09<00:00, 3754.53it/s]


F1 score: 0.6615 with threshold 0.53


In [68]:
# First rows of the predictions (ordered by label_group, then ascending f1).
df_pred.head(10)

Unnamed: 0,posting_id,title,label_group,matches,clean_txt,pred_match,f1
3874,train_1646767365,Sarung celana wadimor original 100% dewasa dan anak hitam dan putih polos,258047,train_1646767365 train_1528423085 train_398181303,sarung celana wadimor original 100 dewasa dan anak hitam dan putih polos,train_1646767365 train_398181303 train_411270231 train_3932348722 train_3097122661 train_2042551639 train_3100268870 train_2719185123 train_1429585174,0.333333
31859,train_1528423085,WARNA RANDOM ACAK Sarung Celana Wadimor MURAH Celana Sarung WADIMOR,258047,train_1646767365 train_1528423085 train_398181303,warna random acak sarung celana wadimor murah celana sarung wadimor,train_1528423085 train_411270231,0.4
6738,train_398181303,SARUNG CELANA WADIMOR DEWASA HITAM POLOS SARCEL,258047,train_1646767365 train_1528423085 train_398181303,sarung celana wadimor dewasa hitam polos sarcel,train_1646767365 train_398181303 train_411270231 train_2042551639 train_1489322613,0.5
7613,train_2865605743,Wall Sticker / WallSticker - Submarine Measure - SK9222,297977,train_2865605743 train_1382500866,wall sticker wallsticker - submarine measure - sk9222,train_2865605743,0.666667
12367,train_1382500866,RELIZA WALL STICKER PENGUKUR TINGGI BADAN JERAPAH STIKER DINDING XL8337,297977,train_2865605743 train_1382500866,reliza wall sticker pengukur tinggi badan jerapah stiker dinding xl8337,train_1382500866,0.666667
17455,train_320127748,"LV.N COLLAGEN 1 BOX ""BELI ECER HARGA GROSIR"" ORIGINAL 100% BERGARANSI murah isi 10 Pc kolagen bpom",645628,train_2070644662 train_2149563017 train_2419208039 train_318767180 train_789743463 train_3645016213 train_4282018978 train_320127748 train_2587129863 train_3419521073 train_2359409116 train_1123490626,lv n collagen 1 box beli ecer harga grosir original 100 bergaransi murah isi 10 pc kolagen bpom,train_320127748,0.153846
18538,train_3419521073,LVN STRAWBERRY (isi 10 Bgks),645628,train_2070644662 train_2149563017 train_2419208039 train_318767180 train_789743463 train_3645016213 train_4282018978 train_320127748 train_2587129863 train_3419521073 train_2359409116 train_1123490626,lvn strawberry isi 10 bgks,train_4282018978 train_3419521073,0.285714
20122,train_2587129863,[ORIGINAL] LVN COLLAGEN / POMEGLOW COLLAGEN 1 BOX ISI 10 SACHET / GLOWING/ PEMUTIH WAJAH BADAN,645628,train_2070644662 train_2149563017 train_2419208039 train_318767180 train_789743463 train_3645016213 train_4282018978 train_320127748 train_2587129863 train_3419521073 train_2359409116 train_1123490626,original lvn collagen pomeglow collagen 1 box isi 10 sachet glowing pemutih wajah badan,train_318767180 train_4282018978 train_2587129863,0.4
15610,train_2070644662,LVN COLLAGEN - ORIGINAL TERMURAH - LVN STROBERI - GARANSI UANG KEMBALI,645628,train_2070644662 train_2149563017 train_2419208039 train_318767180 train_789743463 train_3645016213 train_4282018978 train_320127748 train_2587129863 train_3419521073 train_2359409116 train_1123490626,lvn collagen - original termurah - lvn stroberi - garansi uang kembali,train_2070644662 train_2419208039 train_789743463 train_2359409116,0.5
31001,train_4282018978,Lvn Collagen / Lvn Strawberry Original BPOM RI Isi 1 Box 10 Sachet,645628,train_2070644662 train_2149563017 train_2419208039 train_318767180 train_789743463 train_3645016213 train_4282018978 train_320127748 train_2587129863 train_3419521073 train_2359409116 train_1123490626,lvn collagen lvn strawberry original bpom ri isi 1 box 10 sachet,train_2149563017 train_789743463 train_4282018978 train_2587129863 train_3419521073 train_346999902,0.555556


In [60]:
# Manual experiment: score the union of three prediction lists against the
# three ground-truth ids of one label group.
a = "train_1528423085 train_411270231 train_3932348722 train_3448357370 train_3097122661"
b = "train_1646767365 train_398181303 train_3932348722 train_3097122661 train_2042551639 train_3100268870 train_2719185123"
c = "train_1646767365 train_398181303 train_3097122661 train_2042551639 train_1489322613"

t = " ".join(set(a.split()) | set(b.split()) | set(c.split()))
f1_score(pd.Series("train_1646767365 train_1528423085 train_398181303"), pd.Series(t))

array([0.42857143])

In [None]:
def post_finetune(df):
    """Placeholder for a post-processing pass over the ``pred_match`` column.

    The cell was left unfinished: a ``for`` loop without a body and a return
    of an undefined name, which is a syntax error. Until the pass is written
    the function fails loudly at call time instead of breaking parsing.

    :param df: frame expected to carry a ``pred_match`` column.
    :raises NotImplementedError: always.
    """
    # TODO: implement the refinement that the unfinished draft started
    # (it collected df["pred_match"] and an empty `link_nn` mapping).
    raise NotImplementedError("post_finetune has not been implemented yet")

In [None]:
from langdetect import DetectorFactory, detect

# langdetect is randomised unless seeded; the language codes captured in this
# notebook differ between runs for the same title, so pin the seed.
DetectorFactory.seed = 0
# One detected language code per title (a stray `contr` token after this
# statement used to make the cell a syntax error).
df["lang"] = df["title"].map(detect)

In [11]:
# Check the detected language codes in the new "lang" column.
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,lang
3874,train_1646767365,1d7aadc7503b2b4539cc9a5fe41979dd.jpg,e925873ed09cd08f,Sarung celana wadimor original 100% dewasa dan anak hitam dan putih polos,258047,train_1646767365 train_1528423085 train_398181303,id
31859,train_1528423085,eec692257e74fcbc6cb63cb76d0f20e7.jpg,ea97861c926a71e3,WARNA RANDOM ACAK Sarung Celana Wadimor MURAH Celana Sarung WADIMOR,258047,train_1646767365 train_1528423085 train_398181303,id
6738,train_398181303,3301b8aaccea93d1098995ffbc537335.jpg,e9b5833e929e909c,SARUNG CELANA WADIMOR DEWASA HITAM POLOS SARCEL,258047,train_1646767365 train_1528423085 train_398181303,de
7613,train_2865605743,3977f4e7a47c73568c5e9fcb61723b4b.jpg,bfc3cc1cc636c14c,Wall Sticker / WallSticker - Submarine Measure - SK9222,297977,train_2865605743 train_1382500866,en
12367,train_1382500866,5d075d7eaa258052ab125c75c06293d6.jpg,838436c07dff19e4,RELIZA WALL STICKER PENGUKUR TINGGI BADAN JERAPAH STIKER DINDING XL8337,297977,train_2865605743 train_1382500866,de


In [12]:
# Persist the frame with the detected languages; the row index is not written.
df.to_csv("train_lang.csv", index=False)

In [21]:
# Number of distinct detected languages per label group, most diverse first.
df_diversity = (
    df.groupby(["label_group"])["lang"]
    .nunique()
    .reset_index()
    .sort_values(by=["lang"], ascending=False)
)

In [22]:
# Label groups with the most distinct detected languages.
df_diversity.head()

Unnamed: 0,label_group,lang
1445,562358068,12
1677,656698835,10
2857,1091404026,9
2994,1141798720,9
412,159351600,8


In [25]:
# df[df["label_group"]==159351600]

In [26]:
# Count of label groups whose titles were assigned two or more languages.
df_diversity.query("lang >= 2").shape

(6468, 2)

In [29]:
# Count of label groups whose titles were all assigned the same language.
df_diversity.query("lang == 1").shape

(4546, 2)

In [32]:
# Rows per detected language, most frequent first (the count lands in the
# "label_group" column because that is the column being counted).
df.groupby(["lang"])["label_group"].count().reset_index().sort_values(by=["label_group"],ascending=False)

Unnamed: 0,lang,label_group
13,id,11585
6,en,11278
5,de,4396
28,tl,1229
1,ca,647
24,so,612
8,et,411
30,vi,363
14,it,362
18,no,353


In [33]:
# Spot-check the titles labelled "vi". NOTE(review): this renders every
# matching row; consider limiting the preview.
df[df["lang"]=="vi"]

Unnamed: 0,posting_id,image,image_phash,title,label_group,matches,lang
27972,train_2419208039,d173d12f8d2767512db31a01414047e4.jpg,c89817483f3f398e,LVN COLLAGEN / STROBERI,645628,train_2070644662 train_2149563017 train_2419208039 train_318767180 train_789743463 train_3645016213 train_4282018978 train_320127748 train_2587129863 train_3419521073 train_2359409116 train_1123490626,vi
15195,train_789743463,726574cf844c5dac0669f926d09c8931.jpg,c634f2e078e2b3cc,TERMURAH LVN COLLAGEN STROBERI 1 BOX 10 SACHET,645628,train_2070644662 train_2149563017 train_2419208039 train_318767180 train_789743463 train_3645016213 train_4282018978 train_320127748 train_2587129863 train_3419521073 train_2359409116 train_1123490626,vi
23690,train_2359409116,b19a6f887d8efd3c0bf2aa9845b03c23.jpg,d664a89ba1667ee0,LVN COLLAGEN / STROBERI,645628,train_2070644662 train_2149563017 train_2419208039 train_318767180 train_789743463 train_3645016213 train_4282018978 train_320127748 train_2587129863 train_3419521073 train_2359409116 train_1123490626,vi
16807,train_1621765047,7e8ca0dd62a2172d573801de12d80e7c.jpg,ba02c1c93e37b678,WAJAN 22CM PENGGORENGAN TEFLON MAXIM VALENTINO FRYPAN 22 CM,6381662,train_188408030 train_1621765047,vi
34051,train_1622896507,fe78729e6d14b8cb407f2135befb9466.jpg,e6c6993399338999,AVIONE LIQUIFIED LONGWEAR LIPSTICK,11893592,train_1622896507 train_576266310,vi
16942,train_1559143544,7f844d8f53e49ebf90732c6d4e6b45cc.jpg,ed3c94639292cb1e,PULPEN GEL PEN KENKO HI TECH H 0.28 MM BALLPOINT BOLPEN GELPEN HI-TECH-H BLACK TINTA WARNA HITAM,18848931,train_81264779 train_1559143544,vi
12283,train_597539555,5c6e7213af0e6f36e0c794c2bafcba4d.jpg,d4c3367e4925265e,SHANNEN CREAMY LIP PAINT,19273520,train_565629063 train_597539555 train_1110212222 train_878569709,vi
844,train_1110212222,06b3f3f16b1a435bd46bf23ed3f6fca4.jpg,ae0181c79e1e3e9e,SHANNEN LIPSTICK CREAMY LIP PAINT,19273520,train_565629063 train_597539555 train_1110212222 train_878569709,vi
26192,train_1191311325,c3f2b16194933661c58370b901949ff8.jpg,dab0d8fc33ccc107,PAKAI RESLETING TOTEBAG/TOTE BAG,55214780,train_2164938299 train_1191311325,vi
25089,train_3595779321,bb99762ea959fe1264832ead276aefeb.jpg,ef90c272e6871a0f,SARUNG TANGAN I GLOVE CAPACITIVE SMARTPHONE TOUCHSCREEN,66247839,train_972685049 train_138694195 train_121718242 train_398849180 train_4183994442 train_3227415372 train_1198874187 train_539770983 train_2919344198 train_3945694365 train_3884647552 train_2554398684 train_770123511 train_4236194881 train_1099355015 train_2711780517 train_4110688751 train_409083230 train_3331058393 train_3121886520 train_1431545311 train_1273454865 train_2548556909 train_3258335386 train_1313853216 train_3978600303 train_3595779321 train_2540759500 train_859490431 train_45716986 train_1858791960 train_1878157145 train_3600825315 train_3262765635,vi
