In [1]:
import os
import sys
import pandas as pd
import numpy as np
import math
import random
from gensim import corpora
from gensim.similarities import SparseMatrixSimilarity
from src.preprocessing import remove_stop_words
from src.common import create_final_data

  return torch._C._cuda_getDeviceCount() > 0


## Loading/Preprocessing Data

In [797]:
def chunk_data(source_path='data/base/offers_corpus_english_v2.json',
               output_dir='data/base/product_corpus',
               chunk_size=100000):
    """Split a JSON-lines corpus into numbered chunk files.

    The source file is streamed ``chunk_size`` rows at a time and every chunk
    is written to ``<output_dir>/chunk<N>.json`` with N starting at 1.

    Args:
        source_path: Path of the JSON-lines file to split.
        output_dir: Directory receiving the chunk files; created if missing.
        chunk_size: Number of rows per chunk file.

    Returns:
        None. The chunk files are the only result.
    """
    # to_json does not create parent directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)
    # No nrows cap: leaving it unset reads the whole file, which is what the
    # former astronomically large literal was standing in for.
    reader = pd.read_json(source_path, lines=True, chunksize=chunk_size)
    for batch, chunk in enumerate(reader, start=1):
        chunk.to_json(os.path.join(output_dir, 'chunk' + str(batch) + '.json'))

In [3]:
def generate_computer_data(source_path='data/base/offers_corpus_english_v2.json',
                           category='Computers_and_Accessories',
                           chunk_size=100000):
    """Collect every offer of one category from a JSON-lines corpus.

    The corpus is streamed in chunks so the full file never has to fit in
    memory; only the rows whose ``category`` equals ``category`` are kept.

    Args:
        source_path: Path of the JSON-lines corpus.
        category: Value of the ``category`` column to keep.
        chunk_size: Number of rows read per chunk.

    Returns:
        A DataFrame with the matching rows, in file order. An empty DataFrame
        when the source yields no chunks at all.
    """
    reader = pd.read_json(source_path, lines=True, chunksize=chunk_size)
    # Series comparison stays element-wise even when a chunk's category column
    # is entirely missing, where a raw ndarray == str comparison may not.
    matches = [chunk[chunk['category'] == category] for chunk in reader]
    if not matches:
        # pd.concat rejects an empty list; keep the "empty frame" result instead.
        return pd.DataFrame()
    # One concat at the end instead of growing a frame chunk by chunk, which
    # re-copied all accumulated rows on every iteration.
    return pd.concat(matches)

In [800]:
def extract_key_features_OLD(computer_df):
    """Flatten a pair-wise offer table into one row per distinct offer id.

    The input holds two offers per row in ``*_left`` / ``*_right`` columns.
    Both sides are projected onto the shared column names ``id``, ``title``,
    ``description`` and ``cluster_id``, stacked (left rows first), and then
    de-duplicated on ``id`` keeping the first occurrence.

    Args:
        computer_df: DataFrame with ``id``, ``title``, ``description`` and
            ``cluster_id`` columns, each in a ``_left`` and a ``_right`` variant.

    Returns:
        DataFrame with columns ``id``, ``title``, ``description``,
        ``cluster_id``. Original row labels are kept, so the index may contain
        repeated values.
    """
    features = ('id', 'title', 'description', 'cluster_id')
    sides = []
    for suffix in ('_left', '_right'):
        renames = {name + suffix: name for name in features}
        sides.append(computer_df[list(renames)].rename(columns=renames))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(sides).drop_duplicates(subset=['id'])

In [801]:
# Load the computer offers from a CSV on disk (presumably a cached, already
# de-duplicated export, going by the file name — TODO confirm).
# NOTE(review): a later cell reassigns `computer_df` from generate_computer_data();
# only one of the two loads should be needed for a clean top-to-bottom run.
computer_df = pd.read_csv('data/base/computer_wdc_whole_no_duplicates.csv')

In [5]:
# Rebuild the computer subset from the raw corpus and keep only the first offer
# for each distinct title.
# NOTE(review): this reassigns `computer_df`, replacing the frame loaded from CSV
# in the previous cell.
computer_df = generate_computer_data().drop_duplicates('title')

In [6]:
# Preview the first rows to check the columns that were loaded.
computer_df.head()

Unnamed: 0,brand,category,cluster_id,description,id,identifiers,keyValuePairs,price,specTableContent,title
41,,Computers_and_Accessories,1554982,,41,"[{'/mpn': '[nxm81eh034]'}, {'/gtin13': '[47131...","{'categorie': 'laptops', 'merk': 'acer', 'prod...",,categorie laptops merk acer productserie aspir...,acer aspire e1 522 65208g1tmnkk specificaties ...
55,,Computers_and_Accessories,15189423,description,55,[{'/productID': '[k1009900]'}],,,,kohler lavatory bonnet 1009900 bn ferguson
72,hp enterprise,Computers_and_Accessories,14583973,description hp third party rackmount option ki...,72,"[{'/sku': '[231122b21]'}, {'/mpn': '[231122b21...","{'category': 'hp option', 'sub category': 'rac...",,specifications category hp option sub category...,"null , 231122 b21 hp 3rd party rail kit ml370 ..."
75,hp enterprise,Computers_and_Accessories,3859891,description hp proliant dl380 g6 rack mountabl...,75,[{'/sku': '[491505001]'}],,,,"null , 491505 001 hp dl380 g6 e5504 2 00ghz 4g..."
91,,Computers_and_Accessories,10106149,,91,[{'/mpn': '[ds1010c101]'}],,,,ds1010c 101


In [804]:
# Set of every cluster id that occurs in the computer offers.
all_clusters = {cluster_id for cluster_id in computer_df['cluster_id'].values}

In [127]:
# Count the clusters that hold at least 2 and fewer than 80 offers.
# A single value_counts() pass replaces filtering the whole frame twice for
# every cluster id, which scaled with (number of clusters x number of rows).
# NOTE(review): the upper bound is exclusive here, while get_pos_clusters also
# accepts clusters of exactly 80 offers — confirm which one is intended.
cluster_sizes = computer_df['cluster_id'].value_counts()
num = int(((cluster_sizes >= 2) & (cluster_sizes < 80)).sum())

print(num)

52530


In [805]:
# Number of distinct cluster ids collected in `all_clusters`.
len(all_clusters)

295932

## Building Dictionary and Similarity

In [806]:
def extract_key_features(cluster, desc_tokens=6):
    """Build the cleaned text features used for the similarity index.

    Args:
        cluster: DataFrame with at least ``id``, ``description`` and ``title``
            columns. It is not modified.
        desc_tokens: How many leading description tokens are appended to the
            title tokens in ``titleDesc``.

    Returns:
        A new DataFrame with columns ``id``, ``description``, ``title`` and
        ``titleDesc``. ``title`` and ``description`` have been passed through
        ``remove_stop_words``; ``titleDesc`` is the list of space-split title
        tokens followed by the first ``desc_tokens`` description tokens.
    """
    # Work on an explicit copy: assigning columns onto a slice of the caller's
    # frame is what triggers pandas' SettingWithCopyWarning.
    new_cluster = cluster[["id", "description", "title"]].copy()
    new_cluster["title"] = new_cluster["title"].map(remove_stop_words)
    # str() lets non-string descriptions through remove_stop_words.
    # NOTE(review): a missing description presumably becomes the literal text
    # "nan" here — confirm whether that token is wanted in titleDesc.
    new_cluster["description"] = new_cluster["description"].map(
        lambda text: remove_stop_words(str(text)))

    title_tokens = new_cluster["title"].map(lambda text: text.split(" "))
    description_tokens = new_cluster["description"].map(
        lambda text: text.split(" ")[:desc_tokens])
    # Adding two Series of lists concatenates the lists row by row.
    new_cluster["titleDesc"] = title_tokens + description_tokens
    return new_cluster

In [799]:
def get_pos_clusters(df, min_cluster_size=2, max_cluster_size=80):
    """Return the cluster ids usable for building positive pairs.

    A cluster qualifies when its number of rows in ``df`` lies between
    ``min_cluster_size`` and ``max_cluster_size``, both inclusive.

    Args:
        df: DataFrame with a ``cluster_id`` column.
        min_cluster_size: Smallest accepted cluster size (a pair needs 2 rows).
        max_cluster_size: Largest accepted cluster size.

    Returns:
        A set with the qualifying cluster ids.
    """
    # Count once; the size filter is then a plain boolean mask over the counts.
    cluster_sizes = df['cluster_id'].value_counts()
    in_range = (cluster_sizes >= min_cluster_size) & (cluster_sizes <= max_cluster_size)
    valid_ids = cluster_sizes[in_range].index

    # Read the ids back from the frame itself so the set is filled from the
    # column's own values, in row order, exactly as before.
    member_ids = df.loc[df['cluster_id'].isin(valid_ids), 'cluster_id'].values
    return set(member_ids)

In [807]:
# Worked example on a single cluster: build the dictionary and the pairwise
# similarity matrix that the pair-generation functions below rely on.
# The set is turned into a list so it can be indexed here and sampled later.
pos_clusters = list(get_pos_clusters(computer_df))

# Rows of the first qualifying cluster; .copy() so feature extraction works on
# its own frame rather than on a slice of computer_df.
cluster = computer_df.loc[computer_df["cluster_id"].values == pos_clusters[0]].copy()

cluster = extract_key_features(cluster)

# Vocabulary comes from title tokens plus the leading description tokens.
dictionary = corpora.Dictionary(cluster["titleDesc"])

# The documents themselves are bag-of-words vectors of the titles only.
cluster_dict = [dictionary.doc2bow(title) for title in cluster["title"].map(lambda x: x.split(" "))]

index = SparseMatrixSimilarity(cluster_dict, num_features=len(dictionary))

# Query the index with its own documents to display the title-vs-title matrix.
index[cluster_dict]

array([[1.        , 0.87705797, 0.7526177 , 0.76271284],
       [0.87705797, 1.0000001 , 0.8581164 , 0.71151245],
       [0.7526177 , 0.8581164 , 1.0000001 , 0.8291561 ],
       [0.76271284, 0.71151245, 0.8291561 , 1.        ]], dtype=float32)

In [808]:
def combinations(total, choose):
    """Return the number of ways to pick ``choose`` items out of ``total``.

    Args:
        total: Size of the pool (non-negative integer).
        choose: Number of items picked (non-negative integer).

    Returns:
        The exact binomial coefficient as an ``int``; 0 when ``choose`` is
        larger than ``total``.

    Raises:
        ValueError: If an argument is negative (raised by ``math.factorial``).
    """
    if 0 <= total < choose:
        # Nothing can be chosen; avoids taking the factorial of a negative number.
        return 0
    # Floor division keeps the arithmetic in exact integers. True division goes
    # through a float and silently loses digits once the result exceeds 2**53.
    return math.factorial(total) // (math.factorial(choose) * math.factorial(total - choose))

In [833]:
def create_pos_from_cluster(data, cluster_id, max_pairs=16):
    """Build positive (label 1) title pairs from the offers of one cluster.

    Half of the pairs (rounded down) are "hard" positives: the title pairs with
    the lowest similarity inside the cluster. The remaining pairs are drawn at
    random, without repetition, from the pairs not used yet.

    Args:
        data: DataFrame with ``cluster_id``, ``id``, ``title`` and
            ``description`` columns.
        cluster_id: Id of the cluster to draw pairs from.
        max_pairs: Upper bound on the number of pairs returned; fewer are
            returned when the cluster has fewer distinct pairs.

    Returns:
        DataFrame with columns ``title_one``, ``title_two``, ``label``. It is
        empty when the cluster has fewer than two offers.
    """
    # Sentinel for "do not pick this cell". The scores seen in this notebook
    # stay at about 1 or below, so 100 cannot collide with a real value.
    MASKED = 100
    columns = ["title_one", "title_two", "label"]

    cluster = data.loc[data["cluster_id"].values == cluster_id]
    cluster = extract_key_features(cluster)
    size = len(cluster)
    if size < 2:
        # No pair can be formed; return the empty result instead of failing.
        return pd.DataFrame(columns=columns)
    titles = cluster["title"]

    dictionary = corpora.Dictionary(cluster["titleDesc"])
    cluster_dict = [dictionary.doc2bow(title.split(" ")) for title in titles]
    sim_matrix = np.array(SparseMatrixSimilarity(cluster_dict, num_features=len(dictionary)))

    # Keep only the strict upper triangle so each unordered pair appears once
    # and no title is paired with itself.
    sim_matrix[np.tril_indices_from(sim_matrix)] = MASKED

    # size * (size - 1) / 2 distinct pairs exist in a cluster of this size.
    total_pairs = min(max_pairs, size * (size - 1) // 2)
    hard_pos = total_pairs // 2
    random_pos = total_pairs - hard_pos

    pairs = []
    for _ in range(hard_pos):
        row, column = np.unravel_index(sim_matrix.argmin(), sim_matrix.shape)
        pairs.append([titles.iloc[row], titles.iloc[column], 1])
        # Mask the pair so the next argmin finds the next-least-similar one.
        sim_matrix[row, column] = MASKED

    avail_indices = np.argwhere(sim_matrix != MASKED)
    # One sample call draws all random pairs without replacement.
    for pick in random.sample(range(len(avail_indices)), random_pos):
        row, column = avail_indices[pick]
        pairs.append([titles.iloc[row], titles.iloc[column], 1])

    return pd.DataFrame(pairs, columns=columns)

In [834]:
def _pick_other_cluster(cluster_id, all_clusters):
    """Draw a random id from ``all_clusters`` that differs from ``cluster_id``."""
    if len(all_clusters) > 0 and not any(candidate != cluster_id for candidate in all_clusters):
        # Without this guard the rejection loop below would never terminate.
        raise ValueError("all_clusters must contain at least one id other than cluster_id")
    neg_cluster_id = cluster_id
    while neg_cluster_id == cluster_id:
        neg_cluster_id = random.choice(all_clusters)
    return neg_cluster_id


def create_neg_from_cluster(data, cluster_id, all_clusters):
    """Build negative (label 0) title pairs for the offers of one cluster.

    Every offer of the cluster yields exactly one pair with a title taken from
    a randomly chosen other cluster. For the first half of the offers (rounded
    down) the partner is the "hard" negative, i.e. the title of that other
    cluster most similar to the offer's own title. For the remaining offers the
    partner is a uniformly random title of the other cluster.

    Args:
        data: DataFrame with ``cluster_id``, ``id``, ``title`` and
            ``description`` columns.
        cluster_id: Id of the cluster whose offers form the left-hand titles.
        all_clusters: Collection of cluster ids to draw negatives from. Sets are
            accepted and converted to a list.

    Returns:
        DataFrame with columns ``title_one``, ``title_two``, ``label``.

    Raises:
        ValueError: If ``all_clusters`` only contains ``cluster_id``.
    """
    if isinstance(all_clusters, (set, frozenset)):
        # random.choice needs an indexable sequence.
        all_clusters = list(all_clusters)

    cluster = data.loc[data["cluster_id"].values == cluster_id]
    cluster = extract_key_features(cluster)
    titles = cluster["title"]
    hard_neg = len(cluster) // 2

    pairs = []
    for row in range(len(cluster)):
        neg_cluster_id = _pick_other_cluster(cluster_id, all_clusters)
        neg_cluster = data.loc[data["cluster_id"].values == neg_cluster_id].copy()
        neg_cluster = extract_key_features(neg_cluster)

        if row < hard_neg:
            # Put the anchor offer in position 0 so row 0 of the similarity
            # matrix compares it against every offer of the other cluster.
            anchor = pd.DataFrame([cluster.iloc[row].values], columns=list(cluster.columns))
            candidates = pd.concat([anchor, neg_cluster])

            dictionary = corpora.Dictionary(candidates["titleDesc"])
            bows = [dictionary.doc2bow(title.split(" ")) for title in candidates["title"]]
            sim_matrix = np.array(SparseMatrixSimilarity(bows, num_features=len(dictionary)))
            # Skip column 0 (the anchor itself); +1 maps back to `candidates`.
            best = sim_matrix[0][1:].argmax() + 1
            neg_title = candidates["title"].iloc[best]
        else:
            neg_title = neg_cluster["title"].iloc[random.choice(range(len(neg_cluster)))]

        pairs.append([titles.iloc[row], neg_title, 0])

    return pd.DataFrame(pairs, columns=["title_one", "title_two", "label"])

In [823]:
# Spot-check the positive pairs generated for one example cluster.
create_pos_from_cluster(computer_df, 131074)

Unnamed: 0,title_one,title_two,label
0,cx 2g10 300 emc gb 10k 3 5 fc al,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack w...,1
1,cx 2g10 300 emc gb 10k 3 5 fc al new wholesale...,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd,1
2,cx 2g10 300 emc gb 10k 3 5 fc al new wholesale...,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack w...,1
3,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack w...,1
4,cx 2g10 300 emc gb 10k 3 5 fc al,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd,1
5,cx 2g10 300 emc gb 10k 3 5 fc al new wholesale...,cx 2g10 300 emc gb 10k 3 5 fc al,1


In [824]:
# Spot-check the negative pairs for the same example cluster; partner titles
# are drawn from the other clusters in `pos_clusters`.
create_neg_from_cluster(computer_df, 131074, pos_clusters)

0
1


Unnamed: 0,title_one,title_two,label
0,cx 2g10 300 emc gb 10k 3 5 fc al new wholesale...,acer aspire 7736zg lcd ekran mat dataservis te...,0
1,cx 2g10 300 emc gb 10k 3 5 fc al,lenovo notebook sleeve gx40m66708 shuttle case...,0
2,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd,man3735mp fujitsu 72 8 gb u160 nhp 10k new who...,0
3,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack w...,cisco sp c220 m5sx w 1x3106 1x16gb mem 12g buy...,0


In [836]:
# Empty accumulators for the positive and the negative title pairs; two
# separate frames sharing the same three columns.
computer_train_wdc_pos, computer_train_wdc_neg = (
    pd.DataFrame(columns=["title_one", "title_two", "label"]) for _ in range(2)
)

In [837]:
# Positive data creation
for cluster in pos_clusters:
    computer_train_wdc_pos = computer_train_wdc_pos.append(create_pos_from_cluster(computer_df, cluster))

In [838]:
# Negative data creation
for cluster in pos_clusters:
    computer_train_wdc_neg = computer_train_wdc_neg.append(create_neg_from_cluster(computer_df, cluster, pos_clusters))

In [839]:
# Inspect the accumulated positive pairs (label 1).
computer_train_wdc_pos

Unnamed: 0,title_one,title_two,label
0,cx 2g10 300 emc gb 10k 3 5 fc al,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack w...,1
1,cx 2g10 300 emc gb 10k 3 5 fc al new wholesale...,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd,1
2,cx 2g10 300 emc gb 10k 3 5 fc al new wholesale...,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack w...,1
3,cx 2g10 300 emc gb 10k 3 5 fc al new wholesale...,cx 2g10 300 emc gb 10k 3 5 fc al,1
4,cx 2g10 300 emc gb 10k 3 5 fc al,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd,1
...,...,...,...
10,microsoft 5qh 00001 natural ergonomic keyborad us,microsoft natural ergonomic keyboard 4000 wire...,1
11,microsoft natural ergonomic keyboard 4000 busi...,microsoft 5qh 00001 natural ergonomic keyborad us,1
12,microsoft natural ergonomic keyboard 4000 busi...,microsoft natural ergonomic keyboard 4000 busi...,1
13,microsoft natural ergonomic keyboard 4000 busi...,microsoft natural ergo keyboard 4000,1


In [840]:
# Inspect the accumulated negative pairs (label 0).
computer_train_wdc_neg

Unnamed: 0,title_one,title_two,label
0,cx 2g10 300 emc gb 10k 3 5 fc al new wholesale...,sony vaio vpcz116gh b batarya pil retro sv 6c ...,0
1,cx 2g10 300 emc gb 10k 3 5 fc al,ventilador aerocool shark blue 12cm 1500rpm 4x...,0
2,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd,hp pavilion dv8 1170eo batarya pil retro 12c 5...,0
3,cx 2g10 300 emc gb 2gb 10k 3 5 fc hdd 2 pack w...,fe 14589 01 hp 36 4 gb ultra scsi 3,0
0,lacie 3tb d2 usb 3 0 professional desktop stor...,kingston technology system specific memory 32m...,0
...,...,...,...
1,microsoft 5qh 00001 natural ergonomic keyborad us,belkin patch cable rj 45 50 cm cat 5e red,0
2,microsoft natural ergonomic keyboard 4000 busi...,intermec sr61bl barcode scanner cb 001 handhel...,0
3,seamless polka dot motley texture abstract vec...,shuttle xpc slim ds57uw10 celeron 3205u 1 5 gh...,0
4,microsoft natural ergonomic keyboard 4000 wire...,zalman zm f2rl 92mm prijzen tweakers,0


In [842]:
# Merge positives and negatives into the final training frame.
# NOTE(review): create_final_data comes from src.common; judging by the output
# below it presumably concatenates and shuffles the two frames — confirm.
computer_train_wdc = create_final_data(computer_train_wdc_pos, computer_train_wdc_neg)

In [843]:
# Inspect the combined training data.
computer_train_wdc

Unnamed: 0,title_one,title_two,label
1,hp envy 14 1007tx batarya pil retro 379164 lx ...,hp mini sas 0 5 prijzen tweakers,0
4,dolphin 6500 accessories power battery kit cod...,gateway nv79c47u batarya pil retro 286195 xt 9...,0
1,lenovo 4x70f28591 prijzen tweakers,acer aspire 5742z lcd ekran panel floresanl da...,0
7,datalogic cable terminal cod 95acc1049 94a051015,v7 battery hp probook 450 455 g3 r104 macconne...,0
13,zebra symbol battery extended capacity 3600mah...,mc55 mc65 extended capacity spare battery 3600...,1
...,...,...,...
1,memoria ddr3 8gb 1333 mhz pc 10600 transcend,memoria ddr3 8gb 1333 mhz pc 10600 transcend p...,1
7,df300a4950 hp 300 gb 15k 3 5 sp sas 2 pack,df300a4950 hp 300 gb 15k 3 5 sp sas 10 pack,1
0,500662 s21 hp 8gb 1x8gb pc3 10600 rdimm,500662 s21 hp 8gb 1x8gb pc3 10600 rdimm new wh...,1
1,macbook pro retina 13 mf839e,x143k 147gb 10k sas 2 5 inch hard drive stock ...,0


In [844]:
# Persist the training pairs.
# NOTE(review): to_csv writes the index as an unnamed first column by default,
# and the frame displayed above carries repeated index labels; consider
# index=False once downstream readers are confirmed not to rely on that column.
computer_train_wdc.to_csv('data/train/wdc_computers.csv')