# This notebook was used to visualize information about cluster distributions and to join information to final category domains to save them for later use

In [1]:
import os
import pandas as pd
import progressbar
import json

In [None]:
data_path = '../src/data'
mapping_corpus_path = data_path + r'/product/lspc2020_to_tablecorpus'
mapping_corpus_path_2 = data_path + r'/product/lspc2020_to_tablecorpus/Cleaned'
table_corpus_path = data_path + r'/product/product_top100/cleaned'
table_corpus_path_with_id = data_path + r'/product/product_top100/cleaned/with_id'
table_corpus_path2 = data_path + r'/product/product_minimum3/cleaned/with_id'
mapping_corpus_path_all = data_path + r'/product/lspcV2020'

## Get information from preprocessed cluster amounts to derive histograms

In [4]:
# get dictionary
data_path_2 = '../src/data/product/lspc2020_to_tablecorpus/Cleaned/allocation_amount_only_set_dict.json'

In [6]:
with open(data_path_2) as f:
    data_2=json.load(f)

In [None]:
df_set=pd.DataFrame.from_dict(data_2, orient='index')
df_set=df_set.reset_index().rename(columns={0:"Amount",'index':"cluster_id"})

In [None]:
# We discard all clusters with less than 2 entries, cause we cannot match anything there, so 1,6 million clusters remain
df_set=df_set[df_set['Amount']>1]

In [None]:
df_set.set_index('cluster_id').describe().T.round()

In [None]:
df_set.plot(x='cluster_id',y='Amount',kind='hist')

In [None]:
# filter on clusters with more entries to get information, here are still 20k clusters left
df_15=df_set[df_set['Amount']>15]
df_15

In [None]:
df_15.set_index('cluster_id').describe().T.round()

In [None]:
df_15.plot(x='cluster_id',y='Amount',kind='hist')

In [None]:
# filter on clusters with more entries to get information, here are still 20k clusters left
df_150=df_set[df_set['Amount']>150]
df_150

In [None]:
df_150.set_index('cluster_id').describe().T.round()

In [None]:
df_150.plot(x='cluster_id',y='Amount',kind='hist')

In [None]:
df_15_500=df_15[df_15['Amount']<500]

In [None]:
df_15_500.plot(x='cluster_id',y='Amount',kind='hist')

## This code part was used in an earlier stage to join data of electronics and clothes with complete information provided in the corpus

In [None]:
# get dictionaries
electronics_path = '../src/data/product/product_electronics_v2/electronics_dict.json'
clothes_path = '../src/data/product/product_clothes_v2/clothes_dict.json'
mapping_corpus_path_2 = '../src/data/product/lspc2020_to_tablecorpus/Cleaned'

In [None]:
with open(electronics_path) as f:
    electronics_data=json.load(f)

In [None]:
with open(clothes_path) as f:
    clothes_data=json.load(f)

In [None]:
#clean the dictionaries by getting rid of the first key 
cleaned_dictionary_electronics={}
for value in electronics_data.values():
    cleaned_dictionary_electronics.update(value)

In [None]:
#clean the dictionaries by getting rid of the first key 
cleaned_dictionary_clothes={}
for value in clothes_data.values():
    cleaned_dictionary_clothes.update(value)

In [None]:
#put the dictionaries into dataframes 
df_electronics=pd.DataFrame.from_dict(cleaned_dictionary_electronics, orient='index')
df_clothes=pd.DataFrame.from_dict(cleaned_dictionary_clothes, orient='index')

In [None]:
# use the cleaned tables to append into a series and the get a dataframe from that with the remaining cluster ids
count = 0
data=[]
with progressbar.ProgressBar(max_value=len(zip_files_mapping)) as bar:
    for zip_file in zip_files_mapping:
        print('/{}'.format(zip_file))
        df = pd.read_json(mapping_corpus_path_2 + '/{}'.format(zip_file), compression='gzip', lines=True)
        data.append(df)
        count += 1
        bar.update(count)
df_large= pd.concat(data, ignore_index=True)
#df.large.to_json(os.path.join(mapping_corpus_path_2, 'concatentation'), compression='gzip', orient='records', lines=True)

In [None]:
zip_files_mapping = [file for file in os.listdir(mapping_corpus_path_all) if file.endswith('.json.gz')]
# use the cleaned tables to append into a series and the get a dataframe from that with the remaining cluster ids
count = 0
data=[]
with progressbar.ProgressBar(max_value=len(zip_files_mapping)) as bar:
    for zip_file in zip_files_mapping:
        print('/{}'.format(zip_file))
        df = pd.read_json(mapping_corpus_path_all + '/{}'.format(zip_file), compression='gzip', lines=True)
        data.append(df)
        count += 1
        bar.update(count)
df_large_all= pd.concat(data, ignore_index=True)

In [None]:
#match product information to cleaned clusters
df_large_matched = df_large.merge(df_large_all[['cluster_id','url','name','description','brand']], left_on=['cluster_id','url'], right_on = ['cluster_id','url'], how='left')

In [None]:
df_large_matched.to_json(os.path.join(mapping_corpus_path_2, 'df_large_matched'), compression='gzip', orient='records', lines=True)

In [None]:
#fill up missing values in both product category data frames to be able to split the tuples up
df_electronics_filtered=df_electronics.applymap(lambda x: [0,0] if x is None else x)
df_clothes_filtered=df_clothes.applymap(lambda x: [0,0] if x is None else x)

In [None]:
# clean up the tables
#split up tuples in in each column for each brand into two different columns table_id and row_id and concatente these rows
df_electronics_cleaned=pd.DataFrame(columns=['table_id', 'row_id'])
count = 0
with progressbar.ProgressBar(max_value=len(df_electronics_filtered.columns)) as bar:
    for i in range(len(df_electronics_filtered.columns)):
        df_electronics_cleaned = df_electronics_cleaned.append(pd.DataFrame(df_electronics_filtered[i].tolist(),columns=['table_id', 'row_id'], index=df_electronics_filtered.index))
        count += 1
        bar.update(count)

In [None]:
#clean up the tables
#split up tuples in in each column for each brand into two different columns table_id and row_id and concatente these rows
df_clothes_cleaned=pd.DataFrame(columns=['table_id', 'row_id'])
count = 0
with progressbar.ProgressBar(max_value=len(df_clothes_filtered.columns)) as bar:
    for i in range(len(df_clothes_filtered.columns)):
        df_clothes_cleaned = df_clothes_cleaned.append(pd.DataFrame(df_clothes_filtered[i].tolist(),columns=['table_id', 'row_id'], index=df_clothes_filtered.index))
        count += 1
        bar.update(count)

In [None]:
#rename the columns to be able to join them into the cluster_id table
df_electronics_cleaned=df_electronics_cleaned.reset_index().rename(columns={'index':"brand"})
df_electronics_cleaned

In [None]:
#rename the columns to be able to join them into the cluster_id table
df_clothes_cleaned=df_clothes_cleaned.reset_index().rename(columns={'index':"brand"})
df_clothes_cleaned

In [None]:
#join the tables to the cluster tables by using left joins
#filled up zero values will be discarded by the join condition
df_joined_electronics = df_large.merge(df_electronics_cleaned, left_on=['table_id','row_id'], right_on = ['table_id','row_id'], how='left')

In [None]:
df_joined_electronics = pd.read_json(os.path.join(mapping_corpus_path_2, 'joined_electronics'), compression='gzip', orient='records', lines=True)

In [None]:
#join the tables to the cluster tables by using left joins 
#filled up zero values will be discarded by the join condition
df_joined_clothes = df_large.merge(df_clothes_cleaned, left_on=['table_id','row_id'], right_on = ['table_id','row_id'], how='left')

In [None]:
df_joined_clothes = pd.read_json(os.path.join(mapping_corpus_path_2, 'joined_clothes'), compression='gzip', orient='records', lines=True)
df_joined_clothes

# Cluster statistics for product category electronics

In [None]:
df_joined_electronics = pd.read_json(os.path.join(mapping_corpus_path_2, 'joined_electronics_v2'), compression='gzip', orient='records', lines=True)

In [None]:
df_grouped_electronics = df_joined_electronics.groupby('cluster_id').count()

In [None]:
# only look at clusters that have at least one brand associated
df_set_electronics = df_grouped_electronics[df_grouped_electronics['brand_y']>0].reset_index()[['cluster_id','table_id']].rename(columns={'table_id':'Amount'})

In [None]:
# We discard all clusters with less than 2 entries, cause we cannot match anything there, so 1,6 million clusters remain
df_set_electronics=df_set_electronics[df_set_electronics['Amount']>1]
df_set_electronics

In [None]:
df_set_electronics.set_index('cluster_id').describe().T.round()

In [None]:
df_set_electronics.plot(x='cluster_id',y='Amount',kind='hist')

In [None]:
# filter on clusters with more entries to get information, here are still 20k clusters left
df_10_electronics=df_set_electronics[df_set_electronics['Amount']>10]
df_10_electronics

In [None]:
df_10_electronics.set_index('cluster_id').describe().T.round()

In [None]:
df_10_electronics.plot(x='cluster_id',y='Amount',kind='hist')

In [None]:
df_15_electronics=df_set_electronics[df_set_electronics['Amount']>15]
df_15_electronics

In [None]:
df_15_electronics.set_index('cluster_id').describe().T.round()

In [None]:
df_15_electronics.plot(x='cluster_id',y='Amount',kind='hist')

In [None]:
df_25_electronics=df_set_electronics[df_set_electronics['Amount']>25]
df_25_electronics

In [None]:
df_25_electronics.set_index('cluster_id').describe().T.round()

In [None]:
df_25_electronics.plot(x='cluster_id',y='Amount',kind='hist')

In [None]:
#merge brand name to cluster amount
df_cluster_brand = df_15_electronics[df_15_electronics['Amount']<400].merge(df_joined_electronics.dropna()[['cluster_id','brand_y']].drop_duplicates('cluster_id', keep='last'), left_on=['cluster_id'], right_on = ['cluster_id'], how='left')
df_cluster_brand

In [None]:
#get the top clusters per brand
df_top_clusters = df_cluster_brand.sort_values(['Amount'], ascending=False).drop_duplicates(subset=["brand_y"], keep="first")
df_top_clusters

# Cluster statistics for product category clothes

In [None]:
df_joined_clothes = pd.read_json(os.path.join(mapping_corpus_path_2, 'joined_clothes_v2'), compression='gzip', orient='records', lines=True)

In [None]:
df_grouped_clothes = df_joined_clothes.groupby('cluster_id').count()

In [None]:
# only look at clusters that have at least one brand associated
df_set_clothes = df_grouped_clothes[df_grouped_clothes['brand_y']>0].reset_index()[['cluster_id','table_id']].rename(columns={'table_id':'Amount'})

In [None]:
# We discard all clusters with less than 2 entries, cause we cannot match anything there, so 1,6 million clusters remain
df_set_clothes=df_set_clothes[df_set_clothes['Amount']>1]
df_set_clothes

In [None]:
df_set_clothes.set_index('cluster_id').describe().T.round()

In [None]:
df_set_clothes.plot(x='cluster_id',y='Amount',kind='hist')

In [None]:
df_10_clothes=df_set_clothes[df_set_clothes['Amount']>10]
df_10_clothes

In [None]:
df_10_clothes.set_index('cluster_id').describe().T.round()

In [None]:
df_15_clothes=df_set_clothes[df_set_clothes['Amount']>15]
df_15_clothes

In [None]:
df_15_clothes.set_index('cluster_id').describe().T.round()

In [None]:
df_15_clothes.plot(x='cluster_id',y='Amount',kind='hist')

In [None]:
df_joined_clothes[df_joined_clothes['cluster_id']==78499693]

In [None]:
#merge brand name to cluster amount
df_cluster_brand_clothes = df_15_clothes[df_15_clothes['Amount']<400].merge(df_joined_clothes.dropna()[['cluster_id','brand_y']].drop_duplicates('cluster_id', keep='last'), left_on=['cluster_id'], right_on = ['cluster_id'], how='left')
df_cluster_brand_clothes

In [None]:
#get the top clusters per brand
df_top_clusters_clothes = df_cluster_brand_clothes.sort_values(['Amount'], ascending=False).drop_duplicates(subset=["brand_y"], keep="first")
df_top_clusters_clothes

In [None]:
df_joined_clothes[(df_joined_clothes['cluster_id']==22374915)]

# Get information about the cluster distribution per table to get a first glance at which tables for which clusters have an overlap to get good training dat

## Have at first a look at electronic products¶


In [None]:
df_grouped_electronics_tables = df_joined_electronics.groupby('table_id').count()

In [None]:
# only look at clusters that have at least one brand associated
df_set_electronics_tables = df_grouped_electronics_tables[df_grouped_electronics_tables['brand']>0].reset_index()[['cluster_id','table_id']].rename(columns={'cluster_id':'Amount'})

In [None]:
df_set_electronics_tables.set_index('table_id').describe().T.round()

In [None]:
df_75_electronics_tables=df_set_electronics_tables[df_set_electronics_tables['Amount']>75]
df_75_electronics_tables

In [None]:
df_75_electronics_tables.set_index('table_id').describe().T.round()

In [None]:
df_75_electronics_tables.plot(x='table_id',y='Amount',kind='hist')

In [None]:
df_150_electronics_tables=df_set_electronics_tables[df_set_electronics_tables['Amount']>150]
df_150_electronics_tables

In [None]:
df_150_electronics_tables.set_index('table_id').describe().T.round()

In [None]:
df_150_electronics_tables.plot(x='table_id',y='Amount',kind='hist')

In [None]:
df_joined_clothes

In [None]:
df_grouped_clothes_tables = df_joined_clothes.groupby('table_id').count()

In [None]:
# only look at clusters that have at least one brand associated
df_set_clothes_tables = df_grouped_clothes_tables[df_grouped_clothes_tables['brand']>0].reset_index()[['cluster_id','table_id']].rename(columns={'cluster_id':'Amount'})

In [None]:
df_set_clothes_tables.set_index('table_id').describe().T.round()

In [None]:
df_75_clothes_tables=df_set_clothes_tables[df_set_clothes_tables['Amount']>75]
df_75_clothes_tables

In [None]:
df_75_clothes_tables.set_index('table_id').describe().T.round()

In [None]:
df_75_clothes_tables.plot(x='table_id',y='Amount',kind='hist')

In [None]:
df_150_clothes_tables=df_set_clothes_tables[df_set_clothes_tables['Amount']>150]
df_150_clothes_tables

In [None]:
df_150_clothes_tables.set_index('table_id').describe().T.round()

In [None]:
df_150_clothes_tables.plot(x='table_id',y='Amount',kind='hist')

In [4]:
df_large = pd.read_json(os.path.join(mapping_corpus_path_2, 'df_large_matched.json'), compression='gzip', orient='records', lines=True)

# Use the defined approach for cleaning and then joining all information to every single set of different categories to save them for later use

In [5]:
# get dictionaries
electronics_path = '../src/data/product/product_electronics_v3/electronics_dict.json'
clothes_path = '../src/data/product/product_clothes_v3/clothes_dict.json'
bikes_path = '../src/data/product/product_bikes/bikes_dict.json'
cars_path = '../src/data/product/product_cars/cars_dict.json'
drugstore_path = '../src/data/product/product_drugstore/drugstore_dict.json'
technology_path = '../src/data/product/product_technology/technology_dict.json'
tools_path = '../src/data/product/product_tools/tools_dict.json'
mapping_corpus_path_2 = '../src/data/product/lspc2020_to_tablecorpus/Cleaned'

In [12]:
with open(electronics_path) as f:
    electronics_data=json.load(f)

In [13]:
with open(clothes_path) as f:
    clothes_data=json.load(f)

In [14]:
with open(bikes_path) as f:
    bikes_data=json.load(f)

In [15]:
with open(cars_path) as f:
    cars_data=json.load(f)

In [16]:
with open(drugstore_path) as f:
    drugstore_data=json.load(f)

In [17]:
with open(technology_path) as f:
    technology_data=json.load(f)

In [18]:
with open(tools_path) as f:
    tools_data=json.load(f)

In [19]:
#clean the dictionaries by getting rid of the first key 
cleaned_dictionary_electronics={}
for value in electronics_data.values():
    cleaned_dictionary_electronics.update(value)

In [20]:
#clean the dictionaries by getting rid of the first key 
cleaned_dictionary_clothes={}
for value in clothes_data.values():
    cleaned_dictionary_clothes.update(value)

In [21]:
#clean the dictionaries by getting rid of the first key 
cleaned_dictionary_bikes={}
for value in bikes_data.values():
    cleaned_dictionary_bikes.update(value)

In [22]:
#clean the dictionaries by getting rid of the first key 
cleaned_dictionary_cars={}
for value in cars_data.values():
    cleaned_dictionary_cars.update(value)

In [23]:
#clean the dictionaries by getting rid of the first key 
cleaned_dictionary_drugstore={}
for value in drugstore_data.values():
    cleaned_dictionary_drugstore.update(value)

In [24]:
#clean the dictionaries by getting rid of the first key 
cleaned_dictionary_technology={}
for value in technology_data.values():
    cleaned_dictionary_technology.update(value)

In [25]:
#clean the dictionaries by getting rid of the first key 
cleaned_dictionary_tools={}
for value in tools_data.values():
    cleaned_dictionary_tools.update(value)

In [26]:
#put the dictionaries into dataframes 
df_electronics=pd.DataFrame.from_dict(cleaned_dictionary_electronics, orient='index')
df_clothes=pd.DataFrame.from_dict(cleaned_dictionary_clothes, orient='index')
df_bikes=pd.DataFrame.from_dict(cleaned_dictionary_bikes, orient='index')
df_cars=pd.DataFrame.from_dict(cleaned_dictionary_cars, orient='index')
df_drugstore=pd.DataFrame.from_dict(cleaned_dictionary_drugstore, orient='index')
df_technology=pd.DataFrame.from_dict(cleaned_dictionary_technology, orient='index')
df_tools=pd.DataFrame.from_dict(cleaned_dictionary_tools, orient='index')

In [27]:
#fill up missing values in both product category data frames to be able to split the tuples up
df_electronics_filtered=df_electronics.applymap(lambda x: [0,0] if x is None else x)
df_clothes_filtered=df_clothes.applymap(lambda x: [0,0] if x is None else x)
#fill up missing values in both product category data frames to be able to split the tuples up
df_bikes_filtered=df_bikes.applymap(lambda x: [0,0] if x is None else x)
df_cars_filtered=df_cars.applymap(lambda x: [0,0] if x is None else x)
#fill up missing values in both product category data frames to be able to split the tuples up
df_drugstore_filtered=df_drugstore.applymap(lambda x: [0,0] if x is None else x)
df_technology_filtered=df_technology.applymap(lambda x: [0,0] if x is None else x)
#fill up missing values in both product category data frames to be able to split the tuples up
df_tools_filtered=df_tools.applymap(lambda x: [0,0] if x is None else x)

In [8]:
# clean up the tables
#split up tuples in in each column for each brand into two different columns table_id and row_id and concatente these rows
df_electronics_cleaned=pd.DataFrame(columns=['table_id', 'row_id'])
count = 0
with progressbar.ProgressBar(max_value=len(df_electronics_filtered.columns)) as bar:
    for i in range(len(df_electronics_filtered.columns)):
        df_electronics_cleaned = df_electronics_cleaned.append(pd.DataFrame(df_electronics_filtered[i].tolist(),columns=['table_id', 'row_id'], index=df_electronics_filtered.index))
        count += 1
        bar.update(count)

In [None]:
#clean up the tables
#split up tuples in in each column for each brand into two different columns table_id and row_id and concatente these rows
df_clothes_cleaned=pd.DataFrame(columns=['table_id', 'row_id'])
count = 0
with progressbar.ProgressBar(max_value=len(df_clothes_filtered.columns)) as bar:
    for i in range(len(df_clothes_filtered.columns)):
        df_clothes_cleaned = df_clothes_cleaned.append(pd.DataFrame(df_clothes_filtered[i].tolist(),columns=['table_id', 'row_id'], index=df_clothes_filtered.index))
        count += 1
        bar.update(count)

In [29]:
# clean up the tables
#split up tuples in in each column for each brand into two different columns table_id and row_id and concatente these rows
df_bikes_cleaned=pd.DataFrame(columns=['table_id', 'row_id'])
count = 0
with progressbar.ProgressBar(max_value=len(df_bikes_filtered.columns)) as bar:
    for i in range(len(df_bikes_filtered.columns)):
        df_bikes_cleaned = df_bikes_cleaned.append(pd.DataFrame(df_bikes_filtered[i].tolist(),columns=['table_id', 'row_id'], index=df_bikes_filtered.index))
        count += 1
        bar.update(count)

100% (15710 of 15710) |##################| Elapsed Time: 0:10:06 Time:  0:10:06


In [30]:
# clean up the tables
#split up tuples in in each column for each brand into two different columns table_id and row_id and concatente these rows
df_cars_cleaned=pd.DataFrame(columns=['table_id', 'row_id'])
count = 0
with progressbar.ProgressBar(max_value=len(df_cars_filtered.columns)) as bar:
    for i in range(len(df_cars_filtered.columns)):
        df_cars_cleaned = df_cars_cleaned.append(pd.DataFrame(df_cars_filtered[i].tolist(),columns=['table_id', 'row_id'], index=df_cars_filtered.index))
        count += 1
        bar.update(count)

100% (82282 of 82282) |##################| Elapsed Time: 3:26:42 Time:  3:26:42


In [31]:
# clean up the tables
#split up tuples in in each column for each brand into two different columns table_id and row_id and concatente these rows
df_drugstore_cleaned=pd.DataFrame(columns=['table_id', 'row_id'])
count = 0
with progressbar.ProgressBar(max_value=len(df_drugstore_filtered.columns)) as bar:
    for i in range(len(df_drugstore_filtered.columns)):
        df_drugstore_cleaned = df_drugstore_cleaned.append(pd.DataFrame(df_drugstore_filtered[i].tolist(),columns=['table_id', 'row_id'], index=df_drugstore_filtered.index))
        count += 1
        bar.update(count)

100% (7707 of 7707) |####################| Elapsed Time: 0:01:35 Time:  0:01:35


In [7]:
# clean up the tables
#split up tuples in in each column for each brand into two different columns table_id and row_id and concatente these rows
df_technology_cleaned=pd.DataFrame(columns=['table_id', 'row_id'])
count = 0
with progressbar.ProgressBar(max_value=len(df_technology_filtered.columns)) as bar:
    for i in range(len(df_technology_filtered.columns)):
        df_technology_cleaned = df_technology_cleaned.append(pd.DataFrame(df_technology_filtered[i].tolist(),columns=['table_id', 'row_id'], index=df_technology_filtered.index))
        count += 1
        bar.update(count)

In [None]:
# clean up the tables
#split up tuples in in each column for each brand into two different columns table_id and row_id and concatente these rows
df_tools_cleaned=pd.DataFrame(columns=['table_id', 'row_id'])
count = 0
with progressbar.ProgressBar(max_value=len(df_tools_filtered.columns)) as bar:
    for i in range(len(df_tools_filtered.columns)):
        df_tools_cleaned = df_tools_cleaned.append(pd.DataFrame(df_tools_filtered[i].tolist(),columns=['table_id', 'row_id'], index=df_tools_filtered.index))
        count += 1
        bar.update(count)

In [33]:
#rename the columns to be able to join them into the cluster_id table
df_electronics_cleaned=df_electronics_cleaned.reset_index().rename(columns={'index':"brand"})
df_clothes_cleaned=df_clothes_cleaned.reset_index().rename(columns={'index':"brand"})
df_bikes_cleaned=df_bikes_cleaned.reset_index().rename(columns={'index':"brand"})
df_cars_cleaned=df_cars_cleaned.reset_index().rename(columns={'index':"brand"})
df_drugstore_cleaned=df_drugstore_cleaned.reset_index().rename(columns={'index':"brand"})
df_technology_cleaned=df_technology_cleaned.reset_index().rename(columns={'index':"brand"})
df_tools_cleaned=df_tools_cleaned.reset_index().rename(columns={'index':"brand"})

In [34]:
df_joined_electronics= df_large.merge(df_electronics_cleaned, left_on=['table_id','row_id'], right_on = ['table_id','row_id'], how='left')
df_joined_clothes = df_large.merge(df_clothes_cleaned, left_on=['table_id','row_id'], right_on = ['table_id','row_id'], how='left')
df_joined_bikes = df_large.merge(df_bikes_cleaned, left_on=['table_id','row_id'], right_on = ['table_id','row_id'], how='left')
df_joined_cars = df_large.merge(df_cars_cleaned, left_on=['table_id','row_id'], right_on = ['table_id','row_id'], how='left')
df_joined_drugstore = df_large.merge(df_drugstore_cleaned, left_on=['table_id','row_id'], right_on = ['table_id','row_id'], how='left')
df_joined_technology = df_large.merge(df_technology_cleaned, left_on=['table_id','row_id'], right_on = ['table_id','row_id'], how='left')
df_joined_tools = df_large.merge(df_tools_cleaned, left_on=['table_id','row_id'], right_on = ['table_id','row_id'], how='left')

In [38]:
df_joined_electronics.to_json(mapping_corpus_path_2 + '/joined_electronics_v3.json', compression='gzip', orient='records', lines=True)
df_joined_clothes.to_json(mapping_corpus_path_2 + '/joined_clothes_v3.json', compression='gzip', orient='records', lines=True)
df_joined_bikes.to_json(mapping_corpus_path_2 + '/joined_bikes.json', compression='gzip', orient='records', lines=True)
df_joined_cars.to_json(mapping_corpus_path_2 + '/joined_cars.json', compression='gzip', orient='records', lines=True)
df_joined_drugstore.to_json(mapping_corpus_path_2 + '/joined_drugstore.json', compression='gzip', orient='records', lines=True)
df_joined_technology.to_json(mapping_corpus_path_2 + '/joined_technology.json', compression='gzip', orient='records', lines=True)
df_joined_tools.to_json(mapping_corpus_path_2 + '/joined_tools.json', compression='gzip', orient='records', lines=True)