### Import Libraries

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Creating a Dataset

In [2]:
# Catalogue A: numeric product ids, a coarse category, a price and a manufacturer.
# A ProductName column is intentionally not part of this frame; its former values
# ("Apple iPhone 12", "Samsung Galaxy S21", "Sony WH-1000XM4", "Dell XPS 13",
# "Apple MacBook Air") are kept here for reference only.
data1 = dict(
    ProductID=[1, 2, 3, 4, 5],
    Category=["Electronics", "Electronics", "Electronics", "Computers", "Computers"],
    Price=[799, 999, 349, 1199, 999],
    Manufacturer=["Apple Inc.", "Samsung", "Sony Corporation", "Dell", "Apple Inc."],
)

# Catalogue B: the same five products described with a different schema
# (string ids, product name, device type, cost and brand).
data2 = dict(
    ID=["A001", "A002", "A003", "A004", "A005"],
    Name=["iPhone 12", "Galaxy S21", "WH-1000XM4", "XPS 13 Laptop", "MacBook Air 13"],
    Type=["Mobile Phone", "Mobile Phone", "Headphones", "Laptop", "Laptop"],
    Cost=[799, 999, 350, 1200, 999],
    Brand=["Apple", "Samsung", "Sony", "Dell Computers", "Apple Computers"],
)

df1, df2 = pd.DataFrame(data1), pd.DataFrame(data2)

In [3]:
# Display the leading rows of df1 as a quick check of the frame's columns and values.
df1.head()

Unnamed: 0,ProductID,Category,Price,Manufacturer
0,1,Electronics,799,Apple Inc.
1,2,Electronics,999,Samsung
2,3,Electronics,349,Sony Corporation
3,4,Computers,1199,Dell
4,5,Computers,999,Apple Inc.


### Rearranging df2 records to make them messy

In [4]:
# Positional order in which the original df2 rows should appear after shuffling.
desired_order_df2 = [0, 3, 2, 4, 1]

# Rearrange rows in df2 so its row order no longer lines up with df1.
# The frame is rebuilt from data2 rather than re-sliced from the current df2, which
# keeps this cell idempotent: re-running it always yields the same order instead of
# permuting an already-shuffled frame a second time.
df2 = pd.DataFrame(data2).iloc[desired_order_df2].reset_index(drop=True)
df2

Unnamed: 0,ID,Name,Type,Cost,Brand
0,A001,iPhone 12,Mobile Phone,799,Apple
1,A004,XPS 13 Laptop,Laptop,1200,Dell Computers
2,A003,WH-1000XM4,Headphones,350,Sony
3,A005,MacBook Air 13,Laptop,999,Apple Computers
4,A002,Galaxy S21,Mobile Phone,999,Samsung


### Entity Matching

In [5]:
# Collapse the descriptive df1 columns into one space-separated string per row.
# ProductName is not included because that column is not present in df1.
df1['soup1'] = df1[['Category', 'Price', 'Manufacturer']].apply(
    lambda row: ' '.join(map(str, row)),
    axis=1,
)

In [6]:
# Collapse the descriptive df2 columns into one space-separated string per row.
df2['soup2'] = df2[['Name', 'Type', 'Cost', 'Brand']].apply(
    lambda row: ' '.join(map(str, row)),
    axis=1,
)

In [7]:
# Lower-case both soup columns. The vectorised .str accessor replaces the
# per-element Python lambda and produces the same strings.
df1['soup1'] = df1['soup1'].str.lower()
df2['soup2'] = df2['soup2'].str.lower()

In [8]:
# Learn one vocabulary over the soups of both frames so that the two count
# matrices below share the same feature columns.
combined_soup = pd.concat([df1['soup1'], df2['soup2']])
count = CountVectorizer(stop_words='english').fit(combined_soup)

# Encode each frame's soups against that shared vocabulary.
count_matrix_df1, count_matrix_df2 = (
    count.transform(df1['soup1']),
    count.transform(df2['soup2']),
)

# Pairwise cosine similarity: rows are labelled with df1's index, columns with df2's.
similarity = cosine_similarity(count_matrix_df1, count_matrix_df2)
similarity_df = pd.DataFrame(similarity, index=df1.index, columns=df2.index)
similarity_df

Unnamed: 0,0,1,2,3,4
0,0.471405,0.0,0.0,0.218218,0.0
1,0.0,0.0,0.0,0.218218,0.471405
2,0.0,0.0,0.223607,0.0,0.0
3,0.0,0.3849,0.0,0.218218,0.0
4,0.235702,0.19245,0.0,0.654654,0.235702


In [9]:
# For every row of similarity_df (one per df1 record), take the column label
# (a df2 index label) that holds the row's largest similarity score.
max_idx_row = similarity_df.idxmax(axis='columns')
print(max_idx_row)

0    0
1    4
2    2
3    1
4    3
dtype: int64


In [10]:
# Put each df1 soup next to the soup of the df2 row selected for it in max_idx_row.
combined_df = pd.DataFrame(dict(
    soup1=df1['soup1'].values,
    soup2=df2['soup2'].loc[max_idx_row.values].values,
))

combined_df

Unnamed: 0,soup1,soup2
0,electronics 799 apple inc.,iphone 12 mobile phone 799 apple
1,electronics 999 samsung,galaxy s21 mobile phone 999 samsung
2,electronics 349 sony corporation,wh-1000xm4 headphones 350 sony
3,computers 1199 dell,xps 13 laptop laptop 1200 dell computers
4,computers 999 apple inc.,macbook air 13 laptop 999 apple computers


In [11]:
# Attach to every df1 row the df2 row selected for it in max_idx_row, aligning the
# two sides by position. max_idx_row is ordered like df1 and holds df2 index labels.
# Joining on the free-text soup strings instead would multiply rows whenever two
# records share an identical soup, because such keys are not guaranteed unique.
result = pd.concat(
    [
        df1.reset_index(drop=True).drop(columns=['soup1']),
        df2.loc[max_idx_row.values].reset_index(drop=True).drop(columns=['soup2']),
    ],
    axis=1,
)
result

Unnamed: 0,ProductID,Category,Price,Manufacturer,ID,Name,Type,Cost,Brand
0,1,Electronics,799,Apple Inc.,A001,iPhone 12,Mobile Phone,799,Apple
1,2,Electronics,999,Samsung,A002,Galaxy S21,Mobile Phone,999,Samsung
2,3,Electronics,349,Sony Corporation,A003,WH-1000XM4,Headphones,350,Sony
3,4,Computers,1199,Dell,A004,XPS 13 Laptop,Laptop,1200,Dell Computers
4,5,Computers,999,Apple Inc.,A005,MacBook Air 13,Laptop,999,Apple Computers
