In [1]:
import pandas as pd #data manipulation, this is used to declare panda variables
import numpy as np #numeric python, used to convert numbers to strings, etc.
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct
from sklearn.feature_extraction.text import TfidfVectorizer #TFID used to convert string into vectors, term frequency
from flask import Flask, jsonify
import sys
import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None


In [2]:
#Exception Handling
class InvalidUsage(Exception):
    status_code = 400
    def __init__(self, message, status_code=None, payload=None):
        Exception.__init__(self)
        self.message = message
        if status_code is not None:
            self.status_code = status_code
        self.payload = payload

    def to_dict(self):
        rv = dict(self.payload or ())
        rv['message'] = self.message
        return rv

def handle_invalid_usage(error):
    response = jsonify(error.to_dict())
    response.status_code = error.status_code
    return response

In [3]:
def cossim(A, B, ntop, lower_bound):
    try:
        # force A and B as a CSR matrix.
        # If they have already been CSR, there is no overhead
        A = A.tocsr()
        B = B.tocsr()
        M, _ = A.shape
        _, N = B.shape
        if lower_bound == 1:
            lower_bound -=0.01

        idx_dtype = np.int32

        nnz_max = M*ntop
        indptr = np.zeros(M+1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        if len(A.indices) > 0 and len(A.data) > 0 and len(A.indptr) > 0 and            len(B.indices) > 0 and len(B.data) > 0 and len(B.indptr) > 0:

            ct.sparse_dot_topn(
                M, N, 
                np.asarray(A.indptr, dtype=idx_dtype),
                np.asarray(A.indices, dtype=idx_dtype),
                A.data,
                np.asarray(B.indptr, dtype=idx_dtype),
                np.asarray(B.indices, dtype=idx_dtype),
                B.data,
                ntop,
                lower_bound,
                indptr, indices, data)

        return csr_matrix((data,indices,indptr),shape=(M,N))
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        e = "Error Type - " +str(exc_type)+ ", Error - " +str(e)+ ", Line No. - " +str(exc_tb.tb_lineno)
        print(e)
        raise InvalidUsage(str(e),status_code=500)
        
    

In [4]:
def ngrams(string, n=2):
    try:
        # Adding 0 for single value as padding, so it can pass bigram condition
        if len(string) <= 1:
            string = string + " 0"
        ngrams = zip(*[string[i:] for i in range(n)])
        return [''.join(ngram) for ngram in ngrams]
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        e = "Error Type - " +str(exc_type)+ ", Error - " +str(e)+ ", Line No. - " +str(exc_tb.tb_lineno)
        print(e)
        raise InvalidUsage(str(e),status_code=500)
        

In [5]:
def get_matches_df(sparse_matrix, name_vector, li,top=100):
    try:
        non_zeros = sparse_matrix.nonzero()

        sparserows = non_zeros[0]
        sparsecols = non_zeros[1]


        if top:
            nr_matches = top
        else:
            nr_matches = sparsecols.size

        #left_side = np.empty([nr_matches], dtype=object)
        left_index = np.empty([nr_matches], dtype=object)
        #right_side = np.empty([nr_matches], dtype=object)
        right_index = np.empty([nr_matches], dtype=object)
        similairity = np.zeros(nr_matches)

        for index in range(0, nr_matches):
            #left_side[index] = name_vector[sparserows[index]]
            left_index[index] = sparserows[index]
            #right_side[index] = li[sparsecols[index]]
            right_index[index] = sparsecols[index]
            similairity[index] = sparse_matrix.data[index] * 100
            if similairity[index].astype(int) <= 99:
                similairity[index] = similairity[index].astype(int) + 1
            else:
                similairity[index] = similairity[index].astype(int)

        return pd.DataFrame({#'left_side': left_side,
                             'left_index': left_index,
                             #'right_side': right_side,
                             'right_index': right_index,
                             'similairity': similairity})
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        e = "Error Type - " +str(exc_type)+ ", Error - " +str(e)+ ", Line No. - " +str(exc_tb.tb_lineno)
        print(e)
        raise InvalidUsage(str(e),status_code=500)
        
   

In [6]:
def vec(left,right):
    try:
        vectorizer = TfidfVectorizer(min_df=0,analyzer='char',ngram_range=(1,2),sublinear_tf=True,use_idf=False)
        tf_idf_matrix_left = vectorizer.fit_transform(left)
        tf_idf_matrix_right = vectorizer.transform(right).transpose()
        return [tf_idf_matrix_left,tf_idf_matrix_right]
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        e = "Error Type - " +str(exc_type)+ ", Error - " +str(e)+ ", Line No. - " +str(exc_tb.tb_lineno)
        print(e)
        raise InvalidUsage(str(e),status_code=500)
        
        

In [7]:
def cosine_inputfuntion(left_input,right_input,conf):
    try:
        vec_result = vec(left_input,right_input)
        #matches = cossim(vec_result[0], vec_result[1], len(right_input) + 1, conf/100)
        matches = cossim(vec_result[0], vec_result[1], len(right_input) + 1, conf/100)
        top = matches.nonzero()
        top = len(top[0])
        matches_df = get_matches_df(matches, left_input, right_input, top=top)
        return matches_df
    except Exception as e:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        e = "Error Type - " +str(exc_type)+ ", Error - " +str(e)+ ", Line No. - " +str(exc_tb.tb_lineno)
        print(e)
        raise InvalidUsage(str(e),status_code=500)



In [8]:
Data1 = pd.read_csv("C:/Users/harikamu/Desktop/Datamatch/FashionDataset.csv")
Data2 = pd.read_csv("C:/Users/harikamu/Desktop/Datamatch/myntra_products_catalog.csv")

In [9]:
df_tmp = pd.DataFrame()
df_tmp1 = pd.DataFrame()
mt_pct = 80
main_df = Data1[Data1['Deatils'].notna()].reset_index(drop=True)
if len(main_df) >= 5: 
    main_df = np.array_split(main_df, 5)
else:
    main_df = np.array_split(main_df, len(main_df))
left_input = Data2['Description']

while len(main_df) != 0:
    right_input = main_df[0].reset_index(drop=True)
    uni_df_score1 = cosine_inputfuntion(left_input,right_input['Deatils'],int(mt_pct))
    uni_df_score1 = pd.merge(uni_df_score1,right_input['Deatils'],how='left',left_on='right_index',right_on=right_input.index)

    uni_df_score2 = cosine_inputfuntion(right_input['Deatils'],left_input,int(mt_pct))
    uni_df_score2 = pd.merge(uni_df_score2,right_input['Deatils'],how='left',left_on='left_index',right_on=right_input.index)

    df_tmp = df_tmp.append(uni_df_score1)
    df_tmp1 = df_tmp1.append(uni_df_score2)
    main_df.pop(0)
df_tmp = pd.merge(df_tmp,df_tmp1[['right_index','left_index','similairity']],how='left',left_on=['left_index','right_index'],right_on=['right_index','left_index'])
df_tmp = df_tmp[~df_tmp['similairity_y'].isna()]
df_tmp['similairity'] = np.where((df_tmp['similairity_x'] <= df_tmp['similairity_y'])
                , df_tmp['similairity_x'], df_tmp['similairity_y'])
df_tmp.rename(columns={'left_index_x':'left_index'}, inplace=True)
df_tmp = pd.merge(df_tmp,left_input,how='left',left_on='left_index',right_on=left_input.index)[['Deatils','Description','similairity']]
df_tmp = df_tmp.sort_values('similairity',ascending=False)

In [10]:
df_tmp

Unnamed: 0,Deatils,Description,similairity
7093,1981 indigo w eau de toilette 100ml,1981 Indigo Eau de Toilette,92.0
1118,womens round neck printed t-shirt - black,Black printed T-shirt and has a round neck,89.0
1119,womens round neck printed t-shirt - black,Black printed T-shirt and has a round neck,89.0
1231,womens round neck printed t-shirt - black,White and Black printed T-shirt and has a roun...,88.0
3667,womens round neck printed t-shirt - anthra,Black printed T-shirt and has a round neck,88.0
...,...,...,...
1571,printed viscose blend round neck womens t-shir...,"White and black printed T-shirt, has a round n...",81.0
3992,printed mandarin collar straight womens kurta ...,"Black and Orange printed straight kurta, has a...",81.0
3993,printed mandarin collar straight fit womens ku...,"Black and Orange printed straight kurta, has a...",81.0
3994,printed mandarin collar straight fit womens ku...,"Black and Orange printed straight kurta, has a...",81.0
