In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import numpy as np
import pandas as pd
from nltk.stem.porter import *
stemmer = PorterStemmer()
# Import pandas 
import numpy as np 
import pandas as pd

# Import time to measure how long processing takes
import time

# Import sklearn modules for model building
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer

# Import the Snowball stemmer for stemming operations
from nltk.stem.snowball import SnowballStemmer

# Import regex for cleaning and other data processing
import re

# TO see stop words 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer # Root / Base Words.
import random
from collections import Counter


import spacy
from spacy import displacy 
from collections import Counter
# Import matplotlib colors for color identification in queries
from matplotlib import colors as mcolors
import matplotlib.pyplot as plt

# Import requests and json for search requests to google
import requests, json

# Import StringIO to handle the attributes file formatting errors on import
from io import StringIO

In [None]:
# Load the four competition tables from the Kaggle input directory.
# `att` and `desc` are read with pandas' default encoding.
att = pd.read_csv('../input/significance-measure-of-product-search/attributes.csv')
desc = pd.read_csv('../input/significance-measure-of-product-search/product_descriptions.csv')
# train/test are explicitly decoded as ISO-8859-1 (Latin-1).
test = pd.read_csv('../input/significance-measure-of-product-search/test.csv', encoding='ISO-8859-1')
train = pd.read_csv('../input/significance-measure-of-product-search/train.csv', encoding='ISO-8859-1')
# Number of training rows, kept for later reference.
n_train = len(train)

In [None]:
%matplotlib ipympl

import matplotlib.pyplot as plt

**Finding Null Values**

In [None]:
for col in att.columns:
  print('{} - {} null values'.format(col, att[col].isna().values.sum()))

## Need to remove null values from `product_uid` and `name`

In [None]:
for col in desc.columns:
  print('{} - {} null values'.format(col, desc[col].isna().values.sum()))

In [None]:
for col in train.columns:
  print('{} - {} null values'.format(col, train[col].isna().values.sum()))

In [None]:
for col in test.columns:
  print('{} - {} null values'.format(col, test[col].isna().values.sum()))

**Train data set**

In [None]:
# DataFrame.info() prints its summary and returns None, so it must not be
# passed to display() (that only renders a stray "None").
train.info()
# Histogram of the target variable.
train.relevance.hist(color='c', alpha=0.5)
plt.show()
train.relevance.value_counts()

In [None]:
display(train.head(10)) 
print('Number of different product_uid in train set', train.product_uid.nunique())


**Product descriptions data set**

In [None]:
# info() prints directly and returns None; wrapping it in display() only adds "None".
desc.info()

In [None]:
display(desc.head(10))

**Attributes data set**

In [None]:
# info() prints directly and returns None; wrapping it in display() only adds "None".
att.info()
print('number of different product_uid in the attributes set:', att.product_uid.nunique())

In [None]:
display(att.head(10))

In [None]:
att.name.value_counts()

**Test Data Set**

In [None]:
test.info()
print('Number of different product_uid in the test set:', test.product_uid.nunique())

## Data Cleaning

In [None]:
#Set the maximum number of columns that can be displayed to make debugging easier
import pandas as pd
pd.set_option('display.max_columns', 50)

In [None]:
#First, we will separate the brand data from the attribute set:
brand = att[att.name == "MFG Brand Name"][["product_uid", "value"]].rename(columns={"value": "brand"}) 
print("Data about the brand (brand) is extracted from the attribute set: ")
brand.head()

**Merging the attributes, description, and brand features: concatenate the attribute names and values from the attributes CSV and merge them into the train CSV using a left join.**

In [None]:
def merge_attributes(df):
  """Attach one `combined_att` text column holding all attributes of each product.

  For every product_uid present in `df`, the rows of the module-level `att`
  frame are turned into "<name> <value>" strings and joined with single spaces
  (in their original row order). The result is left-joined onto `df`, so
  products without attributes get NaN.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a `product_uid` column.

  Returns
  -------
  pandas.DataFrame
      Copy of `df` (same rows, same index) with an extra `combined_att` column.
  """
  product_uids = df['product_uid'].values
  temp = att.loc[att['product_uid'].isin(product_uids), ['product_uid', 'name', 'value']].fillna('')
  # Cast to str so that numeric attribute values cannot break the concatenation.
  name_value = temp['name'].astype(str) + ' ' + temp['value'].astype(str)
  # Aggregate once per product instead of materialising the joined string on
  # every attribute row and de-duplicating afterwards.
  combined = (
      name_value.groupby(temp['product_uid'], sort=False)
      .agg(' '.join)
      .rename('combined_att')
      .reset_index()
  )
  return pd.merge(df, combined, on='product_uid', how='left').set_index(df.index)

train1= merge_attributes(train)
print(train1.shape)
train1.head()

In [None]:
def merge_brand(df):
  """Attach a `brand` column taken from the 'MFG Brand Name' attribute.

  Looks up the module-level `att` frame for rows whose `name` is
  'MFG Brand Name' and left-joins their `value` onto `df` as `brand`.
  Products without a brand attribute get NaN.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a `product_uid` column.

  Returns
  -------
  pandas.DataFrame
      Copy of `df` (same rows, same index) with an extra `brand` column.
  """
  product_uids = df['product_uid'].values
  is_brand = (att['name'] == 'MFG Brand Name') & att['product_uid'].isin(product_uids)
  brands = (
      att.loc[is_brand, ['product_uid', 'value']]
      # Keep a single brand per product: duplicate keys on the right side would
      # multiply rows in the left join and make set_index(df.index) fail.
      .drop_duplicates('product_uid')
      .rename(columns={'value': 'brand'})
  )
  return pd.merge(df, brands, on='product_uid', how='left').set_index(df.index)

train1 = merge_brand(train)
print(train1.shape)
train1.head()

In [None]:
def merge_description(df):
  """Attach `product_description` and split words that were glued together.

  Left-joins the module-level `desc` frame onto `df` by `product_uid`, then
  re-tokenises each description so that a capital letter inside a token starts
  a new word (e.g. "frame.Easy" -> "frame. Easy"); runs of whitespace collapse
  to single spaces as a side effect of the re-join.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a `product_uid` column and no `product_description` column.

  Returns
  -------
  pandas.DataFrame
      Copy of `df` (same rows, same index) with a `product_description` column.
      Products missing from `desc` keep NaN.
  """
  df = pd.merge(df, desc, on='product_uid', how='left').set_index(df.index)

  def _split_words(text):
    # Products without a description come back from the left join as NaN
    # (a float); leave those untouched instead of crashing in re.findall.
    if not isinstance(text, str):
      return text
    return ' '.join(re.findall(r'[A-Z]?[^A-Z\s]+|[A-Z]+', text))

  #an extra preprocessing step is performed to separate the concatenated words in the description.
  df['product_description'] = df['product_description'].apply(_split_words)
  return df

train1 = merge_description(train)
print(train1.shape)
train1.head()

In [None]:
train1= merge_attributes(train)
train1 = merge_brand(train1)
train1 = merge_description(train1)
print(train1.shape)
train1.head()

**Filling Null Values**

In [None]:
for col in train1.columns:
  print('{} - {} null values'.format(col, train1[col].isna().values.sum()))

In [None]:
unique_brands = np.unique(train1['brand'].dropna().values)

## Filling null values

In [None]:
def first_n(n, sent):
  """Return the first `n` whitespace-separated words of `sent` joined by spaces.

  If `sent` contains fewer than `n` words the sentinel string 'error101' is
  returned instead.
  """
  words = sent.split()
  if len(words) < n:
    return 'error101'
  return ' '.join(words[:n])

def fillna_brand(data, unique_brands):
  """Fill missing `brand` values by guessing the brand from the product title.

  For each row whose `brand` is null, the first 4, then 3, then 2 words of
  `product_title` are tried against `unique_brands`; the first match wins.
  If none matches, the first word of the title is used.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with `brand` and `product_title` columns. Modified in place.
  unique_brands : iterable of str
      Known brand names.

  Returns
  -------
  pandas.DataFrame
      The same `data` object, with the null brands filled.
  """
  # A set gives O(1) lookups (membership on a numpy array scans every element).
  known_brands = set(unique_brands)
  missing = data['brand'].isnull()

  def _guess_brand(title):
    for n_words in (4, 3, 2):
      candidate = first_n(n_words, title)
      if candidate in known_brands:
        return candidate
    return first_n(1, title)

  if missing.any():
    # Single .loc assignment on the frame itself: chained assignment such as
    # frame['brand'].loc[i] = ... writes to a temporary copy and is not
    # guaranteed to reach the original frame.
    guesses = data.loc[missing, 'product_title'].map(_guess_brand)
    data.loc[missing, 'brand'] = guesses.to_numpy()
  return data

def fillna_attributes(data):
  """Fill missing `combined_att` values with the row's `product_description`.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with `combined_att` and `product_description` columns.
      Modified in place.

  Returns
  -------
  pandas.DataFrame
      The same `data` object, with null `combined_att` entries replaced.
  """
  missing = data['combined_att'].isnull()
  # One .loc assignment on the frame itself; chained assignment
  # (frame['col'].loc[idx] = ...) may only update a temporary copy.
  data.loc[missing, 'combined_att'] = data.loc[missing, 'product_description']
  return data

train1 = fillna_brand(train1, unique_brands)
train1 = fillna_attributes(train1)

In [None]:
#to check null values
for col in train1.columns:
  print('{} - {} null values'.format(col, train1[col].isna().values.sum()))
print(train1.shape)
train1.head()

In [None]:
import re

In [None]:
df=train1
df.head()

In [None]:
# Write a regular expression to extract both lower and uppercase 
df.dtypes

In [None]:
# Create a function to remove digits,special character 
def string_edit(s:str):
    """Normalise a free-text product string for matching.

    Lower-cases the text, strips or replaces punctuation, separates digits from
    letters, rewrites measurement units to a canonical abbreviation
    (e.g. "12 volts" -> "12volt. ") and replaces the standalone words
    "zero" .. "nine" with their digits.

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text, or the literal string "null" when `s` is not a
        string (e.g. NaN).
    """
    if not isinstance(s, str):
        # Return a "null" string if the parameter supplied is not a string
        return "null"

    # Split sentences that were glued together ("frame.Easy" -> "frame Easy").
    s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s)
    s = s.lower()
    s = s.replace("  "," ")
    s = s.replace(",","")
    s = s.replace("$"," ")
    s = s.replace("?"," ")
    s = s.replace("-"," ")
    s = s.replace("//","/")
    s = s.replace("..",".")
    # " / " is deliberately kept: removing it would turn fractions into whole numbers.
    s = s.replace(" \\ "," ")
    s = s.replace("."," . ")
    s = re.sub(r"(\.|/)$", r"", s)
    # Separate digits from adjacent letters ("5ft" -> "5 ft").
    s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
    s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
    s = re.sub(r"([a-z])( *)\.( *)([a-z])", r"\1 \4", s)
    s = re.sub(r"([a-z])( *)/( *)([a-z])", r"\1 \4", s)
    s = s.replace("*"," x ")
    s = s.replace(" by "," x ") #search term (100047)
    # Re-attach decimals that the " . " padding above pulled apart.
    s = re.sub(r"([0-9])( *)\.( *)([0-9])", r"\1.\4", s)

    # Consolidate variations of equivalent unit terms
    # NOTE(review): a single quote is mapped to inches and a double quote to
    # feet, which is the reverse of the usual convention - confirm intent.
    s = re.sub(r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. ", s)
    s = re.sub(r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. ", s)
    s = re.sub(r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. ", s)
    s = re.sub(r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(cubic|cu) ?\.?(feet|foot|ft)\.?", r"\1cu.ft. ", s)
    s = re.sub(r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. ", s)
    s = re.sub(r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. ", s)
    s = re.sub(r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. ", s)
    # Accept the correct spelling as well as the original misspelling.
    s = re.sub(r"([0-9]+)( *)(millimeters|milimeters|mm)\.?", r"\1mm. ", s)
    s = s.replace("°"," degrees ")
    s = re.sub(r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. ", s)
    s = s.replace(" v "," volts ")
    # The alternation must be unescaped: "\|" would match a literal pipe.
    s = re.sub(r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. ", s)
    s = re.sub(r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. ", s)
    s = re.sub(r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. ", s)
    s = s.replace("  "," ")
    s = s.replace(" . "," ")

    # Handling numeric instances with common identifiers.
    # Word boundaries keep words that merely contain a number word intact
    # ("stone", "height", "network").
    number_words = ("zero", "one", "two", "three", "four",
                    "five", "six", "seven", "eight", "nine")
    for digit, word in enumerate(number_words):
        s = re.sub(r"\b" + word + r"\b\.?", str(digit) + " ", s)

    return s

In [None]:
df['product_description'] = df['product_description'].apply(string_edit).str.lower()
df['product_title'] = df['product_title'].apply(string_edit).str.lower()
df['combined_att'] = df['combined_att'].apply(string_edit).str.lower()
df['brand'] = df['brand'].apply(string_edit).str.lower()
df.head()

In [None]:
from textblob import TextBlob
from textblob import Word
#from textblob.wordnet import VERB
from textblob.classifiers import NaiveBayesClassifier
import nltk 
nltk.download('brown')
!pip install pyspellchecker
from spellchecker import SpellChecker
spell = SpellChecker()
from nltk.corpus import stopwords # Import the stop word list
#stop_w = set(stopwords.words('english'))
from bs4 import BeautifulSoup

In [None]:
# Count every "<br" occurrence. The previous pattern '<br$' was anchored to the
# end of the string, so it could only ever count a tag in the last position.
print('Total {} html tags contains in product description'.format(df.product_description.str.count('<br').values.sum()))
print('Total {} html tags contains in combine_attr'.format(df.combined_att.str.count('<br').values.sum()))
# print('Total {} html tags contains in producttile'.format(train2.product_title.str.count('<br$').values.sum()))
# print('Total {} html tags contains in term'.format(train2.search_term.str.count('<br$').values.sum()))

In [None]:
df[df.product_description.str.contains("<br")].values.tolist()[-2:] #check the html link

In [None]:
# use Beautifulsoup lib to remove html tags in text
def remove_html_tag(text):
    """Strip HTML markup and the return-policy boilerplate sentence from `text`.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML tags.

    Returns
    -------
    str
        The text content as extracted by BeautifulSoup (lxml parser), with the
        "Click here to review our return policy ..." sentence removed.
    """
    soup = BeautifulSoup(text, 'lxml')
    boilerplate = 'Click here to review our return policy for additional information regarding returns'
    # Match case-insensitively: the columns are lower-cased by string_edit
    # before this function runs, so an exact-case replace would never match.
    return re.sub(re.escape(boilerplate), '', soup.get_text(), flags=re.IGNORECASE)

In [None]:
 #apply the remove html fun for prod_desc and attri (101,13)
df['product_description'] = df['product_description'].apply(remove_html_tag)
df['combined_att'] = df['combined_att'].apply(remove_html_tag)

## Removing stop words and lemmatisation

In [None]:
import spacy

# disable chained assignments
pd.options.mode.chained_assignment = None

## Removal

In [None]:
# !python -m spacy download en_core_web_sm (One time)
import spacy
from spacy import displacy 
from collections import Counter
import en_core_web_sm # en --> English pre-trained 
# sm --> Small Model || md --> Medium Model || lg --> Large Model
# Loading large model in python:  --> gensim, tokenize & then lemmi.
nlp = spacy.load('en_core_web_sm',disable=['parser','ner'])

In [None]:
stopwords = nlp.Defaults.stop_words
print(len(stopwords))
#stopwords.remove('not')
#stopwords.update('not only')

In [1]:
# set.update() iterates over its argument, so passing a bare string such as
# 'bullet 01' adds the single characters 'b', 'u', ..., ' ', '0', '1' (which
# then removes one-character tokens such as digits). Pass an iterable of whole
# phrases instead.
bullet_phrases = ['bullet {:02d}'.format(number) for number in range(1, 25)]
stopwords.update(bullet_phrases)
# clean() compares whitespace-separated tokens against the stop list, so a
# two-word phrase can never match there; also register the bare token.
stopwords.add('bullet')

NameError: name 'stopwords' is not defined

## run only once
stopwords.update('bullet 01')
stopwords.update('bullet 02')
stopwords.update('bullet 03')
stopwords.update('bullet 04')
stopwords.update('bullet 05')

stopwords.update('bullet 06')
stopwords.update('bullet 07')
stopwords.update('bullet 08')
stopwords.update('bullet 09')
stopwords.update('bullet 10')
stopwords.update('bullet 11')
stopwords.update('bullet 12')
stopwords.update('bullet 13')
stopwords.update('bullet 14')
stopwords.update('bullet 15')
stopwords.update('bullet 16')
stopwords.update('bullet 17')
stopwords.update('bullet 18')
stopwords.update('bullet 19')
stopwords.update('bullet 20')
stopwords.update('bullet 21')
stopwords.update('bullet 22')

In [None]:
print(len(stopwords))

In [None]:
stop = set(stopwords)

In [None]:
!conda install -c asmeurer pattern -y
import pattern
from pattern.en import lemma

In [None]:
def clean(x):
    """Lower-case `x`, drop stop words and lemmatise the remaining tokens.

    Tokens are obtained by splitting on whitespace; tokens found in the
    module-level `stop` collection are discarded, every other token is passed
    through `lemma`. The surviving lemmas are joined with single spaces.
    """
    kept = []
    for token in x.lower().split():
        if token in stop:
            continue
        kept.append(lemma(token))
    return " ".join(kept)

In [None]:
df['product_description'] = df['product_description'].apply(clean)
df['product_title'] = df['product_title'].apply(clean)
df['combined_att'] = df['combined_att'].apply(clean)
df['brand'] = df['brand'].apply(clean)

In [None]:
df.head(30)

## Jaccard distance

In [None]:
import keras as kr
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from keras.layers import *
import spacy
import re
import math 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [None]:
pip install textdistance

In [None]:
import textdistance

# (column-name suffix, text column) pairs compared against the search term.
similarity_targets = [
    ("desc", "product_description"),   # with description
    ("title", "product_title"),        # with title
    ("brand", "brand"),                # with brand
    ("CA", "combined_att"),            # with combined attr
]

for suffix, text_column in similarity_targets:
    # Pair values positionally with zip: per-element lookups such as
    # df[col][i] are label-based (they only work with a default RangeIndex)
    # and are much slower than plain iteration.
    pairs = list(zip(df["search_term"], df[text_column]))
    df["jaccard_similar_" + suffix] = [textdistance.jaccard(query, text) for query, text in pairs]
    df["levenshtein_similarz_" + suffix] = [textdistance.levenshtein(query, text) for query, text in pairs]

In [None]:
df.head()

In [None]:
features_le = df.copy()
features_le.to_csv(r'/kaggle/working/features_ht.csv', index=False)

## Text Embedding

In [None]:
import xgboost
import gensim
from time import time
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess, tokenize
from nltk.corpus import brown
# Train a Word2Vec model on the sentences of the NLTK Brown corpus, using the
# library's default hyper-parameters.
embed_model = gensim.models.Word2Vec(brown.sents())
# Persist the trained model to disk, then load it back as `model`, the name
# used by the similarity features below.
embed_model.save('brown.embedding')
model = gensim.models.Word2Vec.load('brown.embedding')

In [None]:
%%time
def embeding_similarity_calculator(s, t, i):
    """Average best-match embedding similarity between the words of `s` and `t`.

    For every whitespace-separated word of `s`, the highest
    `model.wv.similarity` to any word of `t` is taken (floored at 0, and 0 when
    either word is missing from the embedding vocabulary); the result is the
    mean of these maxima over all words of `s`.

    Parameters
    ----------
    s : str
        Query text (e.g. the search term).
    t : str
        Target text (e.g. a product description).
    i : int
        Row position; unused, kept for backward compatibility with callers.

    Returns
    -------
    number
        0 when `s` has no words, otherwise the averaged similarity.
    """
    query_words = s.split()
    if len(query_words) == 0:
        return 0

    # Whether a target word is in the vocabulary does not depend on the query
    # word, so filter the target words once instead of inside the inner loop.
    target_words = [t_word for t_word in t.split() if t_word in model.wv]

    _sum = 0
    for s_word in query_words:
        if s_word not in model.wv:
            # Out-of-vocabulary query words contribute a similarity of 0.
            continue
        _max = 0
        for t_word in target_words:
            _max = max(_max, model.wv.similarity(s_word, t_word))
        _sum += _max
    return _sum / len(query_words)
df["word_ebed_similarity_pd"] = [embeding_similarity_calculator(df["search_term"][i], df["product_description"][i], i) for i in range(0, len(df))]

In [None]:
df.head()

In [None]:
df["word_ebed_similarity_brand"] = [embeding_similarity_calculator(df["search_term"][i], df["brand"][i], i) for i in range(0, len(df))]

In [None]:
df["word_ebed_similarity_combiatt"] = [embeding_similarity_calculator(df["search_term"][i], df["combined_att"][i], i) for i in range(0, len(df))]

In [None]:
df["word_ebed_similarity_pt"] = [embeding_similarity_calculator(df["search_term"][i], df["product_title"][i], i) for i in range(0, len(df))]

In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Character tri-gram TF-IDF (within word boundaries), capped at 1500 features.
# The vocabulary is fitted on the product descriptions and re-used for the
# search terms so both matrices share the same feature space.
tfidf_vect = TfidfVectorizer(analyzer='char_wb', ngram_range = (3,3), max_features = 1500)
tfidf_des = tfidf_vect.fit_transform(df.product_description).toarray()
tfidf_search = tfidf_vect.transform(df.search_term).toarray()

# Use context managers so the file handles are flushed and closed.
with open("tfidf_des", 'wb') as outfile:
    pickle.dump(tfidf_des, outfile)
with open("tfidf_search", 'wb') as outfile:
    pickle.dump(tfidf_search, outfile)

In [None]:
# Use transform(), not fit_transform(): refitting here would replace the
# vocabulary that tfidf_search was built with, so the cosine distances between
# the search-term vectors and these vectors would compare unrelated features.
tfidf_prodtitle = tfidf_vect.transform(df.product_title).toarray()
with open("tfidf_prodtitle", 'wb') as outfile:
    pickle.dump(tfidf_prodtitle, outfile)

In [None]:
# TF-IDF vectors of the combined attribute text, using the already fitted vectorizer.
tfidf_combiatt = tfidf_vect.transform(df.combined_att).toarray()
# Context manager guarantees the pickle file is flushed and closed.
with open("tfidf_combiatt", 'wb') as outfile:
    pickle.dump(tfidf_combiatt, outfile)

In [None]:
# TF-IDF vectors of the brand text, using the already fitted vectorizer.
tfidf_brand = tfidf_vect.transform(df.brand).toarray()
# Context manager guarantees the pickle file is flushed and closed.
with open("tfidf_brand", 'wb') as outfile:
    pickle.dump(tfidf_brand, outfile)

In [None]:
import spacy
from scipy.spatial import distance
df["tfidf_cosine_distance"] = [distance.cosine(tfidf_search[i], tfidf_des[i]) for i in range(0, len(tfidf_des))]

In [None]:
df["tfidf_cosine_distance_title"] = [distance.cosine(tfidf_search[i], tfidf_prodtitle[i]) for i in range(0, len(tfidf_prodtitle))]

In [None]:
df["tfidf_cosine_distance_comb"] = [distance.cosine(tfidf_search[i], tfidf_combiatt[i]) for i in range(0, len(tfidf_combiatt))]

In [None]:
df["tfidf_cosine_distance_brand"] = [distance.cosine(tfidf_search[i], tfidf_brand[i]) for i in range(0, len(tfidf_brand))]

In [None]:
# Pickle the current feature frame. At this point `features_le` still holds the
# snapshot copied before the embedding and TF-IDF distance columns were added
# to `df`, so dump `df` itself. The context manager closes the file.
with open("features_18_4", 'wb') as outfile:
    pickle.dump(df, outfile)

In [None]:
features_le = df.copy()
features_le.to_csv(r'/kaggle/working/features_ht.csv', index=False)

In [None]:
features_le.to_csv('features_ht.csv', index=False)

## Model Preparation

In [None]:
import pandas as pd
#features_le = pd.DataFrame(features_le).fillna(0)
# Keep the training rows and drop the raw text columns in one step. Previously
# the row slice was assigned and then immediately overwritten by the drop on
# the full frame, so it had no effect. n_train (= len(train)) replaces the
# hard-coded row count 74067.
mod_train = df.iloc[:n_train].drop(columns=['product_title','product_description','brand','search_term','combined_att'])
#mod_test = features_le.iloc[74067:]
#y_train = mod_train['relevance']
#mod_test = mod_test.drop(columns=['product_title','product_description','brand','search_term', 'relevance','combined_attr'])

In [None]:
mod_train.to_csv(r'/kaggle/working/mod_train.csv', index=False)
mod_train.to_csv('mod_train.csv', index=False)

In [None]:
mod_train.head()

In [None]:
mod_train = mod_train.sort_values('product_uid')

In [None]:
print(mod_train.corr())

# Using Pycaret

In [None]:
conda install -c conda-forge pycaret

In [None]:
from pycaret.regression import *

In [None]:
clf1 = setup(data = mod_train, target = 'relevance',train_size = 0.7, fold_shuffle=True, session_id = 2)

In [None]:
best= compare_models()

In [None]:
print(best)

In [None]:
%matplotlib inline

evaluate_model(best)

In [None]:
plot_model(best, plot = 'residuals')

In [None]:
plot_model(best, plot = 'feature')

In [None]:
plot_model(best, plot = 'learning curve')

In [None]:
predict_model(best)

predictions = predict_model(best, data=mod_train)
predictions.head()

save_model(best, 'my_best_pipeline')

### Test Data set

In [None]:
test = test.drop('id', axis = 1)

In [None]:
test = pd.merge(test,desc, how = 'left', on = 'product_uid')
print(test.shape)
test.head()

In [None]:
test1= merge_attributes(test)
print(test1.shape)
test1.head()

In [None]:
test1 = merge_brand(test)
print(test1.shape)
test1.head()

In [None]:
# `test` already received a product_description column from the earlier merge
# with `desc`; merging again would yield product_description_x/_y and make
# merge_description fail with a KeyError. Drop the existing column first
# (errors='ignore' keeps the cell re-runnable if the column is absent).
test1 = merge_description(test.drop(columns=['product_description'], errors='ignore'))
print(test1.shape)
test1.head()