In [25]:
import pandas as pd

train_df = pd.read_csv('data/train.csv', encoding='ISO-8859-1')
test_df = pd.read_csv('data/test.csv', encoding='ISO-8859-1')
product_descriptions_df = pd.read_csv('data/product_descriptions.csv', encoding='ISO-8859-1')
attributes_df = pd.read_csv('data/attributes.csv', encoding='ISO-8859-1')

print(train_df.head())
print(test_df.head())
print(product_descriptions_df.head())
print(attributes_df.head())

   id  product_uid                                      product_title  \
0   2       100001                  Simpson Strong-Tie 12-Gauge Angle   
1   3       100001                  Simpson Strong-Tie 12-Gauge Angle   
2   9       100002  BEHR Premium Textured DeckOver 1-gal. #SC-141 ...   
3  16       100005  Delta Vero 1-Handle Shower Only Faucet Trim Ki...   
4  17       100005  Delta Vero 1-Handle Shower Only Faucet Trim Ki...   

          search_term  relevance  
0       angle bracket       3.00  
1           l bracket       2.50  
2           deck over       3.00  
3    rain shower head       2.33  
4  shower only faucet       2.67  
   id  product_uid                      product_title  \
0   1       100001  Simpson Strong-Tie 12-Gauge Angle   
1   4       100001  Simpson Strong-Tie 12-Gauge Angle   
2   5       100001  Simpson Strong-Tie 12-Gauge Angle   
3   6       100001  Simpson Strong-Tie 12-Gauge Angle   
4   7       100001  Simpson Strong-Tie 12-Gauge Angle   

        

In [26]:
train_df = train_df.merge(product_descriptions_df, on='product_uid', how='left')
test_df = test_df.merge(product_descriptions_df, on='product_uid', how='left')

In [27]:
print(train_df.isnull().sum())
print(test_df.isnull().sum())

# Fill missing values in product descriptions with an empty string
train_df['product_description'].fillna('', inplace=True)
test_df['product_description'].fillna('', inplace=True)

id                     0
product_uid            0
product_title          0
search_term            0
relevance              0
product_description    0
dtype: int64
id                     0
product_uid            0
product_title          0
search_term            0
product_description    0
dtype: int64


In [28]:
import re
import nltk
from nltk.corpus import stopwords

# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply preprocessing
train_df['search_term'] = train_df['search_term'].apply(preprocess_text)
train_df['product_title'] = train_df['product_title'].apply(preprocess_text)
train_df['product_description'] = train_df['product_description'].apply(preprocess_text)

test_df['search_term'] = test_df['search_term'].apply(preprocess_text)
test_df['product_title'] = test_df['product_title'].apply(preprocess_text)
test_df['product_description'] = test_df['product_description'].apply(preprocess_text)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#Jaccard similarity
def jaccard_similarity(str1, str2):
    set1 = set(str1.split())
    set2 = set(str2.split())
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0

train_df['title_jaccard'] = train_df.apply(lambda x: jaccard_similarity(x['search_term'], x['product_title']), axis=1)
train_df['description_jaccard'] = train_df.apply(lambda x: jaccard_similarity(x['search_term'], x['product_description']), axis=1)

test_df['title_jaccard'] = test_df.apply(lambda x: jaccard_similarity(x['search_term'], x['product_title']), axis=1)
test_df['description_jaccard'] = test_df.apply(lambda x: jaccard_similarity(x['search_term'], x['product_description']), axis=1)

In [33]:
def word_overlap(str1, str2):
    set1 = set(str1.split())
    set2 = set(str2.split())
    return len(set1.intersection(set2))

train_df['title_overlap'] = train_df.apply(lambda x: word_overlap(x['search_term'], x['product_title']), axis=1)
train_df['description_overlap'] = train_df.apply(lambda x: word_overlap(x['search_term'], x['product_description']), axis=1)

test_df['title_overlap'] = test_df.apply(lambda x: word_overlap(x['search_term'], x['product_title']), axis=1)
test_df['description_overlap'] = test_df.apply(lambda x: word_overlap(x['search_term'], x['product_description']), axis=1)

In [38]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

X_train = train_df[['title_jaccard', 'description_jaccard', 'title_overlap', 'description_overlap']]
y_train = train_df['relevance']

models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor()
}

params_rf = {'n_estimators': [100, 200], 'max_depth': [10, 20]}
params_xgb = {'n_estimators': [100, 200], 'max_depth': [10, 20], 'learning_rate': [0.1, 0.01]}
params_lgb = {'n_estimators': [100, 200], 'max_depth': [10, 20], 'learning_rate': [0.1, 0.01]}

params = {
    'RandomForest': params_rf,
    'XGBoost': params_xgb,
    'LightGBM': params_lgb
}

from sklearn.model_selection import GridSearchCV

def train_model(model, params):
    grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    return grid_search

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

for model_name, model in models.items():
    grid_search = train_model(model, params[model_name])
    model = grid_search.best_estimator_
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    print(f'{model_name} RMSE: {rmse}')


RandomForest RMSE: 0.5115847814715844
