#### Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_log_error
import scipy

#### Load data

In [2]:
train_df = pd.read_csv('data/train.tsv', sep='\t', low_memory=True)
print(train_df.shape)
print(train_df.info())
display(train_df.head())

(1482535, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1482535 non-null  int64  
 1   name               1482535 non-null  object 
 2   item_condition_id  1482535 non-null  int64  
 3   category_name      1476208 non-null  object 
 4   brand_name         849853 non-null   object 
 5   price              1482535 non-null  float64
 6   shipping           1482535 non-null  int64  
 7   item_description   1482529 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB
None


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [3]:
test_df = pd.read_csv('data/test.tsv', sep='\t', low_memory=True)
print(test_df.shape)
print(test_df.info())
display(test_df.head())

(693359, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693359 entries, 0 to 693358
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   test_id            693359 non-null  int64 
 1   name               693359 non-null  object
 2   item_condition_id  693359 non-null  int64 
 3   category_name      690301 non-null  object
 4   brand_name         397834 non-null  object
 5   shipping           693359 non-null  int64 
 6   item_description   693359 non-null  object
dtypes: int64(3), object(4)
memory usage: 37.0+ MB
None


Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


#### Combine train and test data

In [4]:
all_df = pd.concat([train_df.drop(["price"], axis=1), test_df],axis=0).reset_index(drop=True)

#### Scale 'price'

In [None]:
target = np.log1p(train_df["price"].values)

#### Handle missing values

In [5]:
all_df["category_name"].fillna("NaN", inplace=True)
all_df["brand_name"].fillna("None", inplace=True)
all_df["item_description"].fillna("No description yet", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["category_name"].fillna("NaN", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  all_df["brand_name"].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [6]:
drop_brand_list = all_df["brand_name"].value_counts().index[300:]
all_df["brand_name"] = all_df["brand_name"].map(lambda brand: "None" if brand in drop_brand_list else brand)

In [7]:
all_df["last_category_name"] = all_df["category_name"].map(lambda text: "NaN" if text == "NaN" else str(text).split("/")[-1])

In [8]:
all_df["brand_name"] = all_df["brand_name"].astype("category")
all_df["item_condition_id"] = all_df["item_condition_id"].astype("category")
all_df["shipping"] = all_df["shipping"].astype("category")

In [9]:
count_name = CountVectorizer(min_df=10)
X_name = count_name.fit_transform(all_df["name"])

In [10]:
tfidf_description = TfidfVectorizer(max_features=200, stop_words="english", ngram_range=(1,3))
X_description = tfidf_description.fit_transform(all_df["item_description"])

In [12]:
X_dummies = scipy.sparse.csr_matrix(pd.get_dummies(all_df[["item_condition_id", "shipping", "last_category_name", "brand_name"]], sparse=True).values)

In [13]:
X = scipy.sparse.hstack((X_name, X_description, X_dummies)).tocsr()

In [14]:
shape = train_df.shape[0]
train = X[:shape]
test = X[shape:]
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.3, random_state=42)

In [15]:
ridge = Ridge()
param_grid = {'alpha': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_log_error')
grid_search.fit(X_train, y_train)

In [16]:
best_ridge = grid_search.best_estimator_
print(f"Best alpha: {grid_search.best_params_}")

Best alpha: {'alpha': 10}


In [23]:
y_val_pred = best_ridge.predict(X_val)
# rmsle = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
# print(f"RMSLE: {rmsle}")

RMSLE: 0.12687610398181118


In [24]:
RMSLE = np.sqrt(np.mean(np.square(y_val_pred - y_val)))
print(f"RMSLE: {RMSLE}")

RMSLE: 0.494789924391536
