In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from joblib import dump, load
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge, Lasso
import tensorflow as tf
import warnings
from TransformationPipeline import TransformationPipeline
warnings.filterwarnings('ignore')

### **1. Loading Data**

In [2]:
# Load the raw dataset; the relative path is resolved against the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="data.csv")

In [30]:
# Preview the first five rows.
# NOTE: the stored output carries execution count 30, i.e. it was produced after
# the cleaning cell below had already run (no ID column, English category labels).
data.head(n=5)

Unnamed: 0,marka,naw3,9at3a,khochn,toul,3ordh,R,G,B,soum
1,HA,confy,Shoes,1.038333,46.4,54.0,128,23,27,180.9
2,Zen,confy,Shirt,1.021667,40.8,54.0,121,22,25,143.8
3,Bey&Bey,confy,Shirt,1.065,24.8,56.0,101,18,21,54.4
4,Armani,classy,Jacket,1.041667,72.0,61.0,146,26,30,345.9
5,Armani,sport,Shirt,1.038333,81.6,59.0,151,27,32,463.3


In [4]:
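# Disabled: optional renaming of the transliterated column names to English.
# Kept because it doubles as a column glossary. Note that it targets `df`,
# whereas the frame used throughout this notebook is `data`.
# df.rename(columns={'marka': 'Brand' ,"naw3" : "Class" , "9at3a" : "Category" , "toul" : "Height" , "khochn":"Depth", "3ordh": "Width","soum":"Price"}, inplace=True)
# df =df.drop(columns="ID")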

In [5]:
# Normalise the garment-category labels in '9at3a': every alternative spelling
# is mapped onto a single canonical label. No key is also a value, so one
# simultaneous replace is equivalent to applying the substitutions one by one.
category_aliases = {
    'kabbout': 'Jacket',
    'sabbat': 'Shoes',
    'maryoul': 'Shirt',
    't-shirt': 'T_shirt',
    'jacket': 'Jacket',
    'serwel': 'Pants',
}
data['9at3a'] = data['9at3a'].replace(category_aliases)

# Discard the categories that are excluded from modelling.
excluded_categories = ['hoodie', 'socks']
data = data[~data['9at3a'].isin(excluded_categories)]

# `data` is reassigned in place, so on a second execution of this cell the ID
# column is already gone; errors="ignore" keeps the cell re-runnable instead of
# raising KeyError.
data = data.drop(columns="ID", errors="ignore")

In [6]:
# Number of distinct values in each column after the label cleanup.
data.nunique(axis=0)

marka         5
naw3          7
9at3a         5
khochn      170
toul        256
3ordh       125
R           139
G            30
B            40
soum      10978
dtype: int64

### **2. Data Checks**

In [7]:
# Count missing values per column.
data.isnull().sum()

marka     0
naw3      0
9at3a     0
khochn    0
toul      0
3ordh     0
R         0
G         0
B         0
soum      0
dtype: int64

In [8]:
# Count rows that exactly repeat an earlier row.
# NOTE(review): the duplicates are only counted; none of the cells shown in this
# notebook removes them before the train/test split -- confirm that is intended.
data.duplicated(keep="first").sum()

366

In [9]:
# Print column dtypes, non-null counts and the memory footprint.
data.info(verbose=None)

<class 'pandas.core.frame.DataFrame'>
Index: 46290 entries, 0 to 48547
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   marka   46290 non-null  object 
 1   naw3    46290 non-null  object 
 2   9at3a   46290 non-null  object 
 3   khochn  46290 non-null  float64
 4   toul    46290 non-null  float64
 5   3ordh   46290 non-null  float64
 6   R       46290 non-null  int64  
 7   G       46290 non-null  int64  
 8   B       46290 non-null  int64  
 9   soum    46290 non-null  float64
dtypes: float64(4), int64(3), object(3)
memory usage: 3.9+ MB


In [10]:
# Distinct values per column (NaN excluded). This repeats the unique-count cell
# that follows the label cleanup; the frame has not changed in between.
data.nunique(dropna=True)

marka         5
naw3          7
9at3a         5
khochn      170
toul        256
3ordh       125
R           139
G            30
B            40
soum      10978
dtype: int64

In [11]:
# Summary statistics of the numeric columns, with the quartiles spelled out.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,khochn,toul,3ordh,R,G,B,soum
count,46290.0,46290.0,46290.0,46290.0,46290.0,46290.0,46290.0
mean,1.029034,64.06232,57.481488,135.830676,24.366775,27.920696,396.606492
std,0.023476,37.529743,2.231935,26.477142,4.950381,5.645137,399.668502
min,0.716667,16.0,43.0,0.0,0.0,0.0,32.6
25%,1.016667,32.0,56.0,112.0,20.0,23.0,95.1
50%,1.03,56.8,57.0,135.0,24.0,28.0,245.6
75%,1.041667,83.2,59.0,155.0,28.0,32.0,538.4
max,1.316667,280.8,95.0,229.0,255.0,255.0,1882.3


In [12]:
# Outlier trimming: a row is kept only when every measurement lies strictly
# inside its hand-picked (lower, upper) window; both ends are exclusive.
strict_bounds = {
    "3ordh": (51, 70),
    "khochn": (0.92, 1.16),
    "R": (100, 240),
    "G": (10, 50),
    "B": (17, 53),
}

# 'toul' only has an upper limit, so it seeds the mask.
keep_rows = data["toul"] < 260
for column, (lower, upper) in strict_bounds.items():
    keep_rows &= (data[column] > lower) & (data[column] < upper)

data = data.loc[keep_rows]

In [13]:
# 'soum' is the regression target; every other column is used as a feature.
y = data['soum']
X = data.drop(columns='soum')

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
(X_train.shape, X_test.shape)

((35686, 9), (8922, 9))

In [15]:
# Build the preprocessing object from the project-local TransformationPipeline.
# NOTE(review): `preprocess` receives the full `data` frame (target column and
# test rows included); presumably it only inspects the schema -- confirm that
# nothing is fitted on it, otherwise test information leaks into training.
preprocessor = TransformationPipeline().preprocess(d=data)
# Fit on the training split only, then apply the fitted transform to the test split.
# X_train / X_test are overwritten here, so this cell is not safe to re-run
# without first re-running the train/test split cell.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [16]:
# Persist the fitted preprocessor so it can be reloaded without refitting.
dump(preprocessor, filename="ClothesPreprocessor.joblib")

['ClothesPreprocessor.joblib']

In [17]:
# Candidate regressors, keyed by a short label; all use library defaults
# except CatBoost, whose training log is silenced.
models = {}
models['ridge'] = Ridge()
models['xgboost'] = XGBRegressor()
models['catboost'] = CatBoostRegressor(verbose=0)
models['lightgbm'] = LGBMRegressor()

In [18]:
# Fit every candidate on the preprocessed training split, in insertion order.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    print(f'{name} trained')

ridge trained
xgboost trained
catboost trained
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001853 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 35686, number of used features: 23
[LightGBM] [Info] Start training from score 409.089332
lightgbm trained


In [19]:
# Score every fitted model on the held-out split using R^2.
for name in models:
    model1 = models[name]
    predictions = model1.predict(X_test)
    print(name, r2_score(y_true=y_test, y_pred=predictions))

ridge 0.9340987759654636
xgboost 0.9800600349212047
catboost 0.9818340081940078
lightgbm 0.981493603663101


In [20]:
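# NOTE: saving the model is currently disabled, yet the stored output below
# (['Catboost.joblib']) comes from a run in which this line was active.
# Uncomment it to persist the fitted CatBoost model next to the preprocessor.
# dump(models["catboost"],"Catboost.joblib")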

['Catboost.joblib']