In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler,FunctionTransformer,OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import pandas as pd
import numpy as np

In [3]:
df1 = pd.read_csv('fastFashionCompDim.csv',sep='|')
df1.head()

Unnamed: 0,item_code,part_name,material,percent
0,200000,EXTERIOR,algodon,100%
1,200001,EXTERIOR,algodon,100%
2,200002,EXTERIOR,viscosa,62%
3,200002,EXTERIOR,fibra metalizada,37%
4,200002,EXTERIOR,elastano,1%


In [4]:
df2 = pd.read_csv('fastFasionItemsDim.csv',sep='|')
df2

Unnamed: 0,item_code,item_name,item_desc,join_life,joinlife_title,joinlife_desc,item_price
0,200000,CAMISA POPELÍN,"""Camisa de cuello solapa y escote pico. Manga ...",True,JOIN LIFE Care for fiber: 100% algodon organico.,"""Algodon cultivado utilizando fertilizantes y ...",1995
1,200001,CAMISA POPELÍN,"""Camisa de cuello solapa y escote pico. Manga ...",True,JOIN LIFE Care for fiber: 100% algodon organico.,"""Algodon cultivado utilizando fertilizantes y ...",1995
2,200002,BLUSA HILO METALIZADO,"""Blusa semitransparente de cuello solapa y esc...",False,,,3995
3,200003,BLUSA SATINADA ALAMARES,"""Blusa de cuello subido y escote pico. Manga l...",False,,,2995
4,200004,BLUSA ESTAMPADA CROPPED,"""Blusa satinada de cuello solapa y manga larga...",False,,,1995
...,...,...,...,...,...,...,...
271,500033,PANTALÓN PITILLO,"""Pantalon de tiro medio. Cintura con elastico ...",True,JOIN LIFE Care for fiber: al menos 25% poliest...,"""Esta fibra se obtiene a partir del reciclaje ...",1299
272,500034,PANTALÓN CINTURÓN RAFIA,"""Pantalon de tiro alto con cintura elastica. B...",False,,,1599
273,500035,PANTALÓN ESTAMPADO,"""Pantalon de tiro alto. Cintura elastica ajust...",False,,,1599
274,500036,PANTALÓN GARDEN,"""The Garden Pant In Grey.<br/><br/>Pantalon de...",False,,,1599


In [5]:
merged_df = pd.merge(df1, df2, on='item_code', how='inner')

# Select and rearrange the required columns
df = merged_df[['part_name', 'material', 'join_life', 'item_price', 'percent']]

df.head()

Unnamed: 0,part_name,material,join_life,item_price,percent
0,EXTERIOR,algodon,True,1995,100%
1,EXTERIOR,algodon,True,1995,100%
2,EXTERIOR,viscosa,False,3995,62%
3,EXTERIOR,fibra metalizada,False,3995,37%
4,EXTERIOR,elastano,False,3995,1%


In [6]:
df.duplicated().sum()

217

In [7]:
df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [8]:
df.shape

(240, 5)

In [9]:
df = df[df['material'] != 'camel']

In [10]:
df.shape

(240, 5)

In [11]:
translation_dict = {
    'algodon': 'cotton',
    'viscosa': 'viscose',
    'fibra metalizada': 'fiber',
    'elastano': 'elastane',
    'poliester': 'polyester',
    'lino': 'linen',
    'liocel': 'lyocell',
    'poliamida': 'polyamide',
    'nailon': 'nylon',
    'lana': 'wool',
    'acrilico': 'acrylic',
    'camello': 'camel',
    'cupro': 'cupro',
    'modal': 'modal'
}

# Replace Spanish words with English words in the 'material' column
df.loc[:, 'material'] = df['material'].replace(translation_dict)

# Display the updated DataFrame
df.tail()

Unnamed: 0,part_name,material,join_life,item_price,percent
449,EXTERIOR,elastane,True,1299,2%
452,EXTERIOR,cotton,False,1599,97%
454,EXTERIOR,polyester,False,1299,67%
455,EXTERIOR,viscose,False,1299,29%
456,EXTERIOR,elastane,False,1299,4%


In [12]:
df['percent'] = df['percent'].str.rstrip('%').astype(int)

In [13]:
df.head()

Unnamed: 0,part_name,material,join_life,item_price,percent
0,EXTERIOR,cotton,True,1995,100
2,EXTERIOR,viscose,False,3995,62
3,EXTERIOR,fiber,False,3995,37
4,EXTERIOR,elastane,False,3995,1
5,EXTERIOR,viscose,False,2995,100


In [14]:
df.loc[:, 'percent'] = 100 - df['percent']
df.head()

Unnamed: 0,part_name,material,join_life,item_price,percent
0,EXTERIOR,cotton,True,1995,0
2,EXTERIOR,viscose,False,3995,38
3,EXTERIOR,fiber,False,3995,63
4,EXTERIOR,elastane,False,3995,99
5,EXTERIOR,viscose,False,2995,0


In [15]:
df['part_name'] = df['part_name'].replace('FORRO', 'INTERIOR')

In [16]:
df = df[["part_name", "join_life", "material", "item_price","percent"]]
df.head()

Unnamed: 0,part_name,join_life,material,item_price,percent
0,EXTERIOR,True,cotton,1995,0
2,EXTERIOR,False,viscose,3995,38
3,EXTERIOR,False,fiber,3995,63
4,EXTERIOR,False,elastane,3995,99
5,EXTERIOR,False,viscose,2995,0


In [17]:
df = df.rename(columns={'join_life': 'eco_friendly', 'percent': 'rewards'})
df.head()

Unnamed: 0,part_name,eco_friendly,material,item_price,rewards
0,EXTERIOR,True,cotton,1995,0
2,EXTERIOR,False,viscose,3995,38
3,EXTERIOR,False,fiber,3995,63
4,EXTERIOR,False,elastane,3995,99
5,EXTERIOR,False,viscose,2995,0


In [18]:

for i in df.columns:
    print(df[i].unique())

['EXTERIOR' 'INTERIOR']
[ True False]
['cotton' 'viscose' 'fiber' 'elastane' 'polyester' 'linen' 'lyocell'
 'polyamide' 'nylon' 'wool' 'acrylic' 'camel' 'cupro' 'modal']
[1995 3995 2995 1299 1599 2595 2295 1999  799  999 1795 1595 1295  595
  795  995  599]
[ 0 38 63 99 48 58 94  3 97 21 82 26 74 45 60 95 47 53 23 81 96 42 70 88
 41 89 29 77 40 65 46 55  1 56  4  8 92 37 67 10 90 78 79 36 66 98 69 30
 34 68 22  2 50 32 72 24 75 51 52  5 12 93 61 84 80 71 54 83 35 76 49 64
 33 25 85]


In [19]:
X = df.iloc[:,0:4]
X.head()

Unnamed: 0,part_name,eco_friendly,material,item_price
0,EXTERIOR,True,cotton,1995
2,EXTERIOR,False,viscose,3995
3,EXTERIOR,False,fiber,3995
4,EXTERIOR,False,elastane,3995
5,EXTERIOR,False,viscose,2995


In [20]:
 y = df.iloc[:,-1]
y.head()

0     0
2    38
3    63
4    99
5     0
Name: rewards, dtype: int32

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,random_state=2)

In [23]:
for i in X_train.columns:
    print(X_train[i].unique())

['EXTERIOR' 'INTERIOR']
[False  True]
['elastane' 'viscose' 'acrylic' 'cotton' 'polyester' 'lyocell' 'polyamide'
 'nylon' 'fiber' 'modal' 'camel' 'linen' 'wool' 'cupro']
[2995  995 1599 1795 1995 1295 2595  999 1299 2295 3995  599 1595  795
 1999  595  799]


In [22]:
material_order = [
    "acrylic",         # Most harmful
    "polyester",
    "elastane",
    "nylon",
    "polyamide",
    "wool",
    "camel",
    "fiber",          # General term, placed moderately
    "viscose",
    "cupro",
    "modal",
    "cotton",         # Regular cotton
    "lyocell",
    "linen"               #   # Least harmful
]

In [23]:
# ColumnTransformer setup
transformer = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse_output=False,drop='first'), [0, 1,2]),
        # ('ordinal_material', OrdinalEncoder(categories=[material_order]), [2]),
        ('scaler_item_price', MinMaxScaler(), ['item_price']),
         # Categorical encoding for 'material'
    ],
    remainder='passthrough'
)

In [24]:
from sklearn import set_config
set_config(display='diagram')

In [25]:
# Define the Ridge Regression model
ridge = Ridge()

# Define the hyperparameter grid for alpha (regularization strength)
param_grid = {'alpha': [0.001,0.01, 0.1, 1, 10, 100]}

# Use GridSearchCV to find the best alpha
grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)

In [27]:
# Create a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', transformer),
    ('regressor', grid_search)
])

In [28]:
# Train the pipeline
pipeline.fit(X_train, y_train)

In [29]:
y_pred = pipeline.predict(X_test)

In [32]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print(f'R² score: {r2}')

R² score: 0.5923806516136935


In [33]:
import pickle
# Save the trained pipeline with ColumnTransformer to a pickle file
with open('model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [30]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import mean_squared_error

# # Define the Decision Tree model
# decision_tree = DecisionTreeRegressor(random_state=42)

# # Define the hyperparameter grid
# param_grid = {
#     'max_depth': [3, 5, 10, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Use GridSearchCV to find the best parameters
# grid_search = GridSearchCV(estimator=decision_tree, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)

# # Fit the grid search to the training data
# grid_search.fit(X_train, y_train)

# # Retrieve the best model
# best_tree_model = grid_search.best_estimator_

# # Print the best hyperparameters and the corresponding score
# print("Best parameters:", grid_search.best_params_)
# print("Best CV R^2 score:", grid_search.best_score_)

# # Evaluate on the test data
# test_r2_score = best_tree_model.score(X_test, y_test)
# test_mse = mean_squared_error(y_test, best_tree_model.predict(X_test))

# print("Test R^2 score:", test_r2_score)
# print("Test MSE:", test_mse)


In [31]:
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import mean_squared_error

# # Define the KNN Regressor model
# knn = KNeighborsRegressor()

# # Define the hyperparameter grid
# param_grid = {
#     'n_neighbors': [3, 5, 7, 10],  # Number of neighbors
#     'weights': ['uniform', 'distance'],  # Weighting function
#     'p': [1, 2]  # Distance metric: 1=Manhattan, 2=Euclidean
# }

# # Use GridSearchCV to find the best parameters
# grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)

# # Fit the grid search to the training data
# grid_search.fit(X_train, y_train)

# # Retrieve the best model
# best_knn_model = grid_search.best_estimator_

# # Print the best hyperparameters and the corresponding score
# print("Best parameters:", grid_search.best_params_)
# print("Best CV R^2 score:", grid_search.best_score_)

# # Evaluate on the test data
# test_r2_score = best_knn_model.score(X_test, y_test)
# test_mse = mean_squared_error(y_test, best_knn_model.predict(X_test))

# print("Test R^2 score:", test_r2_score)
# print("Test MSE:", test_mse)
