In [728]:
import pandas as pd
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import Birch
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
from catboost import Pool
from sklearn.metrics import accuracy_score

In [729]:
user_data = pd.read_csv("../data/customers.csv").dropna()
actions_data = pd.read_csv("../data/dataset.csv").dropna().astype(int)
cars_data = pd.read_csv("../data/cars_about.csv").dropna()

## Preprocess user data

In [730]:
user_data

Unnamed: 0,customer_id,gender,married,age,graduated,profession,familySize
0,45,Male,No,22,No,Healthcare,4.0
1,817,Female,Yes,38,Yes,Engineer,3.0
2,495,Female,Yes,67,Yes,Engineer,1.0
3,36,Male,Yes,67,Yes,Lawyer,2.0
4,76,Female,Yes,40,Yes,Entertainment,6.0
...,...,...,...,...,...,...,...
315,353,Male,Yes,74,Yes,Lawyer,2.0
316,411,Female,No,30,No,Homemaker,4.0
317,297,Female,No,37,Yes,Artist,2.0
318,186,Female,No,22,No,Marketing,1.0


In [731]:
# Identify the categorical columns
cat_cols = ['gender', 'married', 'graduated', 'profession']

# Convert the categorical columns to numerical using Label Encoding
for col in cat_cols:
    le = LabelEncoder()
    user_data[col] = le.fit_transform(user_data[col])

user_data

Unnamed: 0,customer_id,gender,married,age,graduated,profession,familySize
0,45,1,0,22,0,5,4.0
1,817,0,1,38,1,2,3.0
2,495,0,1,67,1,2,1.0
3,36,1,1,67,1,7,2.0
4,76,0,1,40,1,3,6.0
...,...,...,...,...,...,...,...
315,353,1,1,74,1,7,2.0
316,411,0,0,30,0,6,4.0
317,297,0,0,37,1,0,2.0
318,186,0,0,22,0,8,1.0


## Preprocess cars data

In [732]:
cars_data = pd.read_csv("../data/cars_about.csv").dropna()

cars_data["brand"] = cars_data.car_model.apply(lambda a: a.split()[0])
cars_data = cars_data.drop(columns=["car_model", "engine", "car_id"])
cars_data["price"] = cars_data['price'].apply(lambda a: int(a[1:]))

cars_data['used_label'] = cars_data["used_label"].apply(lambda a: 1 if a=="Used" else 0)
# Identify the categorical columns
car_cat_cols = ['exteriorColor', 'interiorColor', 'drivetrain', 'fuelType', 'transmission', 'brand']

# Convert the categorical columns to numerical using Label Encoding
for col in car_cat_cols:
    le = LabelEncoder()
    cars_data[col] = le.fit_transform(cars_data[col])

cars_data

Unnamed: 0,used_label,price,exteriorColor,interiorColor,drivetrain,minMPG,maxMPG,fuelType,transmission,mileage,brand
0,1,39,139,11,4,19.0,27.0,3,20,29403.0,28
1,1,49,145,11,3,19.0,24.0,3,2,32929.0,10
2,1,41,62,11,3,15.0,21.0,3,20,23173.0,25
3,1,28,66,83,4,29.0,35.0,3,28,10598.0,13
4,1,49,53,10,4,20.0,27.0,3,20,28137.0,19
...,...,...,...,...,...,...,...,...,...,...,...
399,1,33,46,11,3,18.0,25.0,3,20,43807.0,16
400,0,30,163,57,2,34.0,30.0,5,28,41944.0,28
401,1,35,11,11,2,22.0,29.0,3,24,49386.0,5
402,1,35,168,49,3,16.0,22.0,3,9,115795.0,6


## Preprocess actions data

In [733]:
actions_data = actions_data.rename(columns = {"item_id":"car_id"})

In [734]:
actions_pivot_table = pd.pivot_table(actions_data.sort_values(by="car_id"), values='interaction', index='user_id', columns='car_id').fillna(0)

In [735]:
actions_data.car_id.nunique()

109

In [736]:
px.imshow(actions_pivot_table)

In [737]:
actions_data = pd.melt(actions_pivot_table.reset_index(), id_vars='user_id', value_vars=actions_pivot_table.columns).rename(columns={"value":"interaction"})

In [738]:
actions_data.interaction.unique()

array([1, 0])

In [739]:
actions_data

Unnamed: 0,user_id,car_id,interaction
0,0,0,1
1,1,0,0
2,2,0,0
3,3,0,0
4,4,0,0
...,...,...,...
8170,70,402,0
8171,71,402,0
8172,72,402,0
8173,73,402,0


In [740]:
actions_data.interaction = actions_data.interaction.apply(lambda a: 1 if a > 0.5 else 0)

In [741]:
actions_data

Unnamed: 0,user_id,car_id,interaction
0,0,0,1
1,1,0,0
2,2,0,0
3,3,0,0
4,4,0,0
...,...,...,...
8170,70,402,0
8171,71,402,0
8172,72,402,0
8173,73,402,0


## CatBoost inputs

1. собираю фичи топ n понравившихся авто
2. конкатенирую фичи понравившихся авто с таргет авто
3. делаю бинарную классификацию 

In [742]:
N_POSITIVE = 5

In [743]:
cars_data

Unnamed: 0,used_label,price,exteriorColor,interiorColor,drivetrain,minMPG,maxMPG,fuelType,transmission,mileage,brand
0,1,39,139,11,4,19.0,27.0,3,20,29403.0,28
1,1,49,145,11,3,19.0,24.0,3,2,32929.0,10
2,1,41,62,11,3,15.0,21.0,3,20,23173.0,25
3,1,28,66,83,4,29.0,35.0,3,28,10598.0,13
4,1,49,53,10,4,20.0,27.0,3,20,28137.0,19
...,...,...,...,...,...,...,...,...,...,...,...
399,1,33,46,11,3,18.0,25.0,3,20,43807.0,16
400,0,30,163,57,2,34.0,30.0,5,28,41944.0,28
401,1,35,11,11,2,22.0,29.0,3,24,49386.0,5
402,1,35,168,49,3,16.0,22.0,3,9,115795.0,6


In [744]:
column_names = []
cat_columns = ['used_label', 'exteriorColor', 'interiorColor', 'drivetrain', 'fuelType', 'transmission', 'brand']
cat_features = []

for index in range(N_POSITIVE):
    for column in cars_data.columns:
        column_names.append(f"car_{index}_{column}")
        
for column in cars_data.columns:
    column_names.append(f"target_{column}")

for column in column_names:
    for c_column in cat_columns:
        if c_column in column:
            cat_features.append(column)
        
dataset = pd.DataFrame(columns=column_names + ["interaction"])

for user_id in actions_data.user_id.unique():
    for target_car_id in actions_data.car_id.unique():
        # len(cars_data.columns) * (N_POSITIVE + 1))
        matrix_features = []
        features = []
        current_positive_samples = actions_data.query(f"user_id == {user_id}").query(f"car_id != {target_car_id}").query("interaction == 1")
        # print()
        if len(current_positive_samples) < N_POSITIVE: continue

        for car_id in current_positive_samples.sample(N_POSITIVE).car_id.to_list():
            matrix_features.append(cars_data.iloc[car_id].to_list())
        matrix_features.append(cars_data.iloc[target_car_id].to_list())
        matrix_features.append(int(actions_data.loc[actions_data.car_id == target_car_id].query(f"user_id == {user_id}").interaction))
        
        for item in matrix_features:
            if isinstance(item, list):
                for elem in item:
                    features.append(elem)
            else:
                features.append(item)
        # print(features)
        dataset = pd.concat([dataset, pd.DataFrame.from_records([dict(zip(dataset.columns, features))])])

# dataset = dataset.astype(int)

In [745]:
dataset

Unnamed: 0,car_0_used_label,car_0_price,car_0_exteriorColor,car_0_interiorColor,car_0_drivetrain,car_0_minMPG,car_0_maxMPG,car_0_fuelType,car_0_transmission,car_0_mileage,...,target_exteriorColor,target_interiorColor,target_drivetrain,target_minMPG,target_maxMPG,target_fuelType,target_transmission,target_mileage,target_brand,interaction
0,1.0,35.0,168.0,49.0,3.0,16.0,22.0,3.0,9.0,115795.0,...,139.0,11.0,4.0,19.0,27.0,3.0,20.0,29403.0,28.0,1
0,1.0,23.0,173.0,8.0,2.0,18.0,27.0,3.0,9.0,99697.0,...,62.0,11.0,3.0,15.0,21.0,3.0,20.0,23173.0,25.0,0
0,1.0,23.0,11.0,83.0,5.0,17.0,23.0,3.0,6.0,105469.0,...,66.0,83.0,4.0,29.0,35.0,3.0,28.0,10598.0,13.0,0
0,1.0,42.0,76.0,74.0,5.0,19.0,28.0,3.0,20.0,10236.0,...,53.0,10.0,4.0,20.0,27.0,3.0,20.0,28137.0,19.0,0
0,1.0,18.0,85.0,68.0,5.0,21.0,30.0,3.0,15.0,58157.0,...,11.0,83.0,5.0,17.0,23.0,3.0,6.0,105469.0,28.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1.0,39.0,139.0,11.0,4.0,19.0,27.0,3.0,20.0,29403.0,...,49.0,44.0,4.0,20.0,26.0,3.0,20.0,24793.0,17.0,1
0,1.0,25.0,180.0,83.0,5.0,15.0,22.0,0.0,9.0,135228.0,...,11.0,11.0,2.0,21.0,28.0,3.0,24.0,10769.0,22.0,0
0,1.0,39.0,139.0,11.0,4.0,19.0,27.0,3.0,20.0,29403.0,...,56.0,35.0,2.0,19.0,26.0,3.0,20.0,22492.0,10.0,0
0,1.0,39.0,139.0,11.0,4.0,19.0,27.0,3.0,20.0,29403.0,...,90.0,25.0,4.0,26.0,33.0,3.0,28.0,23320.0,23.0,0


In [746]:
dataset.shape

(6931, 67)

In [747]:
dataset.interaction.value_counts()

0    6438
1     493
Name: interaction, dtype: int64

In [748]:
dataset

Unnamed: 0,car_0_used_label,car_0_price,car_0_exteriorColor,car_0_interiorColor,car_0_drivetrain,car_0_minMPG,car_0_maxMPG,car_0_fuelType,car_0_transmission,car_0_mileage,...,target_exteriorColor,target_interiorColor,target_drivetrain,target_minMPG,target_maxMPG,target_fuelType,target_transmission,target_mileage,target_brand,interaction
0,1.0,35.0,168.0,49.0,3.0,16.0,22.0,3.0,9.0,115795.0,...,139.0,11.0,4.0,19.0,27.0,3.0,20.0,29403.0,28.0,1
0,1.0,23.0,173.0,8.0,2.0,18.0,27.0,3.0,9.0,99697.0,...,62.0,11.0,3.0,15.0,21.0,3.0,20.0,23173.0,25.0,0
0,1.0,23.0,11.0,83.0,5.0,17.0,23.0,3.0,6.0,105469.0,...,66.0,83.0,4.0,29.0,35.0,3.0,28.0,10598.0,13.0,0
0,1.0,42.0,76.0,74.0,5.0,19.0,28.0,3.0,20.0,10236.0,...,53.0,10.0,4.0,20.0,27.0,3.0,20.0,28137.0,19.0,0
0,1.0,18.0,85.0,68.0,5.0,21.0,30.0,3.0,15.0,58157.0,...,11.0,83.0,5.0,17.0,23.0,3.0,6.0,105469.0,28.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1.0,39.0,139.0,11.0,4.0,19.0,27.0,3.0,20.0,29403.0,...,49.0,44.0,4.0,20.0,26.0,3.0,20.0,24793.0,17.0,1
0,1.0,25.0,180.0,83.0,5.0,15.0,22.0,0.0,9.0,135228.0,...,11.0,11.0,2.0,21.0,28.0,3.0,24.0,10769.0,22.0,0
0,1.0,39.0,139.0,11.0,4.0,19.0,27.0,3.0,20.0,29403.0,...,56.0,35.0,2.0,19.0,26.0,3.0,20.0,22492.0,10.0,0
0,1.0,39.0,139.0,11.0,4.0,19.0,27.0,3.0,20.0,29403.0,...,90.0,25.0,4.0,26.0,33.0,3.0,28.0,23320.0,23.0,0


In [749]:
px.bar(dataset.interaction.value_counts())

In [924]:
balanced_dataset = pd.concat([dataset.loc[dataset.interaction == 1], dataset.loc[dataset.interaction == 0].sample(int(dataset.interaction.value_counts()[1] * 2))])

In [925]:
px.bar(balanced_dataset.interaction.value_counts())

In [926]:
# px.bar(balanced_dataset.interaction.value_counts())

In [927]:
# from sklearn.preprocessing import MinMaxScaler
# x = balanced_dataset.values #returns a numpy array
# min_max_scaler = MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# df = pd.DataFrame(x_scaled)

In [928]:
from sklearn.model_selection import train_test_split
# Split into training and validation sets
# balanced_dataset = (balanced_dataset-balanced_dataset.min())/(balanced_dataset.max()-balanced_dataset.min())
train_df, val_df = train_test_split(balanced_dataset.astype(int), test_size=0.2, random_state=42)
X_train, y_train = train_df.drop(columns=['interaction']), train_df[["interaction"]]

# Split into validation and test sets
val_df, test_df = train_test_split(val_df, test_size=0.2, random_state=42)

X_val, y_val = val_df.drop(columns=['interaction']), val_df[["interaction"]]
X_test, y_test = test_df.drop(columns=['interaction']), test_df[["interaction"]]

In [929]:
X_train.shape, X_val.shape, X_test.shape

((1183, 66), (236, 66), (60, 66))

In [930]:
X_train

Unnamed: 0,car_0_used_label,car_0_price,car_0_exteriorColor,car_0_interiorColor,car_0_drivetrain,car_0_minMPG,car_0_maxMPG,car_0_fuelType,car_0_transmission,car_0_mileage,...,target_price,target_exteriorColor,target_interiorColor,target_drivetrain,target_minMPG,target_maxMPG,target_fuelType,target_transmission,target_mileage,target_brand
0,1,107,11,11,3,14,19,3,2,28000,...,20,2,44,4,21,34,3,9,73467,13
0,1,51,11,11,2,21,28,3,24,10769,...,44,11,11,2,18,25,3,36,38336,3
0,1,18,85,68,5,21,30,3,15,58157,...,35,15,43,4,24,31,3,28,19527,15
0,1,24,147,13,3,17,22,3,6,151989,...,4,166,11,2,18,24,3,26,213000,17
0,0,33,11,83,5,23,34,3,24,35825,...,16,66,11,5,19,28,3,9,48605,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0,38,120,56,4,19,28,3,24,14494,...,65,122,82,2,118,107,3,1,32600,27
0,1,36,114,44,4,20,27,3,20,47729,...,65,122,82,2,118,107,3,1,32600,27
0,1,37,93,62,2,21,26,3,2,5188,...,71,19,20,2,0,0,6,33,10500,3
0,1,18,139,11,3,14,19,0,9,110111,...,44,11,11,2,18,25,3,36,38336,3


In [931]:
X_train.columns

Index(['car_0_used_label', 'car_0_price', 'car_0_exteriorColor',
       'car_0_interiorColor', 'car_0_drivetrain', 'car_0_minMPG',
       'car_0_maxMPG', 'car_0_fuelType', 'car_0_transmission', 'car_0_mileage',
       'car_0_brand', 'car_1_used_label', 'car_1_price', 'car_1_exteriorColor',
       'car_1_interiorColor', 'car_1_drivetrain', 'car_1_minMPG',
       'car_1_maxMPG', 'car_1_fuelType', 'car_1_transmission', 'car_1_mileage',
       'car_1_brand', 'car_2_used_label', 'car_2_price', 'car_2_exteriorColor',
       'car_2_interiorColor', 'car_2_drivetrain', 'car_2_minMPG',
       'car_2_maxMPG', 'car_2_fuelType', 'car_2_transmission', 'car_2_mileage',
       'car_2_brand', 'car_3_used_label', 'car_3_price', 'car_3_exteriorColor',
       'car_3_interiorColor', 'car_3_drivetrain', 'car_3_minMPG',
       'car_3_maxMPG', 'car_3_fuelType', 'car_3_transmission', 'car_3_mileage',
       'car_3_brand', 'car_4_used_label', 'car_4_price', 'car_4_exteriorColor',
       'car_4_interiorColor', '

In [932]:
# Create CatBoost model
model = CatBoostClassifier(iterations=5000, 
                           learning_rate=0.05, 
                           eval_metric='AUC',
                           custom_metric=['Accuracy', 'Precision', 'Recall', 'F1', 'TotalF1'],
                           loss_function='CrossEntropy', 
                           cat_features=cat_features, 
                           random_seed=42)

# Fit the model on the training data
model.fit(X_train, y_train, 
          eval_set=(X_val, y_val), 
          early_stopping_rounds=50,
          verbose=10)

# Make predictions on the test data
y_pred = model.predict(X_train)


iteritems is deprecated and will be removed in a future version. Use .items instead.



0:	test: 0.5833750	best: 0.5833750 (0)	total: 2.82ms	remaining: 14.1s
10:	test: 0.6982399	best: 0.7021605 (8)	total: 39.2ms	remaining: 17.8s
20:	test: 0.6978645	best: 0.7040791 (11)	total: 74.2ms	remaining: 17.6s
30:	test: 0.6990324	best: 0.7040791 (11)	total: 108ms	remaining: 17.3s
40:	test: 0.6965299	best: 0.7040791 (11)	total: 144ms	remaining: 17.5s
50:	test: 0.7007007	best: 0.7040791 (11)	total: 179ms	remaining: 17.4s
60:	test: 0.6991992	best: 0.7040791 (11)	total: 216ms	remaining: 17.5s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7040790791
bestIteration = 11

Shrink model to first 12 iterations.


In [933]:
model.save_model("../weights/catboost")

In [934]:
submission_table = pd.DataFrame(dict(gt = y_train.interaction, pred = y_pred))

In [935]:
submission_table

Unnamed: 0,gt,pred
0,1,0
0,1,0
0,0,0
0,0,0
0,0,0
...,...,...
0,0,0
0,0,0
0,0,0
0,0,0


In [936]:
n_correct = 0
for i in range(len(submission_table)):
    current_row = list(submission_table.iloc[i])
    if current_row[0] == current_row[1]:
        n_correct += 1
print(f"test accurracy: {n_correct/len(y_pred)}")

test accurracy: 0.7802197802197802


In [937]:
from sklearn import metrics
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba)

# The histogram of scores compared to true labels
fig_hist = px.histogram(
    x=y_pred_proba, color=y_test.interaction.to_list(), nbins=50,
    labels=dict(color='True Labels', x='Score')
)

fig_hist.show()


# Evaluating model performance at various thresholds
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"

fig_thresh = px.line(
    df, title='TPR and FPR at every threshold',
    width=700, height=500
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()


iteritems is deprecated and will be removed in a future version. Use .items instead.



In [938]:
from sklearn.metrics import auc
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f}) ::: CatBoost',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

In [939]:
import pandas as pd
import numpy as np
import plotly.express as px
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import Birch
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
from catboost import Pool
from sklearn.metrics import accuracy_score

cars_data = pd.read_csv("../data/cars_about.csv").dropna()
encoded_cars_data = pd.read_csv("../data/cars_data_prepared.csv")

class CatPredictor:
    def __init__(self, catboost_path="../weights/catboost"):
        self.catboost = CatBoostClassifier().load_model(catboost_path)
        self.N_POSITIVE = 5
        
        
    def _preprocess_user_interactions(self, user_interactions):
        encoded_cars = encoded_cars_data[encoded_cars_data['car_id'].isin(user_interactions.car_id)].drop(columns=["car_id"])
        encoded_cars = encoded_cars.sample(self.N_POSITIVE)
        current_interactions = pd.DataFrame(
            dict(
                (f"car_{k}_{column}", [encoded_cars[column].iloc[k]]) for k in range(self.N_POSITIVE) \
                    for column in encoded_cars.columns
            )
        )
        return current_interactions
    
    def _create_batch(self, user_interactions):
        renamed_data = encoded_cars_data.rename(
            columns = dict(
                zip(
                    encoded_cars_data.columns, [f'target_{column}' for column in encoded_cars_data.columns]
                    )
                )
            ).drop(columns=["target_car_id"])
        
        # pd.concat([user_interactions]*len(renamed_data)).join(renamed_data)
        batch = pd.concat(
            [pd.concat([user_interactions]*len(renamed_data), ignore_index=True), renamed_data],
            axis=1
            )

        return batch
    
    def predict(self, user_interactions_path="../data/user_interactions.csv"):
        user_interactions = pd.read_csv(user_interactions_path)
        assert len(user_interactions) >= self.N_POSITIVE
        user_interactions = self._preprocess_user_interactions(user_interactions)
        batch = self._create_batch(user_interactions)
        return self.catboost.predict(batch)

catpred = CatPredictor()
preds = catpred.predict()


iteritems is deprecated and will be removed in a future version. Use .items instead.



In [940]:
preds

array([1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
cars_data = pd.read_csv("../data/cars_about.csv").dropna()

cars_data["brand"] = cars_data.car_model.apply(lambda a: a.split()[0])
cars_data = cars_data.drop(columns=["car_model", "engine", "car_id"])
cars_data["price"] = cars_data['price'].apply(lambda a: int(a[1:]))

cars_data['used_label'] = cars_data["used_label"].apply(lambda a: 1 if a=="Used" else 0)
# Identify the categorical columns
car_cat_cols = ['exteriorColor', 'interiorColor', 'drivetrain', 'fuelType', 'transmission', 'brand']

# Convert the categorical columns to numerical using Label Encoding
for col in car_cat_cols:
    le = LabelEncoder()
    cars_data[col] = le.fit_transform(cars_data[col])

cars_data


column_names = []
cat_columns = ['used_label', 'exteriorColor', 'interiorColor', 'drivetrain', 'fuelType', 'transmission', 'brand']
cat_features = []

for index in range(N_POSITIVE):
    for column in cars_data.columns:
        column_names.append(f"car_{index}_{column}")
        
for column in cars_data.columns:
    column_names.append(f"target_{column}")

for column in column_names:
    for c_column in cat_columns:
        if c_column in column:
            cat_features.append(column)

true_table = np.zeros((actions_data.user_id.nunique(), actions_data.car_id.unique().max()))
pred_table = true_table.copy()

dataset = pd.DataFrame(columns=column_names + ["interaction"])

for user_index, user_id in enumerate(actions_data.user_id.unique()):
    for target_car_index, target_car_id in enumerate(actions_data.car_id.unique()):
        # len(cars_data.columns) * (N_POSITIVE + 1))
        matrix_features = []
        features = []
        current_positive_samples = actions_data.query(f"user_id == {user_id}").query(f"car_id != {target_car_id}").query("interaction == 1")
        # print()
        if len(current_positive_samples) < N_POSITIVE: continue

        for car_id in current_positive_samples.sample(N_POSITIVE).car_id.to_list():
            matrix_features.append(cars_data.iloc[car_id].to_list())
        matrix_features.append(cars_data.iloc[target_car_id].to_list())
        matrix_features.append(int(actions_data.loc[actions_data.car_id == target_car_id].query(f"user_id == {user_id}").interaction))
        
        for item in matrix_features:
            if isinstance(item, list):
                for elem in item:
                    features.append(elem)
            else:
                features.append(item)

        true_table[user_index, target_car_id-1] = features[-1]
        pred_table[user_index, target_car_id-1] = model.predict(
            pd.DataFrame.from_records(
            [dict(zip(dataset.columns[:-1], features[:-1]))]
        ).astype(int))

        dataset = pd.concat([
            dataset, 
            pd.DataFrame.from_records(
            [dict(zip(dataset.columns, features))]
        )])

dataset = dataset.astype(int)

: 

In [961]:
px.imshow(true_table, width=1600, height=400)

In [962]:
px.imshow(pred_table, width=1600, height=400)

In [947]:
catpred.predict()


iteritems is deprecated and will be removed in a future version. Use .items instead.



array([0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,