In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from sklearn.metrics import roc_auc_score, accuracy_score

  from pandas.core import (


In [2]:
data_path = "data/"

# Read the five CSV tables in one pass; the order of the file names below
# matches the order of the variables on the left-hand side.
sales, items, item_categories, shops, test = (
    pd.read_csv(data_path + file_name)
    for file_name in (
        'sales_train.csv',
        'items.csv',
        'item_categories.csv',
        'shops.csv',
        'test.csv',
    )
)

In [3]:
# Data preprocessing
def preprocess_data_for_users(sales, items):
    """Build a per-(user, item) purchase history enriched with item details.

    There is no real user identifier in the sales table, so a simulated one
    is derived from ``shop_id`` (``shop_id % 100``) for demonstration.

    Args:
        sales: DataFrame with at least the columns ``shop_id``, ``item_id``
            and ``item_cnt_day``. It is NOT modified by this function.
        items: DataFrame with an ``item_id`` column plus any item attributes
            to attach to the result.

    Returns:
        DataFrame with one row per (``user_id``, ``item_id``) pair, a
        ``total_purchases`` column (summed ``item_cnt_day`` rescaled with
        ``MinMaxScaler`` fitted on the whole column), and the columns of
        ``items`` joined on ``item_id``.
    """
    # Derive the simulated user id on a reduced copy so the caller's frame
    # does not silently gain a 'user_id' column.
    purchases = sales[['item_id', 'item_cnt_day']].assign(
        user_id=sales['shop_id'] % 100  # Simulated user ID for example
    )

    # Aggregate user purchase history
    user_history = (
        purchases
        .groupby(['user_id', 'item_id'])
        .agg({'item_cnt_day': 'sum'})
        .reset_index()
        .rename(columns={'item_cnt_day': 'total_purchases'})
    )

    # Normalize purchases (one scaler fitted across all user-item pairs)
    user_history['total_purchases'] = MinMaxScaler().fit_transform(
        user_history[['total_purchases']]
    )

    # Merge with item details; a left join keeps pairs whose item is missing from `items`
    return user_history.merge(items, on='item_id', how='left')


# Build the per-(user, item) purchase history from the raw sales and item tables.
user_history = preprocess_data_for_users(sales, items)

In [4]:
# Build the user-item interaction matrix
def create_user_item_matrix(user_history):
    """Reshape the long purchase history into a wide user x item table.

    Args:
        user_history: DataFrame with the columns ``user_id``, ``item_id`` and
            ``total_purchases``; each (user_id, item_id) pair must appear at
            most once, otherwise ``pivot`` raises.

    Returns:
        DataFrame indexed by ``user_id`` with one column per ``item_id``;
        missing cells are filled with 0.
    """
    interactions = user_history.pivot(
        index='user_id',
        columns='item_id',
        values='total_purchases',
    )
    # Pairs absent from the history come out of the pivot as NaN; store them as 0.
    return interactions.fillna(0)

# Wide user x item table; cells with no recorded purchases are 0.
user_item_matrix = create_user_item_matrix(user_history)


In [5]:
# Neural network for recommendations
def build_recommendation_model(num_users, num_items, embedding_dim=50,
                               hidden_units=(128, 64), dropout_rate=0.2):
    """Build and compile a two-tower embedding network scoring (user, item) pairs.

    Each of the two integer inputs is embedded, the embeddings are flattened
    and concatenated, and the result goes through a stack of
    Dense(relu) + Dropout layers ending in a single sigmoid unit.

    Args:
        num_users: Size of the user embedding table (``input_dim``).
        num_items: Size of the item embedding table (``input_dim``).
        embedding_dim: Width of both embedding vectors. Defaults to 50.
        hidden_units: Widths of the hidden Dense layers, in order.
            Defaults to ``(128, 64)``.
        dropout_rate: Dropout rate applied after every hidden Dense layer.
            Defaults to 0.2.

    Returns:
        A Keras ``Model`` taking ``[user_input, item_input]`` (each of shape
        ``(1,)``) and producing one sigmoid output, compiled with the Adam
        optimizer, binary cross-entropy loss and the accuracy metric.
    """
    user_input = Input(shape=(1,), name='user_input')
    item_input = Input(shape=(1,), name='item_input')

    user_embedding = Embedding(
        input_dim=num_users, output_dim=embedding_dim, name='user_embedding'
    )(user_input)
    item_embedding = Embedding(
        input_dim=num_items, output_dim=embedding_dim, name='item_embedding'
    )(item_input)

    # Drop the length-1 sequence axis of each embedding before joining them.
    user_flatten = Flatten()(user_embedding)
    item_flatten = Flatten()(item_embedding)
    hidden = Concatenate()([user_flatten, item_flatten])

    # One Dense + Dropout pair per requested hidden width.
    for units in hidden_units:
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    output = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[user_input, item_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [6]:
# Prepare the training data
user_ids = user_item_matrix.index.values
item_ids = user_item_matrix.columns.values

# Map every raw id to its position in the matrix; the positions are what the
# model's embedding layers are fed.
user_idx = dict(zip(user_ids, range(len(user_ids))))
item_idx = dict(zip(item_ids, range(len(item_ids))))

# Long format: one row per cell of the user x item matrix.
user_item_pairs = user_item_matrix.stack().reset_index()
user_item_pairs.columns = ['user_id', 'item_id', 'interaction']

# Replace raw ids by their positions.
for id_column, position_of in (('user_id', user_idx), ('item_id', item_idx)):
    user_item_pairs[id_column] = user_item_pairs[id_column].map(position_of)
del id_column, position_of  # keep the notebook namespace free of loop leftovers

X_users = user_item_pairs['user_id'].to_numpy()
X_items = user_item_pairs['item_id'].to_numpy()
y_interactions = user_item_pairs['interaction'].to_numpy()

In [7]:
# Put X_users and X_items side by side in one 2-D array (one column each)
# so that both are shuffled and split together.
X_combined = np.stack((X_users, X_items), axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_combined,
    y_interactions,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Split the two-column arrays back into separate user and item index vectors
# (column 0 holds user positions, column 1 holds item positions).
X_train_users, X_train_items = X_train.T
X_valid_users, X_valid_items = X_valid.T

In [9]:
# Build and train the recommendation model
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across Restart & Run All.
SEED = 42  # same value as the random_state of the train/validation split
tf.keras.utils.set_random_seed(SEED)

num_users = len(user_ids)
num_items = len(item_ids)
model = build_recommendation_model(num_users, num_items)

# NOTE(review): y_train holds min-max-scaled purchase totals rather than 0/1
# labels, so the 'accuracy' metric reported during training is hard to
# interpret — TODO confirm whether the targets should be binarized first.
history = model.fit(
    x=[X_train_users, X_train_items],
    y=y_train,
    validation_data=([X_valid_users, X_valid_items], y_valid),
    epochs=10,
    batch_size=256,
    verbose=1
)


Epoch 1/10
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 8ms/step - accuracy: 0.6751 - loss: 0.0243 - val_accuracy: 0.6743 - val_loss: 0.0027
Epoch 2/10
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 7ms/step - accuracy: 0.6757 - loss: 0.0028 - val_accuracy: 0.6743 - val_loss: 0.0027
Epoch 3/10
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 8ms/step - accuracy: 0.6767 - loss: 0.0028 - val_accuracy: 0.6743 - val_loss: 0.0027
Epoch 4/10
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 7ms/step - accuracy: 0.6761 - loss: 0.0028 - val_accuracy: 0.6743 - val_loss: 0.0027
Epoch 5/10
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 8ms/step - accuracy: 0.6766 - loss: 0.0027 - val_accuracy: 0.6743 - val_loss: 0.0027
Epoch 6/10
[1m4089/4089[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 7ms/step - accuracy: 0.6764 - loss: 0.0027 - val_accuracy: 0.6743 - val_loss: 0.0027
Epoch 7/10

In [12]:
# Generate recommendations for a user
def recommend_for_user(user_id, model, item_ids, user_idx, item_idx, num_recommendations=5):
    """Score every candidate item for one user and return the best-scoring ones.

    Args:
        user_id: Raw user id; must be a key of ``user_idx``.
        model: Object exposing ``predict([user_positions, item_positions])``.
        item_ids: Raw item ids to score; each must be a key of ``item_idx``.
        user_idx: Mapping from raw user id to the index fed to the model.
        item_idx: Mapping from raw item id to the index fed to the model.
        num_recommendations: Number of rows kept from the top of the ranking.

    Returns:
        DataFrame with the columns ``item_id`` and ``predicted_score``, sorted
        by score in descending order and cut to ``num_recommendations`` rows.

    Raises:
        KeyError: If ``user_id`` or an entry of ``item_ids`` has no mapping.
    """
    candidate_count = len(item_ids)

    # The same user position is paired with every candidate item.
    user_positions = np.array([user_idx[user_id]] * candidate_count)
    item_positions = np.array([item_idx[candidate] for candidate in item_ids])

    scores = model.predict([user_positions, item_positions]).flatten()

    scored_items = pd.DataFrame({'item_id': item_ids, 'predicted_score': scores})
    ranked = scored_items.sort_values(by='predicted_score', ascending=False)
    return ranked.head(num_recommendations)

In [13]:
# Example: recommendations for user 10
user_id = 10
recommendations = recommend_for_user(
    user_id=user_id,
    model=model,
    item_ids=item_ids,
    user_idx=user_idx,
    item_idx=item_idx,
)
print(recommendations)


[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step  
       item_id  predicted_score
20602    20949         0.049245
3654      3732         0.006954
2749      2808         0.006743
17418    17717         0.006284
3653      3731         0.005556


In [14]:
from sklearn.metrics import roc_auc_score, accuracy_score

# Predictions on the validation set; flatten() collapses the model output
# into a 1-D vector of scores.
y_valid_pred = model.predict([X_valid_users, X_valid_items]).flatten()

[1m8178/8178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 883us/step


In [15]:
# Binarize y_valid in case its values are not in {0, 1}:
# any strictly positive target counts as the positive class.
y_valid_binary = np.greater(y_valid, 0).astype(int)

# Compute ROC-AUC of the predicted scores against the binarized targets
roc_auc = roc_auc_score(y_valid_binary, y_valid_pred)
print(f"ROC-AUC: {roc_auc:.4f}")

ROC-AUC: 0.9382


In [16]:
# Lookup table from item_id to item_name, built from the items table.
item_mapping = dict(zip(items['item_id'], items['item_name']))

# Look up an item's display name
def get_item_name(item_id, item_mapping):
    """Return the name stored for ``item_id``, or ``"Unknown Item"`` if absent.

    Args:
        item_id: Key to look up.
        item_mapping: Mapping from item id to item name.

    Returns:
        The mapped name, or the fallback string when the id has no entry.
    """
    fallback_name = "Unknown Item"
    return item_mapping.get(item_id, fallback_name)

# Example: show the item names next to the recommended item ids
recommendations['item_name'] = recommendations['item_id'].apply(
    get_item_name, args=(item_mapping,)
)
print(recommendations[['item_id', 'item_name', 'predicted_score']])

       item_id                                          item_name  \
20602    20949  Фирменный пакет майка 1С Интерес белый (34*42)...   
3654      3732         Grand Theft Auto V [PS3, русские субтитры]   
2749      2808             Diablo III [PC, Jewel, русская версия]   
17418    17717               Прием денежных средств для 1С-Онлайн   
3653      3731          Grand Theft Auto V [PC, русские субтитры]   

       predicted_score  
20602         0.049245  
3654          0.006954  
2749          0.006743  
17418         0.006284  
3653          0.005556  


In [17]:
# Persist the trained model to disk.
# NOTE(review): the .h5 suffix selects the HDF5 format, which recent Keras
# releases describe as legacy in favour of the native .keras format — confirm
# before renaming, since downstream loaders may expect this file name.
model.save('model_rec_user.h5')

