In [1]:
# Импорты
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
import optuna
import ast 
from itertools import chain
from lightfm.evaluation import precision_at_k, auc_score, recall_at_k
from lightfm.cross_validation import random_train_test_split
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import csr_matrix
import re

# Подготовка данных

In [2]:
df = pd.read_excel('data.xlsx').drop('Unnamed: 0', axis=1) # Пользователи
city_f = pd.read_excel('ready_cities.xlsx') # Города
allowed_cities = city_f['city'].unique().tolist()

# Проверка, что все города из списка
def filter_cities(city_string):
    if not isinstance(city_string, str):
        return ''
    cities = [city.strip() for city in city_string.split(', ')]
    filtered = [city for city in cities if city in allowed_cities]
    return ', '.join(filtered)


df['cities_5'] = df['cities_5'].apply(filter_cities)
df['cities_4'] = df['cities_4'].apply(filter_cities)
df['cities_3'] = df['cities_3'].apply(filter_cities)
df['cities_2'] = df['cities_2'].apply(filter_cities)
df['cities_1'] = df['cities_1'].apply(filter_cities)
df['izbrannoe'] = df['izbrannoe'].apply(filter_cities)
df['cities_prosmotr_more_2'] = df['cities_prosmotr_more_2'].apply(filter_cities)
df['cities_prosmotr_less_2'] = df['cities_prosmotr_less_2'].apply(filter_cities)

In [3]:
# Возраст преобразуем в категорию
def age_group(age_str):
    if age_str.split(' ')[0] == 'Более':
        return 'senior'
    age = int(age_str.split(' - ')[0])  
    if age == 18:
        return 'teen'
    elif age == 21:
        return 'young_adult'
    elif age == 31:
        return 'adult'

In [4]:
df['age_group'] = df['age_group'].map(age_group)
df['gender'] = df['gender'].astype(str)

In [5]:
df.head()

Unnamed: 0,gender,age_group,cities_5,cities_4,cities_3,cities_2,cities_1,izbrannoe,cities_prosmotr_more_2,cities_prosmotr_less_2,...,morskie_kruizy,plyazhnyj_otdykh,s_detmi,s_kompaniej_15_24,s_kompaniej_25_44,s_kompaniej_45_66,s_semej,v_odinochku,paroj,kuhnya
0,Женский,teen,"санкт-петербург, нижний новгород, екатеринбург...","казань, тюмень",,,,,,,...,False,False,True,True,False,False,False,False,False,
1,Мужской,teen,москва,,,,,,,,...,False,True,False,True,False,False,True,False,False,
2,Женский,teen,"санкт-петербург, суздаль, владимир, москва, ту...",,,,воронеж,,,,...,False,True,False,True,False,False,True,False,False,
3,Мужской,teen,"волгоград, краснодар, сочи",,,,санкт-петербург,,,,...,False,True,False,True,False,False,False,False,False,
4,Женский,teen,"санкт-петербург, нижний новгород",,,,"краснодар, рыбинск, углич, кострома",,,,...,False,False,True,True,False,False,False,False,False,


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 32 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 88 non-null     object 
 1   age_group                              88 non-null     object 
 2   cities_5                               88 non-null     object 
 3   cities_4                               88 non-null     object 
 4   cities_3                               88 non-null     object 
 5   cities_2                               88 non-null     object 
 6   cities_1                               88 non-null     object 
 7   izbrannoe                              88 non-null     object 
 8   cities_prosmotr_more_2                 88 non-null     object 
 9   cities_prosmotr_less_2                 88 non-null     object 
 10  poznavatelnyj_kulturno_razvlekatelnyj  88 non-null     bool   
 11  delovoy 

In [7]:
# Население и год основания в категорию
def population_group(pop):
    if pop is None:
        return ''
    if pop <= 10000:
        return 'very small'
    if pop <= 50000:
        return 'small'
    elif pop <= 100000:
        return 'medium'
    elif pop <= 500000:
        return 'high medium'
    elif pop <= 1000000:
        return 'high'
    else:
        return 'very high'
def foundation_year(year):
    if year is None:
        return ''
    if year <= 1500:
        return 'very old'
    if year <= 1700:
        return 'medium'
    elif year <= 1900:
        return 'high medium'
    else:
        return 'new'
city_f['population'] = city_f['population'].map(population_group)
city_f['foundation_year'] = city_f['foundation_year'].map(foundation_year)
city_f.head()

Unnamed: 0,city,federal_district,region,fias_level,capital_marker,population,foundation_year,features
0,адыгейск,Южный,Адыгея,4,0,small,new,"семейный отдых, курортный, экологический"
1,майкоп,Южный,Адыгея,4,2,high medium,high medium,"семейный отдых, экологический, исторический"
2,горно-алтайск,Сибирский,Алтай,4,2,medium,high medium,"семейный отдых, исторический, экзотический"
3,алейск,Сибирский,Алтайский,4,0,small,new,"промышленный, семейный отдых, экологический"
4,барнаул,Сибирский,Алтайский,4,2,high,high medium,"промышленный, экзотический, исторический"


In [8]:
city_f.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   city              1095 non-null   object
 1   federal_district  1095 non-null   object
 2   region            1095 non-null   object
 3   fias_level        1095 non-null   int64 
 4   capital_marker    1095 non-null   int64 
 5   population        1095 non-null   object
 6   foundation_year   1095 non-null   object
 7   features          1095 non-null   object
dtypes: int64(2), object(6)
memory usage: 68.6+ KB


# Подготовка модели

В модель нужно передать: города, признаки городов, пользователи, признаки пользователей

In [9]:
dataset = Dataset()
all_users = df.index.astype(str).tolist() # Пользователи
all_items = [i for i in city_f['city'].unique().tolist() if i != ''] # Города
all_user_features = df.columns[4:-1].tolist() 
all_user_features += ['gender'] # Признаки пользователей

In [10]:
all_city_features = [
    'исторический', 'курортный', 'лыжный', 'промышленный', 'религиозный', 
    'студенческий', 'туристический', 'торговый', 'военный', 'сельскохозяйственный', 
    'медицинский', 'IT и технологический', 'экологический', 'семейный отдых', 
    'экзотический', 'спортивный', 'большой', 'средний', 'маленький'
]
all_city_features.extend(city_f['federal_district'].unique().tolist())
all_city_features.extend(city_f['region'].unique().tolist()) 
all_city_features.extend([f'fias_level: {i}' for i in city_f['fias_level'].unique().tolist()])
all_city_features.extend([f'capital_marker: {i}' for i in city_f['capital_marker'].unique().tolist()])
all_city_features.extend([f'population: {i}' for i in city_f['population'].unique().tolist()])
all_city_features.extend([f'foundation_year: {i}' for i in city_f['foundation_year'].unique().tolist()]) # Признаки городов

In [16]:
dataset.fit( # Создание датасета
    users=all_users,
    items=all_items,
    user_features=all_user_features,
    item_features=all_city_features
)

In [22]:
def get_user_features(user_id): # Добавляем учитывания признака пользователей
    row = df.loc[user_id]
    return [feat for feat in all_user_features if row[feat] == True]

user_features = dataset.build_user_features(
    ((str(user_id), get_user_features(user_id)) for user_id in df.index)
)

In [25]:
(interactions, weights) = dataset.build_interactions( # Добавляем взаимодействия
    chain(((str(user_id), city.strip(), 1) for user_id, cities in df['cities_5'].items() for city in cities.split(',') if city != ''),
    ((str(user_id), city.strip(), 0.8) for user_id, cities in df['cities_4'].items() for city in cities.split(',') if city != ''),
    ((str(user_id), city.strip(), 0.6) for user_id, cities in df['cities_3'].items() for city in cities.split(',') if city != ''),     
    ((str(user_id), city.strip(), 0.4) for user_id, cities in df['cities_2'].items() for city in cities.split(',') if city != ''),     
    ((str(user_id), city.strip(), 0.2) for user_id, cities in df['cities_1'].items() for city in cities.split(',') if city != ''),
    ((str(user_id), city.strip(), 0.9) for user_id, cities in df['izbrannoe'].items() for city in cities.split(',') if city != ''),
    ((str(user_id), city.strip(), 0.8) for user_id, cities in df['cities_prosmotr_more_2'].items() for city in cities.split(',') if city != ''),
    ((str(user_id), city.strip(), 0.4) for user_id, cities in df['cities_prosmotr_less_2'].items() for city in cities.split(',') if city != '')
    )
)

In [26]:
city_feature_tuples = [] # Учитываем признаки городов
for _, row in city_f.iterrows():
    features = [
        row['federal_district'],
        row['region'],
        f"fias_level: {row['fias_level']}",
        f"capital_marker: {row['capital_marker']}",
        f"population: {row['population']}",
        f"foundation_year: {row['foundation_year']}"
    ]
    city_features = [f.strip() for f in str(row['features']).split(",") if f != 'unknown']
    features.extend(city_features)

    city_feature_tuples.append((row['city'], features))

In [27]:
item_features = dataset.build_item_features(city_feature_tuples)

In [30]:
TEST_SIZE = 0.2  
RANDOM_STATE = 42

In [31]:
train_interactions, test_interactions = random_train_test_split(
    interactions,
    test_percentage=TEST_SIZE,
    random_state=RANDOM_STATE
)

In [32]:
import optuna # Перебираем гиперпараметры
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

def objective(trial):
    # Поиск по гиперпараметрам
    no_components = trial.suggest_categorical("no_components", [10, 20, 30, 50, 70, 100])
    loss = trial.suggest_categorical("loss", ["warp", "bpr", "logistic"])
    learning_schedule = trial.suggest_categorical("learning_schedule", ["adagrad", "adadelta"])
    learning_rate = trial.suggest_float("learning_rate", 0.005, 0.1, log=True)
    user_alpha = trial.suggest_float("user_alpha", 1e-6, 1e-4, log=True)
    item_alpha = trial.suggest_float("item_alpha", 1e-6, 1e-4, log=True)
    epochs = trial.suggest_int("epochs", 10, 30)

    # Модель
    model = LightFM(
        no_components=no_components,
        loss=loss,
        learning_schedule=learning_schedule,
        learning_rate=learning_rate,
        user_alpha=user_alpha,
        item_alpha=item_alpha,
        random_state=RANDOM_STATE
    )

    # Обучение
    model.fit(
        train_interactions,
        user_features=user_features,
        item_features=item_features,
        epochs=epochs,
        num_threads=4,
        verbose=False
    )

    # Метрика — Precision@5
    precision = precision_at_k(
        model,
        test_interactions,
        user_features=user_features,
        item_features=item_features,
        k=5
    ).mean()

    return precision

# Запуск Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)  # можно увеличить до 100–200 для лучших результатов

# Результаты
print("\nЛучшие параметры:")
for key, value in study.best_params.items():
    print(f"{key}: {value}")

print(f"\nЛучший precision@5: {study.best_value:.4f}")

[I 2025-06-22 12:32:12,590] A new study created in memory with name: no-name-cd7371f3-016a-4985-adf7-86870b55f386
[I 2025-06-22 12:32:15,282] Trial 0 finished with value: 0.05714286118745804 and parameters: {'no_components': 30, 'loss': 'bpr', 'learning_schedule': 'adadelta', 'learning_rate': 0.02179133908866127, 'user_alpha': 1.0300101582875176e-05, 'item_alpha': 9.831986300655612e-05, 'epochs': 23}. Best is trial 0 with value: 0.05714286118745804.
[I 2025-06-22 12:32:17,376] Trial 1 finished with value: 0.009523809887468815 and parameters: {'no_components': 30, 'loss': 'logistic', 'learning_schedule': 'adadelta', 'learning_rate': 0.012131546359319154, 'user_alpha': 1.0804308042338813e-06, 'item_alpha': 1.318993610000054e-06, 'epochs': 21}. Best is trial 0 with value: 0.05714286118745804.
[I 2025-06-22 12:32:19,677] Trial 2 finished with value: 0.10952381044626236 and parameters: {'no_components': 70, 'loss': 'warp', 'learning_schedule': 'adadelta', 'learning_rate': 0.0148745477322139


Лучшие параметры:
no_components: 70
loss: warp
learning_schedule: adagrad
learning_rate: 0.07532001740060823
user_alpha: 1.696934838853893e-05
item_alpha: 7.26743364143695e-05
epochs: 29

Лучший precision@5: 0.1571


# Обучение

In [39]:
# Создаём лучшую модель по результатам Optuna
best_model = LightFM(
    no_components=study.best_params['no_components'],
    loss=study.best_params['loss'],
    learning_schedule=study.best_params['learning_schedule'],
    learning_rate=study.best_params['learning_rate'],
    user_alpha=study.best_params['user_alpha'],
    item_alpha=study.best_params['item_alpha'],
    random_state=RANDOM_STATE
)

# Обучение
best_model.fit(
    interactions,
    user_features=user_features,
    item_features=item_features,
    epochs=study.best_params['epochs'],
    num_threads=4,
    verbose=True
)

Epoch: 100%|██████████| 29/29 [00:03<00:00,  8.32it/s]


<lightfm.lightfm.LightFM at 0x70be4eb84f90>

In [40]:
def recommend(user_id, model, n=5): # Получаем ТОП-5 рекомендаций
    user_index = list(df.index).index(user_id)
    known_cities = set(df.loc[user_id, 'cities_prosmotr_more_2'].split(', ') + df.loc[user_id, 'cities_prosmotr_less_2'].split(', ') + df.loc[user_id, 'izbrannoe'].split(', ') + df.loc[user_id, 'cities_5'].split(', ') + df.loc[user_id, 'cities_4'].split(', ') + df.loc[user_id, 'cities_3'].split(', ') + df.loc[user_id, 'cities_2'].split(', ') + df.loc[user_id, 'cities_1'].split(', '))
    city_list = list(dataset.mapping()[2].keys())
    
    scores = model.predict(user_ids=user_index,
                           item_ids=np.arange(len(city_list)),
                           user_features=user_features,
                           item_features=item_features)
    
    city_scores = list(zip(city_list, scores))
    city_scores = sorted(city_scores, key=lambda x: -x[1])
    recommendations = [city for city, score in city_scores if city not in known_cities][:n]
    
    return recommendations

In [41]:
df['recommendations'] = df.index.map(lambda user_id: recommend(user_id, best_model))
recommendations_for_all = {user_id: recommend(user_id, best_model) for user_id in df.index}

# Результат

In [42]:
df.head()

Unnamed: 0,gender,age_group,cities_5,cities_4,cities_3,cities_2,cities_1,izbrannoe,cities_prosmotr_more_2,cities_prosmotr_less_2,...,plyazhnyj_otdykh,s_detmi,s_kompaniej_15_24,s_kompaniej_25_44,s_kompaniej_45_66,s_semej,v_odinochku,paroj,kuhnya,recommendations
0,Женский,teen,"санкт-петербург, нижний новгород, екатеринбург...","казань, тюмень",,,,,,,...,False,True,True,False,False,False,False,False,,"[сочи, владимир, анапа, волгоград, челябинск]"
1,Мужской,teen,москва,,,,,,,,...,True,False,True,False,False,True,False,False,,"[санкт-петербург, нижний новгород, казань, соч..."
2,Женский,teen,"санкт-петербург, суздаль, владимир, москва, ту...",,,,воронеж,,,,...,True,False,True,False,False,True,False,False,,"[нижний новгород, казань, сочи, анапа, екатери..."
3,Мужской,teen,"волгоград, краснодар, сочи",,,,санкт-петербург,,,,...,True,False,True,False,False,False,False,False,,"[москва, нижний новгород, казань, анапа, екате..."
4,Женский,teen,"санкт-петербург, нижний новгород",,,,"краснодар, рыбинск, углич, кострома",,,,...,False,True,True,False,False,False,False,False,,"[москва, казань, сочи, анапа, владимир]"
