In [2]:
!pip install lightfm
!pip install pandas
!pip install kaggle
!kaggle datasets download -d rdoume/beerreviews
!unzip beerreviews.zip

Dataset URL: https://www.kaggle.com/datasets/rdoume/beerreviews
License(s): unknown
beerreviews.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  beerreviews.zip
replace beer_reviews.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: beer_reviews.csv        


In [3]:
from scipy.sparse import coo_matrix
from google.colab import files
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import pandas as pd
import time
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [4]:
data = pd.read_csv('beer_reviews.csv')

In [5]:
data = data.dropna(subset=["review_profilename", "beer_name", "beer_beerid"])
data = data[["review_profilename", "beer_beerid", "beer_name", "review_overall"]]

In [17]:
data['id'] = range(1, len(data) + 1)

In [7]:
unique_users = data['review_profilename'].unique()
num_users = len(unique_users)
user_features = pd.DataFrame({
    'review_profilename': unique_users,
    'age': np.random.randint(18, 65, size=num_users),
    'gender': np.random.choice(['М', 'Ж'], size=num_users),
    'location': np.random.choice(['Москва', 'Питер', 'Казань', 'Белгород'], size=num_users),
})

In [8]:
data = pd.merge(data, user_features, on='review_profilename')

In [9]:
le_beer = LabelEncoder()
data['beer_beerid_encoded'] = le_beer.fit_transform(data['beer_beerid'])

In [10]:
X = data[['age', 'gender', 'location']]
X = pd.get_dummies(X, columns=['gender', 'location'])
y = data['beer_beerid_encoded']

In [11]:
subset_size = 1000
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_small, y_train_small = shuffle(X_train, y_train, random_state=42)
X_train_subset = X_train_small[:subset_size]
y_train_subset = y_train_small[:subset_size]

In [12]:
scaler = StandardScaler()
X_train_subset = scaler.fit_transform(X_train_subset)

In [13]:
min_samples = 3
class_counts = Counter(y_train_subset)
valid_classes = {cls for cls, count in class_counts.items() if count >= min_samples}

X_train_filtered = []
y_train_filtered = []

for x, y in zip(X_train_subset, y_train_subset):
    if y in valid_classes:
        X_train_filtered.append(x)
        y_train_filtered.append(y)

X_train_filtered = np.array(X_train_filtered)
y_train_filtered = np.array(y_train_filtered)

print("Распределение классов после фильтрации:", Counter(y_train_filtered))

Распределение классов после фильтрации: Counter({2877: 5, 22886: 4, 874: 4, 9546: 4, 1166: 3, 1003: 3, 115: 3, 1819: 3, 692: 3, 92: 3, 934: 3, 1783: 3, 91: 3, 1621: 3, 24513: 3, 64: 3, 179: 3, 714: 3, 400: 3, 312: 3, 27365: 3, 44810: 3, 2898: 3, 666: 3, 1233: 3})


In [14]:
model = GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=42)

model.fit(X_train_filtered, y_train_filtered)

y_pred = model.predict(X_train_filtered)

# Считаем точность
accuracy = accuracy_score(y_train_filtered, y_pred)
print(f"Точность модели: {accuracy:.4f}")


Точность модели: 0.9125


In [30]:
print("Введите ваши данные:")
try:
    user_age = int(input("Ваш возраст: "))
    user_gender = input("Ваш пол (м/ж): ").strip().lower()
    user_location = input("Ваш город: ").strip()

    # Проверка пола
    if user_gender == 'м':
        user_gender = 'Male'
    elif user_gender == 'ж':
        user_gender = 'Female'
    else:
        print("Ошибка: Некорректный ввод пола. Используйте 'м' или 'ж'.")
        exit()

    # Проверка города
    if user_location not in ['USA', 'Canada', 'UK', 'Germany']:
        print("Ошибка: Город должен быть одним из ['USA', 'Canada', 'UK', 'Germany'].")
        exit()

    user_data = pd.DataFrame([{
        'age': user_age,
        'gender': user_gender,
        'location': user_location,
    }])
    user_data = pd.get_dummies(user_data, columns=['gender', 'location'])
    user_data = user_data.reindex(columns=X.columns, fill_value=0)

    predictions_proba = model.predict_proba(user_data)[0]

    top_5_indices = predictions_proba.argsort()[-5:][::-1]
    top_5_beers = le_beer.inverse_transform(top_5_indices)

    beer_names_dict = {id_: f"Пиво {id_}" for id_ in top_5_beers}

    print("Мы рекомендуем вам попробовать следующие 5 сортов пива:")
    for beer_id in top_5_beers:
        beer_name = data.loc[beer_id, 'beer_name'] if beer_id in data.index else "Неизвестное пиво"
        print(f"- {beer_name} (ID: {beer_id})")
except ValueError:
    print("Ошибка: Пожалуйста, введите числовое значение для возраста.")


Введите ваши данные:
Ваш возраст: 19
Ваш пол (м/ж): ж
Ваш город: USA
Мы рекомендуем вам попробовать следующие 5 сортов пива:
- Rauch Ür Bock (ID: 24)
- Rauch Ür Bock (ID: 23)
- Caldera OBF 15 (ID: 17)
- Caldera Ginger Beer (ID: 13)
- Cauldron DIPA (ID: 4)




In [32]:
from google.colab import files

files.download('model.joblib')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>