In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import random


In [23]:
# Synthetic survey data: every column, the label included, is drawn
# independently from the seeded stdlib RNG, so re-running the cell rebuilds
# exactly the same frame.
n_samples = 1000
random.seed(42)


def _draw_ints(low, high):
    """Return ``n_samples`` integers drawn uniformly from ``[low, high]``."""
    return [random.randint(low, high) for _ in range(n_samples)]


def _draw_choices(options):
    """Return ``n_samples`` values, each picked uniformly from ``options``."""
    return [random.choice(options) for _ in range(n_samples)]


# Dict values are evaluated top to bottom; that order fixes which RNG draws
# land in which column, so the entries must not be reordered.
data = {
    "salary": _draw_ints(30000, 150000),
    "city": _draw_choices(["Bishkek", "Osh", "Almaty", "Astana"]),
    "age": _draw_ints(18, 65),
    "vacation_prefer": _draw_choices(["Shopping", "Beach holiday", "Cultural tour"]),
    "transport_prefer": _draw_choices(["auto", "plane", "train"]),
    "target": _draw_choices(["London", "Moscow", "Dubai", "New York"]),
}

df = pd.DataFrame(data)

# Preview the first rows.
df.head()


Unnamed: 0,salary,city,age,vacation_prefer,transport_prefer,target
0,113810,Osh,63,Cultural tour,plane,London
1,44592,Bishkek,22,Beach holiday,auto,Dubai
2,33278,Osh,53,Shopping,plane,Dubai
3,127196,Bishkek,52,Cultural tour,auto,Dubai
4,66048,Almaty,36,Beach holiday,train,Dubai


In [24]:
# One-hot encode all categorical columns, the label included.
categorical_columns = ["city", "vacation_prefer", "transport_prefer", "target"]
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Indicator columns that get_dummies produced for the label.
target_columns = ["target_London", "target_Moscow", "target_Dubai", "target_New York"]

# Features: everything except the label indicators.
X = df_encoded.drop(columns=target_columns)

# Label: collapse the indicators back into one column holding the name of the
# active indicator per row, so class names carry the "target_" prefix.
y = df_encoded[target_columns].idxmax(axis=1)

print(X.head())
print(y.head())


   salary  age  city_Almaty  city_Astana  city_Bishkek  city_Osh  \
0  113810   63        False        False         False      True   
1   44592   22        False        False          True     False   
2   33278   53        False        False         False      True   
3  127196   52        False        False          True     False   
4   66048   36         True        False         False     False   

   vacation_prefer_Beach holiday  vacation_prefer_Cultural tour  \
0                          False                           True   
1                           True                          False   
2                          False                          False   
3                          False                           True   
4                           True                          False   

   vacation_prefer_Shopping  transport_prefer_auto  transport_prefer_plane  \
0                     False                  False                    True   
1                     False     

In [25]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of both feature matrices as a quick sanity check.
print(f"Размер обучающей выборки: {X_train.shape}, Размер тестовой выборки: {X_test.shape}")


Размер обучающей выборки: (800, 12), Размер тестовой выборки: (200, 12)


In [26]:
# Baseline: a random forest with library-default hyperparameters and a fixed seed.
clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validation on the training split only; the test split stays untouched.
cv_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)

mean_cv_score = np.mean(cv_scores)
print(f"Средняя правильность перекрестной проверки: {mean_cv_score:.2f}")


Средняя правильность перекрестной проверки: 0.24


In [27]:
# Hyperparameter grid: 3 x 3 x 3 = 27 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy;
# n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning setting and its mean cross-validated accuracy.
print(f"Лучшие параметры: {grid_search.best_params_}")
print(f"Лучшая правильность перекрестной проверки: {grid_search.best_score_:.2f}")


Лучшие параметры: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Лучшая правильность перекрестной проверки: 0.25


In [28]:
# Take the best estimator found by the grid search and score it on the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 plus the aggregate rows.
report = classification_report(y_test, y_pred)

print("Отчет классификации:")
print(report)


Отчет классификации:
                 precision    recall  f1-score   support

   target_Dubai       0.18      0.20      0.19        50
  target_London       0.25      0.23      0.24        47
  target_Moscow       0.27      0.32      0.29        50
target_New York       0.24      0.19      0.21        53

       accuracy                           0.23       200
      macro avg       0.24      0.24      0.23       200
   weighted avg       0.24      0.23      0.23       200



In [29]:
# Draw one test row (seeded, so always the same row) and show the tuned
# model's prediction for it.
random_sample = X_test.sample(n=1, random_state=42)
predicted_target = best_model.predict(random_sample)

print(f"Случайный пример: {random_sample}")
print(f"Предсказание для случайного примера: {predicted_target}")


Случайный пример:      salary  age  city_Almaty  city_Astana  city_Bishkek  city_Osh  \
436  107924   49        False        False         False      True   

     vacation_prefer_Beach holiday  vacation_prefer_Cultural tour  \
436                          False                          False   

     vacation_prefer_Shopping  transport_prefer_auto  transport_prefer_plane  \
436                      True                  False                    True   

     transport_prefer_train  
436                   False  
Предсказание для случайного примера: ['target_Dubai']
