In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# импортируем библиотеки для визуализации
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

# Загружаем специальный удобный инструмент для разделения датасета:
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Always fix RANDOM_SEED so that your experiments are reproducible!
# It is passed as random_state to train_test_split and RandomForestRegressor later in this notebook.
RANDOM_SEED = 42

In [None]:
# Record the installed package versions so the experiments are reproducible:
# the shell command writes the output of `pip freeze` to requirements.txt.
!pip freeze > requirements.txt

In [None]:
# Load the competition data.
# Paths are built with os.path.join so the separator is never doubled or
# dropped, regardless of whether DATA_DIR ends with a slash.

DATA_DIR = '/kaggle/input/sf-booking/'
df_train = pd.read_csv(os.path.join(DATA_DIR, 'hotels_train.csv'))  # training dataset
df_test = pd.read_csv(os.path.join(DATA_DIR, 'hotels_test.csv'))  # dataset to predict on
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'submission.csv'))  # submission template

In [None]:
# Summary (columns, dtypes, non-null counts) of the training set.
df_train.info()

In [None]:
# First two rows of the training set.
df_train.head(2)

In [None]:
# Summary of the test set.
df_test.info()

In [None]:
# First two rows of the test set.
df_test.head(2)

In [None]:
# First two rows of the submission template.
sample_submission.head(2)

In [None]:
# Summary of the submission template.
sample_submission.info()

In [None]:
# Flag training rows that fully repeat another row and report how many there are.
df_duplicate = df_train.duplicated()
duplicate_count = df_train.loc[df_duplicate].shape[0]
print(f'{duplicate_count} duplicates')

In [None]:
# Remove fully duplicated rows from the training set and show the summary of what remains.
df_train=df_train.drop_duplicates()
df_train.info()

In [None]:
# IMPORTANT: to process the features consistently, combine train and test into one dataset.
df_train['sample'] = 1  # marks the training rows
df_test['sample'] = 0  # marks the test rows
# The test set has no reviewer_score (it is what we have to predict),
# so fill it with zeros as a placeholder for now.
df_test['reviewer_score'] = 0

# DataFrame.append was removed in pandas 2.0; pd.concat produces the same frame
# (test rows first, then train rows) and the index is rebuilt from scratch.
data = pd.concat([df_test, df_train], sort=False).reset_index(drop=True)

### Очистка от пропущенных значений

In [None]:
# Summary of the combined train + test frame (non-null counts reveal the missing values).
data.info()

In [None]:
# Print every column that contains missing values together with the number of gaps.
na_counts = data.isna().sum()
for col, num in na_counts[na_counts > 0].items():
    print(f'{col}: {num} NA values')

In [None]:
# List the hotels (name + address) whose rows have no latitude.
# count() over the missing coordinates yields 0, which is turned back into NaN for display.
rows_without_lat = data[data['lat'].isna()]
grouped_by_hotel = rows_without_lat.groupby(['hotel_name', 'hotel_address'], as_index=False)
nan_data = grouped_by_hotel[['lat', 'lng']].count()
nan_data[['lat', 'lng']] = nan_data[['lat', 'lng']].replace(0, np.nan)
nan_data

In [None]:
# from opencage source
# Manually collected coordinates keyed by hotel_name; they are used later to fill
# the missing `lat` / `lng` values via Series.map on the hotel name.
# NOTE(review): several hotels share identical coordinates, which suggests
# city/district-level rather than building-level precision — confirm.

# Latitude per hotel name.
geocode_lat={'Austria Trend Hotel Schloss Wilhelminenberg Wien': 48.2167,
 'City Hotel Deutschmeister': 48.2333,
 'Cordial Theaterhotel Wien': 48.2167,
 'Derag Livinghotel Kaiser Franz Joseph Vienna': 48.2068,
 'Fleming s Selection Hotel Wien City': 48.2167,
 'Holiday Inn Paris Montmartre': 48.884949,
 'Hotel Advance': 41.3888,
 'Hotel Atlanta': 48.2333,
 'Hotel City Central': 48.2167,
 'Hotel Daniel Vienna': 48.1981,
 'Hotel Park Villa': 48.2068,
 'Hotel Pension Baron am Schottentor': 48.2333,
 'Maison Albar Hotel Paris Op ra Diamond': 48.875257,
 'Mercure Paris Gare Montparnasse': 48.824296,
 'NH Collection Barcelona Podium': 41.3888,
 'Renaissance Barcelona Hotel': 41.3888,
 'Roomz Vienna': 48.2068}


# Longitude per hotel name (same keys as geocode_lat).
geocode_lng={'Austria Trend Hotel Schloss Wilhelminenberg Wien': 16.3,
 'City Hotel Deutschmeister': 16.35,
 'Cordial Theaterhotel Wien': 16.35,
 'Derag Livinghotel Kaiser Franz Joseph Vienna': 16.2646,
 'Fleming s Selection Hotel Wien City': 16.35,
 'Holiday Inn Paris Montmartre': 2.353604,
 'Hotel Advance': 2.159,
 'Hotel Atlanta': 16.35,
 'Hotel City Central': 16.4,
 'Hotel Daniel Vienna': 16.3948,
 'Hotel Park Villa': 16.2646,
 'Hotel Pension Baron am Schottentor': 16.35,
 'Maison Albar Hotel Paris Op ra Diamond': 2.323375,
 'Mercure Paris Gare Montparnasse': 2.305834,
 'NH Collection Barcelona Podium': 2.159,
 'Renaissance Barcelona Hotel': 2.159,
 'Roomz Vienna': 16.2646}

In [None]:
# Fill the missing coordinates from the hand-made lookups, matching on hotel name.
for coord_col, lookup in (('lat', geocode_lat), ('lng', geocode_lng)):
    data[coord_col] = data[coord_col].fillna(data['hotel_name'].map(lookup))
data.info()

In [None]:
# Parse review_date (month/day/year) and derive the month number and weekday number.
parsed_dates = pd.to_datetime(data['review_date'], format='%m/%d/%Y')
data['review_date'] = parsed_dates
data['review_month'] = parsed_dates.dt.month
data['review_weekday'] = parsed_dates.dt.weekday

def get_season(month):
    """Return the season name for a month number.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Spring'; 6-8 -> 'Summer';
    any other value -> 'Autumn'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    # Everything not matched above falls through to autumn.
    return 'Autumn'

# Derive the season from the month number, then discard the raw date column.
data['season'] = data['review_month'].map(get_season)
data.drop(columns=['review_date'], inplace=True)

In [None]:
# Replace the numeric month / weekday with their English names
# so that one-hot encoding produces readable column names.
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Weekday numbers start at 0 for Monday.
weekday_dict = dict(enumerate(
    ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
))

# Month numbers are 1-based, the list is 0-based.
data['review_month'] = [month_names[month - 1] for month in data['review_month']]
data['review_weekday'] = [weekday_dict[day] for day in data['review_weekday']]

In [None]:
# One-hot encode the month, weekday and season columns; drop_first=True omits
# the first level of each feature so the dummy columns are not perfectly collinear.
hotels_encoded = pd.get_dummies(data, columns=['review_month', 'review_weekday', 'season'], drop_first=True)

In [None]:
# Preview the frame after one-hot encoding.
hotels_encoded.head(3)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the 15 most frequent reviewer nationalities (figure is 12 wide, 6 high).
fig, ax = plt.subplots(figsize=(12, 6))
top_nationalities = hotels_encoded['reviewer_nationality'].value_counts().head(15)
top_nationalities.plot(kind='bar', ax=ax)
plt.show()


In [None]:

import category_encoders as ce



# Initialize a binary encoder (category_encoders) for the reviewer_nationality column
encoder = ce.BinaryEncoder(cols=['reviewer_nationality'])

# Fit the encoder on the combined frame and transform it in one step
df_encoded = encoder.fit_transform(hotels_encoded)

# Display the first rows of the encoded DataFrame
df_encoded.head(3)

In [None]:
# Keep only the first whitespace-separated token of days_since_review, converted to int.
leading_numbers = [int(text.split()[0]) for text in df_encoded['days_since_review']]
df_encoded['days_since_review'] = leading_numbers
df_encoded.head(3)

In [None]:
from textblob import TextBlob

In [None]:
from textblob import TextBlob

# Sentiment features for the negative and positive review texts.
# Each review is analysed by TextBlob only once and both polarity and
# subjectivity are taken from that single result; the previous version
# built a TextBlob twice per review, doubling the run time of this cell.
for review_kind in ('negative', 'positive'):
    sentiments = [TextBlob(text).sentiment for text in df_encoded[f'{review_kind}_review']]
    df_encoded[f'{review_kind}_polarity'] = [s.polarity for s in sentiments]
    df_encoded[f'{review_kind}_subjectivity'] = [s.subjectivity for s in sentiments]


In [None]:
# Preview the frame with the new sentiment columns.
df_encoded.head(3)

In [None]:
# Summary of the encoded frame (columns, dtypes, non-null counts).
df_encoded.info()

In [None]:
# Remove the raw text columns, then show the summary of the remaining columns.
raw_text_columns = ['hotel_address', 'hotel_name', 'tags', 'negative_review', 'positive_review']
df_encoded.drop(columns=raw_text_columns, inplace=True)
df_encoded.info()

### Отбор признаков

In [None]:
# Continuous features, analysed with Pearson correlation.
# 'lat' is listed next to 'lng': latitude is a continuous coordinate and was
# previously missing here, so it ended up among the categorical columns.
# NOTE(review): any other continuous column not listed here is still treated
# as categorical by cat_cols — verify against df_encoded.columns.
num_cols = [
    'total_number_of_reviews',
    'review_total_negative_word_counts',
    'review_total_positive_word_counts',
    'total_number_of_reviews_reviewer_has_given',
    'lat',
    'lng',
    'additional_number_of_scoring',
    'days_since_review',
    'negative_polarity',
    'negative_subjectivity',
    'positive_polarity',
    'positive_subjectivity',
]

# Everything else except the target and the train/test flag is treated as categorical.
cat_cols = df_encoded.drop(num_cols + ['reviewer_score', 'sample'], axis=1).columns

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pearson correlation between the numerical features; only strong pairs are shown.
df_num = df_encoded[num_cols]
correlation_matrix = df_num.corr(method='pearson')
threshold = 0.7
# Entries below the threshold become NaN and are masked out of the heatmap.
filtered_corr_matrix = correlation_matrix.where(correlation_matrix.abs() >= threshold)

# Plot the heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    filtered_corr_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    mask=filtered_corr_matrix.isnull(),
    ax=ax,
)
ax.set_title("Correlation Matrix (|correlation| >= 0.7)")
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Spearman correlation between the categorical (encoded) features; only strong pairs are shown.
df_cat = df_encoded[cat_cols]
correlation_matrix = df_cat.corr(method='spearman')
threshold = 0.7
# Entries below the threshold become NaN and are masked out of the heatmap.
filtered_corr_matrix = correlation_matrix.where(correlation_matrix.abs() >= threshold)

# Plot the heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    filtered_corr_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    mask=filtered_corr_matrix.isnull(),
    ax=ax,
)
ax.set_title("Correlation Matrix (|correlation| >= 0.7)")
plt.show()

In [None]:
# Split the combined frame back into its train and test parts using the `sample` flag.
is_train_row = df_encoded['sample'] == 1
is_test_row = df_encoded['sample'] == 0
train_data = df_encoded[is_train_row].drop(columns='sample')
test_data = df_encoded[is_test_row].drop(columns='sample')

X = train_data.drop(columns='reviewer_score')  # feature matrix
y = train_data['reviewer_score'].values  # our target

In [None]:
# Use train_test_split to divide the labelled data into a training and a validation part;
# 20% of the rows are held out for validation (test_size parameter).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
# Share of missing values per feature in the training split, in whole percent.
display('- Данные о пропусках в %')
null_counts = X_train.isnull().sum()
missing_procent = (null_counts / len(X_train) * 100).round()
display(missing_procent)

In [None]:
# Sanity check: shapes of the test frame, the train frame, the feature matrix and both splits.
test_data.shape, train_data.shape, X.shape, X_train.shape, X_test.shape

In [None]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [None]:
# Create the model (DO NOT CHANGE THE SETTINGS).
# random_state is fixed to RANDOM_SEED so that repeated runs build the same forest.
model = RandomForestRegressor(n_estimators=100, verbose=1, n_jobs=-1, random_state=RANDOM_SEED)

In [None]:
# Train the model on the training split (X_train, y_train).
model.fit(X_train, y_train)

# Use the trained model to predict reviewer scores for the held-out validation split.
# The predicted values are stored in y_pred.
y_pred = model.predict(X_test)

In [None]:
# Compare the predicted values (y_pred) with the actual ones (y_test) to see how much they differ on average.
# The metric is Mean Absolute Error (MAE): the average absolute deviation of the predictions from the actual values.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

In [None]:
def MAPE(y_true, y_pred):
    """Mean absolute percentage error, in percent, with a floored denominator.

    Each absolute error is divided by max(1, |y_true|), so true values whose
    magnitude is below 1 (including 0) do not blow up the ratio.

    Args:
        y_true: array-like of actual values (any shape).
        y_pred: array-like of predicted values, broadcastable against y_true.

    Returns:
        mean(|y_true - y_pred| / max(1, |y_true|)) * 100.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    # A scalar floor broadcasts over any input shape; np.ones(len(y_true))
    # only lined up with 1-D input.
    denominator = np.maximum(1.0, np.abs(y_true))
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

# Report MAPE on the validation split.
print('MAPE:', MAPE(y_test, y_pred))

In [None]:
# RandomForestRegressor exposes per-feature importances; plot the 15 largest ones.

plt.rcParams['figure.figsize'] = (10, 10)
feat_importances = pd.Series(data=model.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(15)
top_features.plot.barh()

In [None]:
# Show 10 random test rows; the sample is seeded with RANDOM_SEED so the
# preview is the same on every run, like the rest of the notebook.
test_data.sample(10, random_state=RANDOM_SEED)

In [None]:
# The zero-filled placeholder target must not be fed to the model as a feature.
test_data = test_data.drop(columns=['reviewer_score'])

In [None]:
# Show the submission template.
sample_submission

In [None]:
# Predict reviewer_score for the test rows with the trained model.
predict_submission = model.predict(test_data)

In [None]:
# Inspect the raw predictions.
predict_submission

In [None]:
# Column names of the submission template.
list(sample_submission)

In [None]:
# Put the predictions into the submission template and save it as submission.csv (without the index).
# NOTE(review): assumes the rows of sample_submission are in the same order as test_data — confirm.
sample_submission['reviewer_score'] = predict_submission
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(10)