In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# импортируем библиотеки для визуализации
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

# Загружаем специальный удобный инструмент для разделения датасета:
from sklearn.model_selection import train_test_split

In [None]:
# фиксируйте RANDOM_SEED, чтобы эксперименты были воспроизводимы.
RANDOM_SEED = 42

In [None]:
# зафиксируем версию пакетов, чтобы эксперименты были воспроизводимы:
!pip freeze > requirements.txt

In [None]:
# Load the competition data.
# Paths are built with os.path.join so that the trailing slash in DATA_DIR
# never produces a doubled separator.
DATA_DIR = '/kaggle/input/sf-booking/'
df_train = pd.read_csv(os.path.join(DATA_DIR, 'hotels_train.csv'))  # training set
df_test = pd.read_csv(os.path.join(DATA_DIR, 'hotels_test.csv'))  # set to predict on
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'submission.csv'))  # submission template

*Объединим датасеты*

In [None]:
# Train and test are stacked into one frame so that every feature is built
# the same way for both; the `sample` flag tells them apart afterwards.
df_train = df_train.assign(sample=1)  # 1 marks training rows
# The test set has no reviewer_score (it is what we predict), so it gets a 0 placeholder.
df_test = df_test.assign(sample=0, reviewer_score=0)  # 0 marks test rows

# Test rows come first, then train rows, with a fresh 0..n-1 index.
df = pd.concat([df_test, df_train], sort=False, ignore_index=True)

# 1. Удаление дубликатов, фильтрация, заполнение пропусков.

## **1.  Поиск и удаление дубликатов**

In [None]:
# Проверка столбцов:
df.duplicated().sum() 

In [None]:
# Remove duplicated rows from the TRAIN part only.
# Test rows must all be kept: one prediction is written per row of the
# submission template, so dropping a duplicated test row would make the
# prediction vector shorter than the template.
is_duplicated_train_row = df.duplicated() & (df['sample'] == 1)
df = df.loc[~is_duplicated_train_row].copy()

##  **2. Исправляем пропуски**

In [None]:
# Count missing values per column.
# NOTE(review): the printed header says "training dataset", but `df` here is
# the combined train + test frame.
print("Пропуски в обучающем датасете:")
print(df.isnull().sum())

**Пропуски есть только в двух признаках.**

**Попытаемся найти отели, в которых отсутствуют данные.**

In [None]:
# Keep the rows where at least one of the two coordinates is missing.
geo_missing_mask = df[['lat', 'lng']].isnull().any(axis=1)
hotels_without_geo = df.loc[geo_missing_mask]

# Show name and address of the hotels that lack a geolocation.
print(hotels_without_geo[['hotel_name', 'hotel_address']])

In [None]:
# Collapse to one row per hotel name (the first occurrence is kept).
hotels_without_geo = hotels_without_geo.drop_duplicates(subset='hotel_name')

In [None]:
display(hotels_without_geo)

Геоданные отсутствуют у 17 отелей:

In [None]:
def get_city_and_country(x):
    """Extract city and country from a whitespace-separated hotel address.

    The country is the last token, except for addresses ending in
    'Kingdom', which are reported as 'United Kingdom'. The city is the
    token right before the country; for 'Kingdom' addresses it is the
    fifth token from the end.

    Returns a tuple ``(city, country, 'city, country')``.
    Raises IndexError when the address has too few tokens.
    """
    tokens = x.split()
    ends_with_kingdom = tokens[-1] == 'Kingdom'
    city, country = (
        (tokens[-5], 'United Kingdom') if ends_with_kingdom else (tokens[-2], tokens[-1])
    )
    return city, country, f'{city}, {country}'

# Parse every address once and spread the resulting 3-tuples over three columns.
address_parts = df['hotel_address'].map(get_city_and_country)
df[['city', 'country', 'city_country']] = pd.DataFrame(address_parts.tolist(), index=df.index)

display('Unique cities: {}'.format(df['city'].unique()))
display('Unique countries: {}'.format(df['country'].unique()))

In [None]:
#импортируем библиотеки для geocoders координат
!pip install geopy

In [None]:
from geopy.geocoders import Nominatim
from geopy import distance

In [None]:
# Look up the coordinates of every distinct "city, country" pair.
from geopy.extra.rate_limiter import RateLimiter

coords = {}
geolocator = Nominatim(user_agent="DG data science project 3")
# Throttle the requests to the public Nominatim service (at most one per second).
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

for loc in df['city_country'].unique():
    location = geocode(loc)
    # The geocoder yields None when nothing is found; fail with a clear
    # message instead of an AttributeError on `.latitude`.
    if location is None:
        raise ValueError(f'Geocoder returned no result for {loc!r}')
    coords[loc] = (location.latitude, location.longitude)

display('City coordinates:')
display(coords)

In [None]:
# Where a hotel has no coordinate, fall back to the coordinate of its city.
city_point = df['city_country'].map(coords.__getitem__)
df['lat'] = df['lat'].fillna(city_point.str[0])
df['lng'] = df['lng'].fillna(city_point.str[1])

In [None]:
print(df.isnull().sum())

# **2. Создание новых признаков**

In [None]:
display(df)

**Разбор отзывов на слова и выделение топ встречающихся по позитивным и негативным отзывам**

In [None]:
from textblob import TextBlob
from collections import Counter

In [None]:
# Number of most frequent words kept per review polarity.
TOP_WORDS_PER_POLARITY = 200


def _count_words(word_lists):
    """Return a Counter of all words over an iterable of word lists."""
    counter = Counter()
    for words in word_lists:
        counter.update(words)
    return counter


def _count_in_vocab(words, vocab):
    """Return how many items of `words` belong to the set `vocab`."""
    return sum(1 for word in words if word in vocab)


df['positive_review'] = df['positive_review'].fillna('')
df['negative_review'] = df['negative_review'].fillna('')

# Whitespace-tokenised reviews (no further cleaning is applied here).
df['pos_words'] = df['positive_review'].map(str.split)
df['neg_words'] = df['negative_review'].map(str.split)

# The most frequent words of the negative and of the positive reviews.
neg_counter = _count_words(df['neg_words'])
neg_words = {word for word, _ in neg_counter.most_common(TOP_WORDS_PER_POLARITY)}

pos_counter = _count_words(df['pos_words'])
pos_words = {word for word, _ in pos_counter.most_common(TOP_WORDS_PER_POLARITY)}

# Words that are frequent in one polarity only (the overlap is removed).
true_neg = neg_words - pos_words
true_pos = pos_words - neg_words


def num_pos_words(col):
    """Count the words of the list `col` that are in `true_pos`."""
    return _count_in_vocab(col, true_pos)


df['pos_words_in_pos_rev'] = df['pos_words'].apply(num_pos_words)
df['pos_words_in_neg_rev'] = df['neg_words'].apply(num_pos_words)


def num_neg_words(col):
    """Count the words of the list `col` that are in `true_neg`."""
    return _count_in_vocab(col, true_neg)


df['neg_words_in_neg_rev'] = df['neg_words'].apply(num_neg_words)
df['neg_words_in_pos_rev'] = df['pos_words'].apply(num_neg_words)

In [None]:
# Ratio of polarity-specific word counts to the total word count of the review.
def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator with undefined results set to 0.

    A zero denominator yields NaN (0/0) or +/-inf (n/0); both are replaced by 0.
    """
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


df['pos_word_per_pos_rev'] = _safe_ratio(df['pos_words_in_pos_rev'], df['review_total_positive_word_counts'])
df['neg_word_per_pos_rev'] = _safe_ratio(df['neg_words_in_pos_rev'], df['review_total_positive_word_counts'])
df['pos_word_per_neg_rev'] = _safe_ratio(df['pos_words_in_neg_rev'], df['review_total_negative_word_counts'])
df['neg_word_per_neg_rev'] = _safe_ratio(df['neg_words_in_neg_rev'], df['review_total_negative_word_counts'])

In [None]:
# Number of tokens in each tokenised review.
df['clean_pos_len'] = df['pos_words'].map(len)
df['clean_neg_len'] = df['neg_words'].map(len)

**Анализ тональности с помощью TextBlob**

In [None]:
# TextBlob sentiment of the positive review text.
hotels_arr_pos = df['positive_review'].to_numpy()
# One sentiment result per review; polarity and subjectivity are read from it.
pos_sentiments = [TextBlob(text).sentiment for text in hotels_arr_pos]
polarity_arr = [sentiment.polarity for sentiment in pos_sentiments]
subjectivity_arr = [sentiment.subjectivity for sentiment in pos_sentiments]
df["pos_review_polarity"] = polarity_arr
df["pos_review_subjectivity"] = subjectivity_arr

In [None]:
# TextBlob sentiment of the negative review text.
hotels_arr_neg = df['negative_review'].to_numpy()
# One sentiment result per review; polarity and subjectivity are read from it.
neg_sentiments = [TextBlob(text).sentiment for text in hotels_arr_neg]
polarity_arr = [sentiment.polarity for sentiment in neg_sentiments]
subjectivity_arr = [sentiment.subjectivity for sentiment in neg_sentiments]
df["neg_review_polarity"] = polarity_arr
df["neg_review_subjectivity"] = subjectivity_arr

**Анализ тональности с помощью Vader**

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [None]:
analyz = SentimentIntensityAnalyzer()

# Raw VADER output: one dict with keys 'neg', 'neu', 'pos', 'compound' per review.
df['negative_review_analyze'] = df['negative_review'].apply(analyz.polarity_scores)
df['positive_review_analyze'] = df['positive_review'].apply(analyz.polarity_scores)

# VADER key -> suffix of the feature column that stores it.
VADER_COLUMN_SUFFIX = {
    'neg': 'neg_mood_score',
    'neu': 'neutral_mood_score',
    'pos': 'pos_mood_score',
    'compound': 'mood_score',
}

for prefix, analyze_col in (('neg_rev', 'negative_review_analyze'),
                            ('pos_rev', 'positive_review_analyze')):
    # Expand the dicts into a frame that shares df's index, so each score
    # lands on its own row even when the index is not 0..n-1.
    score_frame = pd.DataFrame(df[analyze_col].tolist(), index=df.index)
    for vader_key, suffix in VADER_COLUMN_SUFFIX.items():
        df[f'{prefix}_{suffix}'] = score_frame[vader_key]

**Извлечение признаков из даты отзыва**

In [None]:
# Calendar features derived from the review date.
df['review_date'] = pd.to_datetime(df['review_date'])
review_dt = df['review_date'].dt
df['year'] = review_dt.year
df['month'] = review_dt.month
df['dayofweek'] = review_dt.dayofweek
# dayofweek 5 and 6 count as weekend.
df['weekend'] = (df['dayofweek'] >= 5).astype('int64')

**Извлечение признаков из адреса отеля**

In [None]:
# City name
def city_from_adress(col):
    """Return the city token of a space-separated hotel address.

    The city is the second-to-last token, unless that token is 'United',
    in which case the fifth token from the end is returned.
    """
    parts = col.split(' ')
    return parts[-5] if parts[-2] == 'United' else parts[-2]
# City of every hotel, and the distinct cities seen.
df['hotel_city'] = df['hotel_address'].map(city_from_adress)
city_list = df['hotel_city'].unique()

In [None]:
pip install geocoder

In [None]:
import geocoder

In [None]:
# One row per geocoded city: the name (the part of the key before the comma)
# and its latitude / longitude.
city_coords = pd.DataFrame(
    [(key.split(',')[0].strip(), point[0], point[1]) for key, point in coords.items()],
    columns=['hotel_city', 'city_lat', 'city_lng'],
)

print(city_coords)

In [None]:
# Attach the city coordinates to every review row.
df = pd.merge(df, city_coords, how='left', on='hotel_city')
# Pack hotel and city coordinates into one 4-tuple: (lat, lng, city_lat, city_lng).
coord_columns = ['lat', 'lng', 'city_lat', 'city_lng']
df['coords'] = list(df[coord_columns].itertuples(index=False, name=None))
city_coords

In [None]:
import geopy.distance

In [None]:
# Distance to the city point, computed with geopy.distance.
# NOTE(review): this function rebinds the name `distance`, which an earlier
# cell imported with `from geopy import distance`.
def distance(col):
    """Return the geodesic distance in metres between two points.

    `col` is a sequence whose first two items are one (lat, lng) point and
    whose last two items are the other (lat, lng) point.
    """
    hotel_point, city_point = col[:2], col[-2:]
    return geopy.distance.geodesic(hotel_point, city_point).m
df['distance_from_center'] = df['coords'].map(distance)

In [None]:
# Flag reviews written by a resident of the hotel's country.
df['reviewer_nationality'] = df['reviewer_nationality'].map(str.strip)
is_home_review = df['country'] == df['reviewer_nationality']
df['home_review'] = is_home_review.astype(int)

**Название отеля**

In [None]:
# Keep the 50 most frequent hotel names; every other name becomes 'other'.
df_name_list = df['hotel_name'].value_counts(normalize=True).nlargest(50)
top_hotel_names = set(df_name_list.index)
df['hotel_name'] = df['hotel_name'].map(
    lambda name: name.strip() if name in top_hotel_names else 'other'
)

**Национальность автора**

In [None]:
# Keep the 10 most frequent reviewer nationalities; the rest become 'other'.
reviewer_nationality_list = df['reviewer_nationality'].value_counts(normalize=True).nlargest(10)
top_nationalities = set(reviewer_nationality_list.index)
df['reviewer_nationality'] = df['reviewer_nationality'].map(
    lambda nationality: nationality if nationality in top_nationalities else 'other'
)
reviewer_nationality_list*100

**Теги**

In [None]:
import re

In [None]:
# Everything except ASCII letters, digits and spaces is stripped from a tag.
# Compiled once instead of once per tag.
_NON_TAG_CHARS = re.compile('[^a-zA-Z0-9 ]')


def tags(col):
    """Turn the stringified tag list `col` into a list of clean tag strings.

    The string is split on commas; in every piece, characters other than
    ASCII letters, digits and spaces are removed and surrounding whitespace
    is trimmed. An empty input yields [''].
    """
    return [_NON_TAG_CHARS.sub('', tag).strip() for tag in col.split(',')]
# Parsed tag list for every review.
df['new_tags'] = df['tags'].map(tags)

In [None]:
# Length of stay
def night_number(col):
    """Return the number of nights from the first 'Stayed N ...' tag in `col`.

    Returns None when no tag starts with the word 'Stayed'.
    """
    split_tags = (tag.split(' ') for tag in col)
    return next((int(words[1]) for words in split_tags if words[0] == 'Stayed'), None)
# Nights per review; missing values take the median, and stays longer than
# 7 nights are collapsed into the single value 10.
nights = df['new_tags'].map(night_number)
nights = nights.fillna(nights.median())
df['night_number'] = nights.where(nights <= 7, 10)

In [None]:
# Trip-description tags: one 0/1 indicator column per tag.
conditions = ['Couple','Solo traveler','Business trip','Family with young children','Group','Family with older children','With a pet']

# The columns are created directly by the assignment below, in the order of
# `conditions`; no placeholder initialisation is needed.
for col in conditions:
    df[col] = df['new_tags'].apply(lambda tag_list, tag=col: 1 if tag in tag_list else 0)

In [None]:
# Room type
def room_type(col):
    """Return the first tag in `col` whose last word is 'Room', minus its last 4 characters.

    The result keeps the space that preceded 'Room' (e.g. 'Double Room'
    gives 'Double '). Returns None when no tag ends with the word 'Room'.
    """
    return next((tag[:-4] for tag in col if tag.rsplit(' ', 1)[-1] == 'Room'), None)
df['room_type'] = df['new_tags'].apply(room_type)

# Keep the 15 most frequent room types; everything else (including reviews
# without a room tag) becomes 'other'.
# The frequency table gets its own name so the `room_type` function is not
# overwritten and the line above can be re-run.
top_room_types = df['room_type'].value_counts(normalize=True).nlargest(15)
top_room_names = set(top_room_types.index)
df['room_type'] = df['room_type'].apply(lambda x: x.strip() if x in top_room_names else 'other')
top_room_types*100

In [None]:
# Number of tags attached to the review.
df['tags_length'] = df['new_tags'].map(len)

**Возраст отзыва**

In [None]:
# The leading number of 'days_since_review' (the text before the first space).
days_text = df['days_since_review'].str.split(' ').str[0]
df['num_days_since_review'] = days_text.astype('int64')

# 3. **Нормализация данных**

In [None]:
display(df.info())

In [None]:
# Names of the numeric feature columns: raw dataset counts and scores plus
# the sentiment, distance and word-statistics features built above.
num_cols = ['review_total_negative_word_counts',
            'review_total_positive_word_counts',
            'additional_number_of_scoring',
            'total_number_of_reviews_reviewer_has_given', 
            'total_number_of_reviews',
            'average_score',
            'pos_review_polarity',
            'pos_review_subjectivity',
            'neg_review_polarity',
            'neg_review_subjectivity',
            'distance_from_center',
            'num_days_since_review',
            'neg_rev_neutral_mood_score',
            'neg_rev_pos_mood_score',
            'neg_rev_mood_score',
            'pos_rev_neg_mood_score',
            'pos_rev_pos_mood_score',
            'pos_rev_mood_score',
            'pos_words_in_neg_rev',
            'neg_words_in_pos_rev',
            'pos_words_in_pos_rev',
            'neg_words_in_neg_rev',
            'pos_word_per_pos_rev',
            'neg_word_per_pos_rev',
            'pos_word_per_neg_rev',
            'neg_word_per_neg_rev',
            'clean_neg_len',
            'clean_pos_len'
            ]
# Histogram of every numeric feature.
df[num_cols].hist(figsize=(20, 16))

In [None]:
# результат
df[num_cols].hist(figsize=(20, 16))

# 4. **Кодирование признаков**

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Plain label encoding: each column gets its own freshly fitted encoder.
for column in ('reviewer_nationality', 'night_number', 'country', 'room_type', 'hotel_name'):
    df[column] = LabelEncoder().fit_transform(df[column])

In [None]:
df.describe()

In [None]:
# The raw date is no longer needed once year/month/dayofweek exist.
df = df.drop(columns='review_date')

In [None]:
df.describe()

In [None]:
df.info()

# **5. Очистка данных**

In [None]:
df['lng']

In [None]:
# Drop the raw hotel and city coordinate columns.
df = df.drop(columns=['lat', 'lng', 'city_lat', 'city_lng'])

In [None]:
# Drop every column whose dtype is still `object`.
hotels_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
df = df.drop(columns=hotels_columns)

In [None]:
df.info()

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all remaining columns.
fig, ax = plt.subplots(figsize=(30,20))
sns.heatmap(df.corr(), annot=True, linewidths=.5, ax=ax)

In [None]:
# Multicollinearity check: list the features that are strongly correlated
# with at least one other feature.
CORR_THRESHOLD = 0.72  # absolute correlation from which a pair counts as strong

# Correlations between features only (the train/test marker is excluded).
corr = df.corr().drop(index='sample', columns='sample')

# Keep only strong correlations; values equal to exactly 1 (the diagonal)
# are discarded as well.
strong = corr.where((corr.abs() >= CORR_THRESHOLD) & (corr != 1))

# A single pass is enough: dropping all-NaN columns cannot create new all-NaN rows.
pivot = strong.dropna(how='all').dropna(how='all', axis='columns')

# One check against the threshold gives the same set as scanning a ladder of
# bounds that ends at that threshold.
multi_corr_list = [
    col for col in pivot.columns
    if pivot[col].max() > CORR_THRESHOLD or pivot[col].min() < -CORR_THRESHOLD
]
print(set(multi_corr_list))

In [None]:
# Columns removed from the feature set before modelling.
droplist = ['additional_number_of_scoring',
            'dayofweek',
            'year',
            'pos_rev_neutral_mood_score',
            'neg_rev_neg_mood_score',
            'review_total_negative_word_counts',
            'review_total_positive_word_counts'
           ]
df = df.drop(droplist, axis=1)

In [None]:
# Remove the dropped columns from the numeric feature list.
# The list is rebuilt instead of calling remove() inside a loop over the same
# list: removing while iterating skips the element that follows each removal,
# which left a dropped column in num_cols.
# Slice assignment keeps the same list object.
num_cols[:] = [col for col in num_cols if col not in droplist]
num_cols

In [None]:
# Before training, mark the categorical features explicitly by casting them
# to the pandas `category` dtype.
cat_cols = ['country',
            'hotel_name',
            'month', 
            'night_number', 
            'reviewer_nationality', 
            'room_type', 
            'weekend',
            'home_review',
            'tags_length',
            'Couple',
            'Solo traveler',
            'Business trip',
            'Family with young children',
            'Group',
            'Family with older children',
            'With a pet'
            ]
df[cat_cols] = df[cat_cols].astype('category')

In [None]:
# Split the combined frame back into train (sample == 1) and test
# (sample == 0), dropping the marker column.
# NOTE(review): train_data and test_data are rebuilt from `df` in a later
# cell, so this copy looks redundant - confirm before removing.
hotels_selective = df.copy()
train_data = hotels_selective.query('sample == 1').drop(['sample'], axis=1)
test_data = hotels_selective.query('sample == 0').drop(['sample'], axis=1)

In [None]:
# Features and target of the training part.
# NOTE(review): the cast to int truncates any fractional reviewer_score;
# presumably unintended for a regression target - confirm. X and y are
# reassigned (without the cast) in a later cell before the model is fitted.
X = train_data.drop(['reviewer_score'], axis = 1)  
y = train_data['reviewer_score'] 
y=y.astype('int') 

# **6. Проверка важности признаков**

In [None]:
# Split the combined frame into the training part (sample == 1) and the part
# to predict on (sample == 0); the marker column is dropped from both.
train_data = df.query('sample == 1').drop(['sample'], axis=1)
test_data = df.query('sample == 0').drop(['sample'], axis=1)

y = train_data.reviewer_score.values            # target as a NumPy array
X = train_data.drop(['reviewer_score'], axis=1) # features: every column except the target

In [None]:
# Hold out 20% of the labelled data for validation (test_size=0.2);
# the split is made reproducible with RANDOM_SEED.
#!pip install scikit-learn
from sklearn.model_selection import train_test_split
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [None]:
# проверяем
test_data.shape, train_data.shape, X.shape, X_train.shape, X_test.shape

In [None]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import RidgeCV
from sklearn import linear_model
# Stacked ensemble: a random forest and a gradient-boosting regressor as base
# learners, combined by a RidgeCV meta-estimator.
# Both base learners are seeded from the notebook-wide RANDOM_SEED so that
# the seed is configured in one place.
final_layer_rfr = RandomForestRegressor(n_estimators=300, max_depth = 20, min_samples_leaf= 9,random_state=RANDOM_SEED)
final_layer_gbr = GradientBoostingRegressor(n_estimators=300, max_depth = 4, min_samples_leaf= 4, learning_rate = 0.39044982169012865, random_state=RANDOM_SEED)
final_layer = StackingRegressor(estimators=[('rf', final_layer_rfr), ('gbrt', final_layer_gbr)], final_estimator=RidgeCV())

In [None]:
# Fit the model on the training split.
final_layer.fit(X_train, y_train)

# Predict the reviewer score for the held-out validation split.
# The predictions are stored in y_pred.
y_pred = final_layer.predict(X_test)

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Both arguments are converted to NumPy arrays; the result is
    mean(|(y_true - y_pred) / y_true|) * 100.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

In [None]:
# Compare the predictions (y_pred) with the true values (y_test).
# MAE (Mean Absolute Error) is the average absolute deviation of the
# predictions from the true values; MAPE is the same deviation relative to
# the true value, in percent.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

In [None]:
y_pred.shape

In [None]:
# Plot the 30 most important features.
# StackingRegressor itself has no `feature_importances_`; the values are read
# from its fitted random-forest base estimator (registered under the name 'rf').
plt.rcParams['figure.figsize'] = (10,10)
fitted_forest = final_layer.named_estimators_['rf']
feat_importances = pd.Series(fitted_forest.feature_importances_, index=X.columns)
feat_importances.nlargest(30).plot(kind='barh')

In [None]:
# Importances from largest to smallest, shown in percent.
feat_importances = feat_importances.sort_values(ascending=False)
feat_importances*100

In [None]:
test_data.sample(10)

In [None]:
# Remove the placeholder target from the test features.
# errors='ignore' makes the cell safe to re-run after the column is gone.
test_data = test_data.drop(columns=['reviewer_score'], errors='ignore')

In [None]:
sample_submission

In [None]:
predict_submission = final_layer.predict(test_data)

In [None]:
predict_submission.shape

In [None]:
# Drop template rows that contain any missing value, then show the shape.
# NOTE(review): the next cell assigns one prediction per remaining row, so
# this shape must match predict_submission.shape.
sample_submission = sample_submission.dropna(how='any', axis=0)
sample_submission.shape

In [None]:
# Put the predictions into the template (row by row, in order) and write the
# submission file to the working directory, without the index column.
sample_submission['reviewer_score'] = predict_submission
sample_submission.to_csv('submission.csv', index=False)

In [None]:
sample_submission.head(10)