# <center> Project-3: Обучение модели для предсказания рейтинга отеля по данным сайта Booking

In [81]:
import pandas as pd
import math
import category_encoders as ce # импортируем библиотеку для работы с кодировщиками
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import preprocessing # для нормализации, стандартизации
import seaborn as sns

hotels = pd.read_csv('../data/hotels.csv')
hotels.head()

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,4,7,10.0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",531 day,51.507894,-0.143671
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,2,14,6.3,"[' Business trip ', ' Couple ', ' Standard Dou...",203 day,51.521009,-0.123097
2,151 bis Rue de Rennes 6th arr 75006 Paris France,32,10/18/2016,8.9,Legend Saint Germain by Elegancia,China,No kettle in room,6,406,No Positive,0,14,7.5,"[' Leisure trip ', ' Solo traveler ', ' Modern...",289 day,48.845377,2.325643
3,216 Avenue Jean Jaures 19th arr 75019 Paris Fr...,34,9/22/2015,7.5,Mercure Paris 19 Philharmonie La Villette,United Kingdom,No Negative,0,607,Friendly staff quiet comfortable room spotles...,11,8,10.0,"[' Leisure trip ', ' Solo traveler ', ' Standa...",681 day,48.888697,2.39454
4,Molenwerf 1 1014 AG Amsterdam Netherlands,914,3/5/2016,8.5,Golden Tulip Amsterdam West,Poland,Torn sheets,4,7586,The staff was very friendly and helpful Break...,20,10,9.6,"[' Business trip ', ' Couple ', ' Standard Dou...",516 day,52.385601,4.84706


* hotel_address — адрес отеля;
* review_date — дата, когда рецензент разместил соответствующий отзыв;
* average_score — средний балл отеля, рассчитанный на основе последнего комментария за последний год;
* hotel_name — название отеля;
* reviewer_nationality — страна рецензента;
* negative_review — отрицательный отзыв, который рецензент дал отелю;
* review_total_negative_word_counts — общее количество слов в отрицательном отзыв;
* positive_review — положительный отзыв, который рецензент дал отелю;
* review_total_positive_word_counts — общее количество слов в положительном отзыве.
* reviewer_score — оценка, которую рецензент поставил отелю на основе своего опыта;
* total_number_of_reviews_reviewer_has_given — количество отзывов, которые рецензенты дали в прошлом;
* total_number_of_reviews — общее количество действительных отзывов об отеле;
* tags — теги, которые рецензент дал отелю;
* days_since_review — количество дней между датой проверки и датой очистки;
* additional_number_of_scoring — есть также некоторые гости, которые просто поставили оценку сервису, но не оставили отзыв. Это число указывает, сколько * там действительных оценок без проверки.
* lat — географическая широта отеля;
* lng — географическая долгота отеля.

## 1. Исследование структуры данных

In [82]:
hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 17 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   hotel_address                               386803 non-null  object 
 1   additional_number_of_scoring                386803 non-null  int64  
 2   review_date                                 386803 non-null  object 
 3   average_score                               386803 non-null  float64
 4   hotel_name                                  386803 non-null  object 
 5   reviewer_nationality                        386803 non-null  object 
 6   negative_review                             386803 non-null  object 
 7   review_total_negative_word_counts           386803 non-null  int64  
 8   total_number_of_reviews                     386803 non-null  int64  
 9   positive_review                             386803 non-null  object 
 

In [83]:
print("Количество уникальных названий отелей представлено в наборе данных: ", len(hotels['hotel_name'].unique()))

Количество уникальных названий отелей представлено в наборе данных:  1492


In [84]:
print("Даты размещения отзывов:\n", pd.to_datetime(hotels['review_date']).sort_values())

Даты размещения отзывов:
 143997   2015-08-04
86268    2015-08-04
159209   2015-08-04
217001   2015-08-04
28485    2015-08-04
            ...    
167325   2017-08-03
372341   2017-08-03
54669    2017-08-03
321881   2017-08-03
315137   2017-08-03
Name: review_date, Length: 386803, dtype: datetime64[ns]


Выясним, сколько уникальных тегов представлено в наборе данных и какой самый популярный

In [85]:
dict_tags = {}
n = len(hotels['tags'])

for i in range(n):
    for tag in hotels['tags'].iloc[i].replace("[","").replace("]","").replace("'","").split(sep=','):    
        tmp = tag.strip()
        count = dict_tags.get(tmp,0)
        dict_tags[tmp] = count+1
        
df_tags = pd.DataFrame.from_dict(dict_tags, orient='index').reset_index()
df_tags.columns = ['tag', 'count']
print(df_tags.sort_values('count', ascending=False))

                                    tag   count
0                          Leisure trip  313593
4        Submitted from a mobile device  230778
1                                Couple  189212
7                        Stayed 1 night  145373
3                       Stayed 2 nights  100263
...                                 ...     ...
2115             Studio with Spa Access       1
1024                Comfort Family Room       1
2112             Junior Suite Free Wifi       1
977                        Design Suite       1
2367  Executive Double Room Non Smoking       1

[2368 rows x 2 columns]


## 2. Извлечение информации из строковых данных, которую в дальнейшем можно представить в числовом виде.


2.1. Из признака "адрес отеля" выделим новые признаки "город" и "страна"

In [86]:
hotels['country'] = hotels['hotel_address'].apply(lambda x: x.split()[-1] if x.split()[-1]!='Kingdom' else (x.split()[-2]+' '+x.split()[-1]))
hotels['sity'] = hotels['hotel_address'].apply(lambda x: x.split()[-2] if x.split()[-2]!='United' else x.split()[-5])

2.2. Рассмотрим признак "review_date", - выделим из него год и месяц и сразу преобразуем их в числовые признаки

In [87]:
hotels['year'] = (pd.to_datetime(hotels['review_date']).dt.year).astype(int)
hotels['month'] = (pd.to_datetime(hotels['review_date']).dt.month).astype(int)

2.3. Из значения признака "days_since_review" уберем слово "days" и сразу преобразуем его из строкового в числовой

In [88]:
hotels['days_since_review'] = hotels['days_since_review'].apply(lambda x: int(x.split()[0]))

2.4. Проанализируем позитивные и негативные отзывы с помощью SentimentIntensityAnalyzer, из четырех категорий словаря оценок выберем составную оценку ("compound").

In [89]:
nltk.downloader.download('vader_lexicon')
sent_analyzer = SentimentIntensityAnalyzer()
pos = hotels["positive_review"].apply(lambda x: sent_analyzer.polarity_scores(x)["compound"])
neg = hotels["negative_review"].apply(lambda x: sent_analyzer.polarity_scores(x)["compound"])

hotels["polarity_pos"] = pos
hotels["polarity_neg"] = neg

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Надежда\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


2.5. Сформируем новый бинарный признак "tags_bin", который будет обозначать, входит ли один из тегов, который указал пользователь в десятку самых популярных.

In [90]:
def func(arg):
    list_tags = arg.replace("[","").replace("]","").replace("'","").split(sep=',')
    for i in list_tags:
        tmp = i.strip()
        if tmp in list(df_tags['tag'].iloc[:10]):
            return 1
        else:
            return 0

hotels['tags_bin'] = hotels['tags'].apply(func)
print(hotels['tags_bin'].value_counts())

1    382176
0      4627
Name: tags_bin, dtype: int64


2.6. Удалим строковые признаки, которые нам больше не нужны, т.к. мы выделили из них необходимую информацию.

In [91]:
hotels = hotels.drop(['hotel_address', 'review_date', 'positive_review', 'negative_review', 'tags'], axis=1)
hotels.info()
display(hotels)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386803 entries, 0 to 386802
Data columns (total 19 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   additional_number_of_scoring                386803 non-null  int64  
 1   average_score                               386803 non-null  float64
 2   hotel_name                                  386803 non-null  object 
 3   reviewer_nationality                        386803 non-null  object 
 4   review_total_negative_word_counts           386803 non-null  int64  
 5   total_number_of_reviews                     386803 non-null  int64  
 6   review_total_positive_word_counts           386803 non-null  int64  
 7   total_number_of_reviews_reviewer_has_given  386803 non-null  int64  
 8   reviewer_score                              386803 non-null  float64
 9   days_since_review                           386803 non-null  int64  
 

Unnamed: 0,additional_number_of_scoring,average_score,hotel_name,reviewer_nationality,review_total_negative_word_counts,total_number_of_reviews,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,days_since_review,lat,lng,country,sity,year,month,polarity_pos,polarity_neg,tags_bin
0,581,8.4,The May Fair Hotel,United Kingdom,3,1994,4,7,10.0,531,51.507894,-0.143671,United Kingdom,London,2016,2,0.5859,0.0000,1
1,299,8.3,Mercure London Bloomsbury Hotel,United Kingdom,3,1361,2,14,6.3,203,51.521009,-0.123097,United Kingdom,London,2017,1,0.0000,-0.4767,1
2,32,8.9,Legend Saint Germain by Elegancia,China,6,406,0,14,7.5,289,48.845377,2.325643,France,Paris,2016,10,0.3400,-0.2960,1
3,34,7.5,Mercure Paris 19 Philharmonie La Villette,United Kingdom,0,607,11,8,10.0,681,48.888697,2.394540,France,Paris,2015,9,0.9169,-0.7096,1
4,914,8.5,Golden Tulip Amsterdam West,Poland,4,7586,20,10,9.6,516,52.385601,4.847060,Netherlands,Amsterdam,2016,3,0.8503,-0.2500,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386798,107,9.0,Hotel Moonlight,France,0,617,10,10,8.8,106,51.494028,-0.191050,United Kingdom,London,2017,4,0.0000,-0.7096,1
386799,272,8.4,BEST WESTERN PLUS Amedia Wien,Turkey,0,3224,93,1,9.2,171,48.192379,16.399451,Austria,Vienna,2017,2,0.7264,-0.7096,1
386800,457,6.8,Bloomsbury Palace Hotel,Netherlands,12,2751,9,21,8.3,543,51.520795,-0.131084,United Kingdom,London,2016,2,0.7845,0.0000,1
386801,365,8.1,The Marble Arch London,United Arab Emirates,0,1567,6,28,9.2,74,51.515125,-0.160066,United Kingdom,London,2017,5,0.0000,-0.7096,1


## 3. Очистка от пропущенных значений

Мы не можем просто заполнить пропущенные значения в столбцах "lat" и "lng" (широта/долгота) усредненным/медианным/модальным значением по этим столбцам, т.к. это противоречит логике формирования этих столбцов. Посчитаем усредненное или медианное значение широты и долготы для каждого города отдельно и уже ими заполним соответствующие пропуски.

In [92]:
lat_sity = hotels.groupby(
    by=['sity'],
    as_index=False
)['lat'].mean()

lng_sity = hotels.groupby(
    by=['sity'],
    as_index=False
)['lng'].median()

print(lat_sity)

count=0
#index=0
for i,j in enumerate(hotels['lat']):    
    if  math.isnan(hotels['lat'].iloc[i]): # если отсутствует значение        
        tmp_sity = hotels['sity'].iloc[i] 

        #if index<3: # для проверки выводим первые 5 пустых значений  
            #print('tmp_sity = ', tmp_sity, '\n', hotels[['sity', 'lat']].iloc[i]) 
        hotels['lat'].iloc[i] = lat_sity[lat_sity['sity'] == tmp_sity]['lat']  
        #if index<3:
            #print('после заполнения пропуска:\n', hotels[['sity', 'lat']].iloc[i])  
            #index += 1 
        
    if  math.isnan(hotels['lng'].iloc[i]):        
        tmp_sity = hotels['sity'].iloc[i]        
        hotels['lng'].iloc[i] = lng_sity[lng_sity['sity'] == tmp_sity]['lng']   
        
#hotels.info()

        sity        lat
0  Amsterdam  52.362209
1  Barcelona  41.389125
2     London  51.510737
3      Milan  45.479619
4      Paris  48.863658
5     Vienna  48.203368


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotels['lat'].iloc[i] = lat_sity[lat_sity['sity'] == tmp_sity]['lat']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotels['lng'].iloc[i] = lng_sity[lng_sity['sity'] == tmp_sity]['lng']


## 4. Преобразование признаков. Кодирование. Нормализация. Стандартизация.

In [93]:
print("Количество уникальных значений в столбцах:")
print('hotel_name: ', len(hotels['hotel_name'].value_counts()))
print('reviewer_nationality:', len(hotels['reviewer_nationality'].value_counts()))
print('country: ', len(hotels['country'].value_counts()))
print('sity: ', len(hotels['sity'].value_counts()))

Количество уникальных значений в столбцах:
hotel_name:  1492
reviewer_nationality: 225
country:  6
sity:  6


К столбцам "country" и "sity" применим однократное кодирование, к столбцам "hotel_name" и "reviewer_nationality" - двоичное

In [94]:
encoder = ce.OneHotEncoder(cols=['country', 'sity'], use_cat_names=True) 
one_encoder = encoder.fit_transform(hotels[['country', 'sity']])

hotels = hotels.drop(['country', 'sity'], axis=1)
hotels = pd.concat([hotels, one_encoder], axis=1)

In [95]:
bin_encoder = ce.BinaryEncoder(cols=['reviewer_nationality', 'hotel_name']) # указываем столбец для кодирования
type_bin = bin_encoder.fit_transform(hotels[['reviewer_nationality', 'hotel_name']])

hotels = hotels.drop(['reviewer_nationality', 'hotel_name'], axis=1)
hotels = pd.concat([hotels,type_bin], axis=1)

display(hotels)

Unnamed: 0,additional_number_of_scoring,average_score,review_total_negative_word_counts,total_number_of_reviews,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,days_since_review,lat,lng,...,hotel_name_1,hotel_name_2,hotel_name_3,hotel_name_4,hotel_name_5,hotel_name_6,hotel_name_7,hotel_name_8,hotel_name_9,hotel_name_10
0,581,8.4,3,1994,4,7,10.0,531,51.507894,-0.143671,...,0,0,0,0,0,0,0,0,0,1
1,299,8.3,3,1361,2,14,6.3,203,51.521009,-0.123097,...,0,0,0,0,0,0,0,0,1,0
2,32,8.9,6,406,0,14,7.5,289,48.845377,2.325643,...,0,0,0,0,0,0,0,0,1,1
3,34,7.5,0,607,11,8,10.0,681,48.888697,2.394540,...,0,0,0,0,0,0,0,1,0,0
4,914,8.5,4,7586,20,10,9.6,516,52.385601,4.847060,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386798,107,9.0,0,617,10,10,8.8,106,51.494028,-0.191050,...,1,0,1,1,1,0,1,0,1,1
386799,272,8.4,0,3224,93,1,9.2,171,48.192379,16.399451,...,0,1,0,1,0,1,0,0,1,1
386800,457,6.8,12,2751,9,21,8.3,543,51.520795,-0.131084,...,0,1,1,0,0,0,0,1,0,0
386801,365,8.1,0,1567,6,28,9.2,74,51.515125,-0.160066,...,0,1,0,0,1,1,0,0,0,0


Проведем нормализацию некоторых признаков, но сначала оценим их характеристики - среднее значение и отклонение

In [96]:
print(hotels['additional_number_of_scoring'].mean(), hotels['additional_number_of_scoring'].std(), ' additional_number_of_scoring')
print(hotels['average_score'].mean(), hotels['average_score'].std(), ' average_score')
print(hotels['review_total_negative_word_counts'].mean(), hotels['review_total_negative_word_counts'].std(), ' review_total_negative_word_counts')
print(hotels['total_number_of_reviews'].mean(),hotels['total_number_of_reviews'].std(), ' total_number_of_reviews')
print(hotels['review_total_positive_word_counts'].mean(),hotels['review_total_positive_word_counts'].std(), ' review_total_positive_word_counts')
print(hotels['total_number_of_reviews_reviewer_has_given'].mean(),hotels['total_number_of_reviews_reviewer_has_given'].std(), ' total_number_of_reviews_reviewer_has_given')
print(hotels['days_since_review'].mean(),hotels['days_since_review'].std(), ' days_since_review')

498.24653635054534 500.25801206714215  additional_number_of_scoring
8.397230890143648 0.5478808088554245  average_score
18.538987546632264 29.70336908497516  review_total_negative_word_counts
2743.992042460891 2316.457018227235  total_number_of_reviews
17.776984666613238 21.726141151924956  review_total_positive_word_counts
7.177250434975944 11.054419786977231  total_number_of_reviews_reviewer_has_given
354.38797785953057 208.97517358180167  days_since_review


In [97]:
r_scaler = preprocessing.RobustScaler()

# кодируем исходный датасет
df_mm = r_scaler.fit_transform(hotels[['additional_number_of_scoring', 'average_score', 'review_total_negative_word_counts', 
                                       'total_number_of_reviews', 'review_total_positive_word_counts', 
                                        'total_number_of_reviews_reviewer_has_given', 'days_since_review']])

# Преобразуем промежуточный датасет в полноценный датафрейм 
df_mm = pd.DataFrame(df_mm, columns=['additional_number_of_scoring', 'average_score', 'review_total_negative_word_counts', 
                                       'total_number_of_reviews', 'review_total_positive_word_counts', 
                                        'total_number_of_reviews_reviewer_has_given', 'days_since_review'])

hotels = hotels.drop(['additional_number_of_scoring', 'average_score', 'review_total_negative_word_counts', 
                                       'total_number_of_reviews', 'review_total_positive_word_counts', 
                                        'total_number_of_reviews_reviewer_has_given', 'days_since_review'], axis=1)
hotels = pd.concat([hotels,df_mm], axis=1)
display(df_mm.describe())
display(hotels)

Unnamed: 0,additional_number_of_scoring,average_score,review_total_negative_word_counts,total_number_of_reviews,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,days_since_review
count,386803.0,386803.0,386803.0,386803.0,386803.0,386803.0,386803.0
mean,0.318221,-0.003956,0.454238,0.248773,0.398646,0.59675,0.003943
std,1.018855,0.782687,1.414446,0.944721,1.278008,1.579203,0.593679
min,-0.694501,-4.571429,-0.428571,-0.852773,-0.647059,-0.285714,-1.002841
25%,-0.352342,-0.428571,-0.333333,-0.396819,-0.352941,-0.285714,-0.505682
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.647658,0.571429,0.666667,0.603181,0.647059,0.714286,0.494318
max,4.765784,2.0,19.0,5.928222,22.588235,50.285714,1.071023


Unnamed: 0,reviewer_score,lat,lng,year,month,polarity_pos,polarity_neg,tags_bin,country_United Kingdom,country_France,...,hotel_name_8,hotel_name_9,hotel_name_10,additional_number_of_scoring,average_score,review_total_negative_word_counts,total_number_of_reviews,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,days_since_review
0,10.0,51.507894,-0.143671,2016,2,0.5859,0.0000,1,1,0,...,0,0,1,0.486762,0.000000,-0.285714,-0.057096,-0.411765,0.571429,0.505682
1,6.3,51.521009,-0.123097,2017,1,0.0000,-0.4767,1,1,0,...,0,1,0,-0.087576,-0.142857,-0.285714,-0.315253,-0.529412,1.571429,-0.426136
2,7.5,48.845377,2.325643,2016,10,0.3400,-0.2960,1,0,1,...,0,1,1,-0.631365,0.714286,-0.142857,-0.704731,-0.647059,1.571429,-0.181818
3,10.0,48.888697,2.394540,2015,9,0.9169,-0.7096,1,0,1,...,1,0,0,-0.627291,-1.285714,-0.428571,-0.622757,0.000000,0.714286,0.931818
4,9.6,52.385601,4.847060,2016,3,0.8503,-0.2500,1,0,0,...,1,0,1,1.164969,0.142857,-0.238095,2.223491,0.529412,1.000000,0.463068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386798,8.8,51.494028,-0.191050,2017,4,0.0000,-0.7096,1,1,0,...,0,1,1,-0.478615,0.857143,-0.428571,-0.618679,-0.058824,1.000000,-0.701705
386799,9.2,48.192379,16.399451,2017,2,0.7264,-0.7096,1,0,0,...,0,1,1,-0.142566,0.000000,-0.428571,0.444535,4.823529,-0.285714,-0.517045
386800,8.3,51.520795,-0.131084,2016,2,0.7845,0.0000,1,1,0,...,1,0,0,0.234216,-2.285714,0.142857,0.251631,-0.117647,2.571429,0.539773
386801,9.2,51.515125,-0.160066,2017,5,0.0000,-0.7096,1,1,0,...,0,0,0,0.046843,-0.428571,-0.428571,-0.231240,-0.294118,3.571429,-0.792614


Для всех признаков рассматриваемых в этом разделе провели нормализацию. Стандартизацию проводить не будем =)

## 5. Отбор признаков

Построим матрицу корреляции и удалим признаки с очень сильной корреляцией (где коэффициент корреляции +/-0.7 и выше). Т.к. матрица имеет достаточно большую размерность, - автоматизируем процесс нахождения столбцов с сильной кореляцией.

In [98]:
matr_corr = hotels.corr()
display(matr_corr)

df_mc = pd.DataFrame(matr_corr)

row_name = df_mc.columns
col_name = df_mc.index

print("Признаки с сильной корреляцией: \n")
for i in range(len(df_mc.columns)):    
    for j in range(len(df_mc.index)):        
        if row_name[i] != col_name[j] and i<j and (df_mc[row_name[i]].iloc[j]>=0.7 or df_mc[row_name[i]].iloc[j]<=-0.7):
            print(row_name[i], col_name[j], df_mc[row_name[i]].iloc[j])


Unnamed: 0,reviewer_score,lat,lng,year,month,polarity_pos,polarity_neg,tags_bin,country_United Kingdom,country_France,...,hotel_name_8,hotel_name_9,hotel_name_10,additional_number_of_scoring,average_score,review_total_negative_word_counts,total_number_of_reviews,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,days_since_review
reviewer_score,1.0,-0.036836,0.032558,0.011043,-0.032397,0.381877,-0.031121,0.015625,-0.043552,0.00524,...,0.004343,0.004979,0.013533,-0.061773,0.363915,-0.383407,-0.073389,0.220294,0.002498,0.002408
lat,-0.036836,1.0,-0.302428,0.01068,-0.02588,-0.028508,0.016596,0.00174,0.610783,-0.059601,...,0.048403,-0.062193,-0.011459,0.337425,-0.107595,0.019437,0.101132,-0.038695,-0.059452,1.4e-05
lng,0.032558,-0.302428,1.0,-0.010312,0.025824,0.023645,-0.015784,-0.011226,-0.661089,-0.043887,...,0.022642,0.065904,-0.001409,-0.298158,0.100065,-0.020535,-0.015955,0.035124,0.110701,-0.000391
year,0.011043,0.01068,-0.010312,1.0,-0.58147,0.01461,0.050221,-0.00586,0.00412,-0.015317,...,0.01588,0.001813,-0.00077,-0.004939,0.015777,0.038963,-0.004386,0.048889,-0.036136,-0.917355
month,-0.032397,-0.02588,0.025824,-0.58147,1.0,-0.02559,-0.02093,-0.000424,-0.023756,0.012758,...,-0.008369,-0.00033,-0.009584,-0.013643,-0.00952,-0.009451,-0.005048,-0.026909,0.015342,0.211401
polarity_pos,0.381877,-0.028508,0.023645,0.01461,-0.02559,1.0,0.01058,0.019954,-0.041396,0.007897,...,0.000925,0.007622,0.013013,-0.060917,0.161882,0.004846,-0.052001,0.42005,0.012659,-0.00507
polarity_neg,-0.031121,0.016596,-0.015784,0.050221,-0.02093,0.01058,1.0,0.001199,0.016069,-0.012562,...,0.000324,-0.002502,-0.002403,0.013881,-0.019926,0.200842,0.01099,0.052627,0.009014,-0.050129
tags_bin,0.015625,0.00174,-0.011226,-0.00586,-0.000424,0.019954,0.001199,1.0,-0.000245,-0.001243,...,-0.001907,0.001354,0.008686,0.004028,0.018399,-0.001219,0.00034,0.012022,0.004329,0.007285
country_United Kingdom,-0.043552,0.610783,-0.661089,0.00412,-0.023756,-0.041396,0.016069,-0.000245,1.0,-0.368436,...,0.014858,-0.110375,-0.032675,0.441168,-0.105822,0.022883,0.10685,-0.061724,-0.086079,0.007054
country_France,0.00524,-0.059601,-0.043887,-0.015317,0.012758,0.007897,-0.012562,-0.001243,-0.368436,1.0,...,-0.005632,0.001477,0.010363,-0.253002,0.007586,-0.021332,-0.229234,0.018812,0.019623,0.011982


Признаки с сильной корреляцией: 

lat country_Spain -0.8447475802538315
lat sity_Barcelona -0.8447475802538315
lng country_Austria 0.8248297217147678
lng sity_Vienna 0.8248297217147678
year days_since_review -0.9173550540665756
country_United Kingdom sity_London 1.0
country_France sity_Paris 1.0
country_Netherlands sity_Amsterdam 1.0
country_Italy sity_Milan 1.0
country_Austria sity_Vienna 1.0
country_Spain sity_Barcelona 1.0
additional_number_of_scoring total_number_of_reviews 0.8244671077888321


In [99]:
hotels = hotels.drop(['sity_Barcelona', 'sity_Vienna', 'year', 'country_United Kingdom', 'country_France', 'country_Netherlands', 'country_Italy', 
                      'country_Austria', 'country_Spain', 'additional_number_of_scoring'], axis=1)
#display(hotels)
#display(hotels.corr())

## 6. Обучение модели, получение итоговой метрики

In [100]:
# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели  
# Х - данные с информацией об отелях, у - целевая переменная (рейтинги отелей)  
X = hotels.drop(['reviewer_score'], axis = 1)  
y = hotels['reviewer_score'] 

# Загружаем специальный инструмент для разбивки:  
from sklearn.model_selection import train_test_split  

# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.  
# Для тестирования мы будем использовать 25% от исходного датасета.  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Импортируем необходимые библиотеки:  
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели  
from sklearn import metrics # инструменты для оценки точности модели  
  
# Создаём модель  
regr = RandomForestRegressor(n_estimators=100)  
      
# Обучаем модель на тестовом наборе данных  
regr.fit(X_train, y_train)  
      
# Используем обученную модель для предсказания рейтинга отелей в тестовой выборке.  
# Предсказанные значения записываем в переменную y_pred  
y_pred = regr.predict(X_test) 

# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они отличаются  
# Метрика называется Mean Absolute Percentage Error (MAPE) и показывает среднюю абсолютную процентную ошибку предсказанных значений от фактических.  
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_pred))

MAPE: 0.12598238745282506
