In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from scipy.stats import ttest_ind

pd.set_option('display.max_rows', 50) # показывать больше строк
pd.set_option('display.max_columns', 50) # показывать больше колонок

In [2]:
df = pd.read_csv('main_task.csv')

Restaurant_id — идентификационный номер ресторана / сети ресторанов;
City — город, в котором находится ресторан;
Cuisine Style — кухня или кухни, к которым можно отнести блюда, предлагаемые в ресторане;
Ranking — место, которое занимает данный ресторан среди всех ресторанов своего города;
Rating — рейтинг ресторана по данным TripAdvisor (именно это значение должна будет предсказывать модель);
Price Range — диапазон цен в ресторане;
Number of Reviews — количество отзывов о ресторане;
Reviews — данные о двух отзывах, которые отображаются на сайте ресторана;
URL_TA — URL страницы ресторана на TripAdvosor;
ID_TA — идентификатор ресторана в базе данных TripAdvisor.

In [3]:
df.head()

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963


In [4]:
df.shape

(40000, 10)

In [5]:
df.isna().sum()

Restaurant_id            0
City                     0
Cuisine Style         9283
Ranking                  0
Rating                   0
Price Range          13886
Number of Reviews     2543
Reviews                  0
URL_TA                   0
ID_TA                    0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


In [7]:
df.dtypes


Restaurant_id         object
City                  object
Cuisine Style         object
Ranking              float64
Rating               float64
Price Range           object
Number of Reviews    float64
Reviews               object
URL_TA                object
ID_TA                 object
dtype: object

In [8]:
pd.DataFrame(df['Price Range'].value_counts())

Unnamed: 0,Price Range
$$ - $$$,18412
$,6279
$$$$,1423


In [9]:
df.drop(['Price Range'], inplace = True, axis = 1)

In [10]:
df.head()

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963


Произведем замену NaN  в столбце 'Number of Reviews' занчениями медианы.

In [11]:
c = df['Number of Reviews'].median()

In [12]:
v = {'Number of Reviews': c}

In [13]:

df.fillna(value = v, inplace = True)
df

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643
1,id_1535,Stockholm,,1537.0,4.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781
3,id_3456,Berlin,,3458.0,5.0,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963
...,...,...,...,...,...,...,...,...,...
39995,id_499,Milan,"['Italian', 'Vegetarian Friendly', 'Vegan Opti...",500.0,4.5,79.0,"[['The real Italian experience!', 'Wonderful f...",/Restaurant_Review-g187849-d2104414-Reviews-Ro...,d2104414
39996,id_6340,Paris,"['French', 'American', 'Bar', 'European', 'Veg...",6341.0,3.5,542.0,"[['Parisian atmosphere', 'Bit pricey but inter...",/Restaurant_Review-g187147-d1800036-Reviews-La...,d1800036
39997,id_1649,Stockholm,"['Japanese', 'Sushi']",1652.0,4.5,4.0,"[['Good by swedish standards', 'A hidden jewel...",/Restaurant_Review-g189852-d947615-Reviews-Sus...,d947615
39998,id_640,Warsaw,"['Polish', 'European', 'Eastern European', 'Ce...",641.0,4.0,70.0,"[['Underground restaurant', 'Oldest Restaurant...",/Restaurant_Review-g274856-d1100838-Reviews-Ho...,d1100838


Проверим 

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Number of Reviews  40000 non-null  float64
 6   Reviews            40000 non-null  object 
 7   URL_TA             40000 non-null  object 
 8   ID_TA              40000 non-null  object 
dtypes: float64(3), object(6)
memory usage: 2.7+ MB


Restaurant_id — идентификационный номер ресторана / сети ресторанов;
City — город, в котором находится ресторан;
Cuisine Style — кухня или кухни, к которым можно отнести блюда, предлагаемые в ресторане;
Ranking — место, которое занимает данный ресторан среди всех ресторанов своего города;
Rating — рейтинг ресторана по данным TripAdvisor (именно это значение должна будет предсказывать модель);
Price Range — диапазон цен в ресторане;
Number of Reviews — количество отзывов о ресторане;
Reviews — данные о двух отзывах, которые отображаются на сайте ресторана;
URL_TA — URL страницы ресторана на TripAdvosor;
ID_TA — идентификатор ресторана в базе данных TripAdvisor.

для замены NaN в столбце Cuisine Style найдем название наиболее часто встречающейся кухни

In [15]:
pd.DataFrame(df['Cuisine Style'].value_counts())

Unnamed: 0,Cuisine Style
['Italian'],1032
['French'],805
['Spanish'],695
"['French', 'European']",405
['Cafe'],403
...,...
"['European', 'British', 'Scottish', 'Gluten Free Options', 'Vegetarian Friendly', 'Vegan Options']",1
"['Mexican', 'Delicatessen', 'Healthy', 'Gastropub', 'Vegetarian Friendly', 'Vegan Options']",1
"['Gluten Free Options', 'Vegan Options']",1
"['Gastropub', 'Irish', 'European', 'Pub', 'Vegetarian Friendly', 'Gluten Free Options']",1


сделаем замену значений NaN в столбце на  Italian

In [16]:
m = {'Cuisine Style': 'Italian'}
df.fillna(value = m, inplace = True)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      40000 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Number of Reviews  40000 non-null  float64
 6   Reviews            40000 non-null  object 
 7   URL_TA             40000 non-null  object 
 8   ID_TA              40000 non-null  object 
dtypes: float64(3), object(6)
memory usage: 2.7+ MB


проверим

In [18]:
df.isna().sum()

Restaurant_id        0
City                 0
Cuisine Style        0
Ranking              0
Rating               0
Number of Reviews    0
Reviews              0
URL_TA               0
ID_TA                0
dtype: int64

In [19]:
df = pd.get_dummies(df, columns=[ 'City',], dummy_na=True)

In [20]:
df.head()

Unnamed: 0,Restaurant_id,Cuisine Style,Ranking,Rating,Number of Reviews,Reviews,URL_TA,ID_TA,City_Amsterdam,City_Athens,City_Barcelona,City_Berlin,City_Bratislava,City_Brussels,City_Budapest,City_Copenhagen,City_Dublin,City_Edinburgh,City_Geneva,City_Hamburg,City_Helsinki,City_Krakow,City_Lisbon,City_Ljubljana,City_London,City_Luxembourg,City_Lyon,City_Madrid,City_Milan,City_Munich,City_Oporto,City_Oslo,City_Paris,City_Prague,City_Rome,City_Stockholm,City_Vienna,City_Warsaw,City_Zurich,City_nan
0,id_5569,"['European', 'French', 'International']",5570.0,3.5,194.0,"[['Good food at your doorstep', 'A good hotel ...",/Restaurant_Review-g187147-d1912643-Reviews-R_...,d1912643,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,id_1535,Italian,1537.0,4.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",/Restaurant_Review-g189852-d7992032-Reviews-Bu...,d7992032,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,id_352,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,688.0,"[['Catch up with friends', 'Not exceptional'],...",/Restaurant_Review-g186338-d8632781-Reviews-RO...,d8632781,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,id_3456,Italian,3458.0,5.0,3.0,"[[], []]",/Restaurant_Review-g187323-d1358776-Reviews-Es...,d1358776,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,id_615,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,84.0,"[['Best place to try a Bavarian food', 'Nice b...",/Restaurant_Review-g187309-d6864963-Reviews-Au...,d6864963,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [21]:
pd.DataFrame(df['Restaurant_id'].value_counts())

Unnamed: 0,Restaurant_id
id_436,18
id_227,18
id_871,18
id_633,18
id_430,17
...,...
id_9359,1
id_7232,1
id_11276,1
id_8356,1


In [23]:
pd.DataFrame(df['Cuisine Style'].value_counts())

Unnamed: 0,Cuisine Style
Italian,9283
['Italian'],1032
['French'],805
['Spanish'],695
"['French', 'European']",405
...,...
"['Italian', 'Pizza', 'Mediterranean', 'Seafood', 'Wine Bar', 'Vegetarian Friendly', 'Vegan Options', 'Gluten Free Options']",1
"['Seafood', 'British', 'Street Food', 'Scottish', 'Fast Food', 'Vegetarian Friendly']",1
"['Italian', 'Pizza', 'Cafe', 'European', 'Gastropub', 'Vegetarian Friendly']",1
"['International', 'European', 'Czech', 'Vegetarian Friendly', 'Vegan Options']",1


In [24]:
pd.DataFrame(df['Ranking'].value_counts())

Unnamed: 0,Ranking
1080.0,19
437.0,18
491.0,18
345.0,18
431.0,18
...,...
14711.0,1
9409.0,1
14109.0,1
10501.0,1


In [25]:
pd.DataFrame(df['Number of Reviews'].value_counts())

Unnamed: 0,Number of Reviews
33.0,2801
2.0,1916
3.0,1636
4.0,1370
5.0,1181
...,...
2141.0,1
920.0,1
1404.0,1
979.0,1


In [26]:

pd.DataFrame(df['Reviews'].value_counts())

Unnamed: 0,Reviews
"[[], []]",6471
"[['Very good all around', 'Our first Madrid restaurant'], ['11/20/2017', '10/11/2017']]",2
"[['Great Food! Wide Variety.', 'Dinner'], ['11/26/2017', '05/23/2017']]",2
"[['Great food, service and atmosphere', 'Decor and food'], ['11/06/2017', '10/22/2017']]",2
"[['Sat night', 'Can\'t even explain.. ""A"" place.. A ""100...'], ['10/22/2017', '05/11/2017']]",2
...,...
"[['Quickest pizza ever', 'Great'], ['06/09/2017', '10/17/2016']]",1
"[['Poor service', 'Ok on some conditions, details below....'], ['06/26/2016', '04/14/2016']]",1
"[['perfect', 'Best frites in Amsterdam'], ['08/30/2017', '08/25/2017']]",1
"[['Good value for money', 'A Parisian tradition, but know before you...'], ['01/05/2018', '12/31/2017']]",1


In [27]:
pd.DataFrame(df['URL_TA'].value_counts())

Unnamed: 0,URL_TA
/Restaurant_Review-g187514-d10047799-Reviews-Entrepuntos-Madrid.html,2
/Restaurant_Review-g187514-d13002276-Reviews-Delish_Vegan_Doughnuts-Madrid.html,2
/Restaurant_Review-g187514-d4600226-Reviews-Cafeteria_Teresa-Madrid.html,2
/Restaurant_Review-g187514-d11999956-Reviews-SugarCane_Madrid-Madrid.html,2
/Restaurant_Review-g187514-d7342803-Reviews-Los_Hierros-Madrid.html,2
...,...
/Restaurant_Review-g186338-d2006332-Reviews-Poppy_Hana-London_England.html,1
/Restaurant_Review-g947638-d4662709-Reviews-La_Veraison-Watermael_Boitsfort_Brussels.html,1
/Restaurant_Review-g187497-d2056882-Reviews-Yamane_Cafe-Barcelona_Catalonia.html,1
/Restaurant_Review-g186338-d7276967-Reviews-Subway-London_England.html,1


In [28]:

pd.DataFrame(df['ID_TA'].value_counts())

Unnamed: 0,ID_TA
d1315077,2
d11999956,2
d4600226,2
d11698990,2
d7342803,2
...,...
d4176712,1
d3331208,1
d2419405,1
d805156,1


In [29]:
df.drop(['Cuisine Style','Reviews','URL_TA','ID_TA'], inplace = True, axis = 1)

In [30]:
# Х - данные с информацией о ресторанах, у - целевая переменная (рейтинги ресторанов)
X = df.drop(['Restaurant_id', 'Rating'], axis = 1)
y = df['Rating']

In [31]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [32]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.
# Для тестирования мы будем использовать 25% от исходного датасета.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

Создаём, обучаем и тестируем модель

In [33]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [34]:
# Создаём модель
regr = RandomForestRegressor(n_estimators=100)



In [35]:
# Обучаем модель на тестовом наборе данных
regr.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [36]:
# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = regr.predict(X_test)

In [37]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.21596099999999996
