In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

pd.set_option('max_colwidth', 120)
pd.set_option('display.width', 500)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 50)

df = pd.read_csv('main_task.xls')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price Range        26114 non-null  object 
 6   Number of Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   URL_TA             40000 non-null  object 
 9   ID_TA              40000 non-null  object 
dtypes: float64(3), object(7)
memory usage: 3.1+ MB


In [3]:
df.head(3)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel restaurant'], ['12/31/2017', '11/20/2017']]",/Restaurant_Review-g187147-d1912643-Reviews-R_Yves-Paris_Ile_de_France.html,d1912643
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food'], ['07/06/2017', '06/19/2016']]",/Restaurant_Review-g189852-d7992032-Reviews-Buddha_Nepal-Stockholm.html,d7992032
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Vegetarian Friendly', 'Vegan Options', 'Gluten Free Options']",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'], ['01/08/2018', '01/06/2018']]",/Restaurant_Review-g186338-d8632781-Reviews-ROKA_Mayfair-London_England.html,d8632781


---------

In [7]:
# Создам словарь с городами и населением в них (в тыс. человек)
population_in_the_сity = {'Paris': 2148, 'Stockholm': 961, 'London': 8908, 'Berlin': 3644, 'Munich': 1471, 'Oporto': 237,
                          'Milan': 1378, 'Bratislava': 437, 'Vienna': 1897, 'Rome': 2870, 'Barcelona': 1664, 'Madrid': 3266,
                          'Dublin': 1173, 'Brussels': 179, 'Zurich': 428, 'Warsaw': 1790, 'Budapest': 1752, 'Copenhagen': 615,
                          'Amsterdam': 872, 'Lyon': 506, 'Hamburg': 1841, 'Lisbon': 505, 'Prague': 1301, 'Oslo': 673,
                          'Helsinki': 655, 'Edinburgh': 488, 'Geneva': 200, 'Ljubljana': 284, 'Athens': 664,
                          'Luxembourg': 602, 'Krakow': 779}

In [8]:
df['Population'] = df['City'].apply(lambda x: population_in_the_сity[x])  # Создание столбца с населением в городе

In [6]:
df.sample(3)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Population
3504,id_7875,Paris,,7876.0,4.5,,4.0,"[['Flaneur 2.'], ['11/30/2017']]",/Restaurant_Review-g187147-d12076815-Reviews-Vivaio_Paris-Paris_Ile_de_France.html,d12076815,2148
27196,id_768,Prague,"['Asian', 'Thai', 'Vegetarian Friendly', 'Vegan Options', 'Gluten Free Options']",769.0,4.0,$$ - $$$,135.0,"[['Nice and Spicy!', 'Awesome noodles'], ['12/16/2017', '09/17/2017']]",/Restaurant_Review-g274707-d8142932-Reviews-Modry_Zub_Andel-Prague_Bohemia.html,d8142932,1301
3062,id_2662,Hamburg,,2670.0,2.0,,,"[[], []]",/Restaurant_Review-g187331-d13172157-Reviews-Die_Taverne-Hamburg.html,d13172157,1841


-----------

In [None]:
df['Price Range'].value_counts()

In [None]:
df['Price Range'].isnull().sum()

-------

In [9]:
df.City.unique()  # Список всех городов

array(['Paris', 'Stockholm', 'London', 'Berlin', 'Munich', 'Oporto',
       'Milan', 'Bratislava', 'Vienna', 'Rome', 'Barcelona', 'Madrid',
       'Dublin', 'Brussels', 'Zurich', 'Warsaw', 'Budapest', 'Copenhagen',
       'Amsterdam', 'Lyon', 'Hamburg', 'Lisbon', 'Prague', 'Oslo',
       'Helsinki', 'Edinburgh', 'Geneva', 'Ljubljana', 'Athens',
       'Luxembourg', 'Krakow'], dtype=object)

In [10]:
city_df = pd.get_dummies(df.City)  # Делаем дамми-признаки всех городов
city_df

Unnamed: 0,Amsterdam,Athens,Barcelona,Berlin,Bratislava,Brussels,Budapest,Copenhagen,Dublin,Edinburgh,Geneva,Hamburg,Helsinki,Krakow,Lisbon,Ljubljana,London,Luxembourg,Lyon,Madrid,Milan,Munich,Oporto,Oslo,Paris,Prague,Rome,Stockholm,Vienna,Warsaw,Zurich
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
39996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
39997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
39998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [11]:
df = pd.concat([df, city_df], axis=1)  # Добавляем дамми-города в таблицу
df.sample(3)

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,URL_TA,ID_TA,Population,Amsterdam,Athens,Barcelona,Berlin,Bratislava,Brussels,Budapest,Copenhagen,Dublin,Edinburgh,Geneva,Hamburg,Helsinki,Krakow,Lisbon,Ljubljana,London,Luxembourg,Lyon,Madrid,Milan,Munich,Oporto,Oslo,Paris,Prague,Rome,Stockholm,Vienna,Warsaw,Zurich
15459,id_2821,Brussels,,2822.0,3.0,,11.0,"[['Very satisfying. Value priced.', 'Howful'], ['10/18/2015', '06/07/2015']]",/Restaurant_Review-g188644-d7290704-Reviews-La_Iberica-Brussels.html,d7290704,179,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37462,id_21,Krakow,"['Italian', 'Polish', 'Mediterranean', 'European', 'Central European', 'Vegetarian Friendly', 'Vegan Options']",22.0,4.5,$$ - $$$,760.0,"[['Nice place but main course were poorly exe...', 'Definitely will come back!'], ['01/06/2018', '12/21/2017']]",/Restaurant_Review-g274772-d2369590-Reviews-Restauracja_Padre-Krakow_Lesser_Poland_Province_Southern_Poland.html,d2369590,779,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19379,id_63,Prague,"['Mexican', 'Latin', 'Central American', 'International', 'Vegetarian Friendly', 'Vegan Options', 'Gluten Free Optio...",64.0,4.5,$$ - $$$,1008.0,"[['Delicious Mexican cuisine.', 'Excellent Mexican restaurant'], ['01/07/2018', '01/06/2018']]",/Restaurant_Review-g274707-d7209419-Reviews-Agave_Restaurant-Prague_Bohemia.html,d7209419,1301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


------------

In [None]:
df = df.drop(['Restaurant_id', 'City', 'Cuisine Style', 'Price Range', 'Reviews', 'URL_TA', 'ID_TA'], axis = 1)
df = df.fillna(0)

In [None]:
X = df.drop(['Rating'], axis = 1)  
y = df['Rating'] 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [None]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [None]:
# Создаём модель
regr = RandomForestRegressor(n_estimators=100)

# Обучаем модель на тестовом наборе данных
regr.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = regr.predict(X_test)

In [None]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))