# Загрузка Pandas и очистка данных

In [201]:
import pandas as pd
import numpy as np

In [379]:
# Load the restaurant dataset. The file is parsed as CSV even though its
# name carries an ".xls" extension.
df = pd.read_csv('main_task.xls')

In [380]:
# Normalise 'Cuisine Style' and derive the 'Veg' flag: 1 if the restaurant
# offers special (vegetarian, vegan or gluten-free) food, 0 otherwise.
#
# The brackets, quotes and spaces of the stringified list are stripped in a
# single pass. `regex=True` is passed explicitly because the default value of
# `regex` in `Series.str.replace` differs between pandas versions.
df['Cuisine Style'] = df['Cuisine Style'].str.replace(r"[\[\]' ]", '', regex=True)

# Spaces were removed above, hence the run-together option names.
# na=False: a restaurant with a missing cuisine list is treated as not
# offering special food (flag 0).
df['Veg'] = (
    df['Cuisine Style']
    .str.contains('VegetarianFriendly|GlutenFreeOptions|VeganOptions', na=False)
    .astype(int)
)

In [381]:
# Collapse sparsely represented cities into one 'other Cities' bucket and
# one-hot encode the resulting 'City' column.
#
# Per-city restaurant counts, most frequent first. The axis and the counts
# column are named explicitly so the frame has the columns
# ['City', 'Qty of rests'] regardless of how the installed pandas version
# names the output of `value_counts()`.
A = df['City'].value_counts().rename_axis('City').reset_index(name='Qty of rests')

# A city is kept only if it has more restaurants than the 10% quantile of
# the per-city counts.
Top_cities = A[A['Qty of rests'] > A['Qty of rests'].quantile(0.1)]['City']
all_cities = df['City'].value_counts().index
# `all_cities` is sorted by descending count, so its first len(Top_cities)
# entries are exactly the cities above the threshold.
top_cities = list(all_cities)[:len(Top_cities)]
cities_to_throw_away = list(set(all_cities) - set(top_cities))
df.loc[df['City'].isin(cities_to_throw_away), 'City'] = 'other Cities'

# drop_first=True drops one category to avoid a redundant dummy column.
Dummies_City = pd.get_dummies(df['City'], drop_first=True)
df = pd.merge(df, Dummies_City, how='left', left_index=True, right_index=True)

In [382]:
# Label each restaurant 'Most popular Cuisine' when its cuisine list mentions
# European, Mediterranean or Italian, and 'Other Cuisine' otherwise (rows with
# a missing cuisine list fall into 'Other Cuisine'); then one-hot encode the
# label.
is_popular = df['Cuisine Style'].str.contains(
    'European|Mediterranean|Italian', na=False
)
df['Pop Cuisine'] = np.where(is_popular, 'Most popular Cuisine', 'Other Cuisine')
Dummies_Cuisine = pd.get_dummies(df['Pop Cuisine'], drop_first=True)
df = df.merge(Dummies_Cuisine, how='left', left_index=True, right_index=True)

In [383]:
# Number of cuisines offered by each restaurant.
#
# Missing cuisine lists are replaced with the placeholder 'dif', so such
# restaurants count as offering a single cuisine. The result is assigned back
# to the column: `fillna(..., inplace=True)` on a column selection is chained
# assignment and does not reliably modify `df`.
df['Cuisine Style'] = df['Cuisine Style'].fillna('dif')
# Cuisines are comma-separated, so a list of N cuisines contains N - 1 commas.
df['Number of Cuisines'] = df['Cuisine Style'].str.count(',') + 1

In [384]:
# Relative ranking: the restaurant's 'Ranking' divided by the number of
# restaurants the dataset holds for the same city.
#
# The axis and counts column are named explicitly so the lookup frame has the
# columns ['City', 'Qty of rests'] regardless of how the installed pandas
# version names the output of `value_counts()`.
A = df['City'].value_counts().rename_axis('City').reset_index(name='Qty of rests')
df = pd.merge(df, A, on='City', how='left')
df['Ranking_modified'] = df['Ranking'] / df['Qty of rests']

In [385]:
# Recode 'Price Range' into a numeric scale from 0 (cheapest) to 2.
#
# Missing values are filled with the middle category. The result is assigned
# back to the column: `fillna(..., inplace=True)` on a column selection is
# chained assignment and does not reliably modify `df`.
df['Price Range'] = df['Price Range'].fillna('$$ - $$$')
price_dict = {'$':'0', '$$ - $$$':'1', '$$$$':'2'}
# Any label missing from `price_dict` is left untouched by `replace` and makes
# the float conversion raise, so unexpected categories are not silently kept.
df['Price Range'] = df['Price Range'].replace(to_replace=price_dict).astype(float)

In [386]:
# Relative review count: 'Number of Reviews' divided by the mean number of
# reviews of restaurants in the same city.
#
# Missing review counts are filled with the dataset-wide mean first. The
# result is assigned back to the column: `fillna(..., inplace=True)` on a
# column selection is chained assignment and does not reliably modify `df`.
df['Number of Reviews'] = df['Number of Reviews'].fillna(df['Number of Reviews'].mean())

# Per-city mean, as a two-column frame ['City', 'Mean City NoR'].
T = df.groupby('City')['Number of Reviews'].mean().reset_index()
T = T.rename(columns={'Number of Reviews': 'Mean City NoR'})
df = pd.merge(df, T, on='City', how='left')
df['NoR modified'] = df['Number of Reviews'] / df['Mean City NoR']

# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [388]:
# X holds the restaurant features, y the target variable (restaurant ratings).
X = df.drop(['Restaurant_id', 'Rating'], axis=1)
y = df['Rating']
# Keep only numeric and boolean columns, since the remaining text columns
# cannot be fed to the model. Selecting by dtype avoids dropping columns from
# the frame while iterating over it and also excludes string columns that are
# not stored with the `object` dtype.
X = X.select_dtypes(include=['number', 'bool'])

In [389]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [390]:
# The "train" sets are used to fit the model, the "test" sets to evaluate it.
# 25% of the original dataset is held out for testing.
# A fixed random_state makes the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Создаём, обучаем и тестируем модель

In [392]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [393]:
# Create the model. A fixed random_state makes the fitted forest the same on
# every run.
regr = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training set.
regr.fit(X_train, y_train)

# Use the fitted model to predict restaurant ratings for the test set.
# The predicted values are stored in y_pred.
y_pred = regr.predict(X_test)

In [394]:
# Compare the predicted values (y_pred) with the actual ones (y_test) and see how far apart they are on average.
# The metric is the Mean Absolute Error (MAE): the mean absolute deviation of the predictions from the actual values.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.21193056666666665
