# Restoran Rating Forecast Model

## Загрузка Pandas и очистка данных

In [1]:
import pandas as pd
import numpy as np
import time
from datetime import datetime

In [2]:
import re

In [3]:
from textblob import TextBlob

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

### Раздел для декларации вспомогательных функций

In [5]:
def clean_spaces(df):
    bad_names = []
    substs = {}
    for col in list(df.columns):
        if ' ' in col:
            bad_names.append(col)
    for col in bad_names:
        substs[col] = '_'.join(col.split())
    # substs
    df.rename(columns=substs, inplace=True)
    return df

In [6]:
#Функция обучения как в постановке задачи
def RFR_test(df):
    was = time.perf_counter()
    X = df.drop(['Restaurant_id', 'Rating'], axis=1)
    y = df['Rating']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=17)
    regr = RandomForestRegressor(n_estimators=100, verbose=1, n_jobs=-1, random_state=17)
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)
    y_pred2 = (y_pred * 2).round() / 2
    print('Длительность:', time.perf_counter() - was, 'sec')
    print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
    print('MAE2:', metrics.mean_absolute_error(y_test, y_pred2))

### Обработка исходных данных

In [7]:

df = pd.read_csv('main_task_new.csv')
cities = pd.read_csv('Cities.csv')    # Это мой файл дополнительных данных

In [8]:
joined = df.merge(cities, on='City', how='left')   #исходный набор обогащен дополнительными полями

In [10]:

df = joined.drop(['URL_TA','ID_TA'],axis=1)
df.head()

Unnamed: 0,Restaurant_id,City,Cuisine Style,Ranking,Rating,Price Range,Number of Reviews,Reviews,Country_Code,Population
0,id_5569,Paris,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",FRA,2140526
1,id_1535,Stockholm,,1537.0,4.0,,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",SWE,975551
2,id_352,London,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",GBR,9126366
3,id_3456,Berlin,,3458.0,5.0,,3.0,"[[], []]",DEU,3748148
4,id_615,Munich,"['German', 'Central European', 'Vegetarian Fri...",621.0,4.0,$$ - $$$,84.0,"[['Best place to try a Bavarian food', 'Nice b...",DEU,1471508


Первоначальная версия датасета состоит из десяти столбцов, содержащих следующую информацию:

Restaurant_id — идентификационный номер ресторана / сети ресторанов;
City — город, в котором находится ресторан;
Cuisine Style — кухня или кухни, к которым можно отнести блюда, предлагаемые в ресторане;
Ranking — место, которое занимает данный ресторан среди всех ресторанов своего города;
Rating — рейтинг ресторана по данным TripAdvisor (именно это значение должна будет предсказывать модель);
Price Range — диапазон цен в ресторане;
Number of Reviews — количество отзывов о ресторане;
Reviews — данные о двух отзывах, которые отображаются на сайте ресторана;
URL_TA — URL страницы ресторана на TripAdvisor;
ID_TA — идентификатор ресторана в базе данных TripAdvisor.
Есть какая-то ошибка в данных в выгрузке из Мадрида. Там примерно 20 дубликатов в полях URL_TA и ID_TA. Заполнить пропуски мне это не поможет.  Если бы умел парсить сайты - можно было бы подтянуть информацию из tripadvisor.com Но пока не умею! ((

Я добавил два столбца - Country_Code и Population. И сразу сбросил 'URL_TA','ID_TA'.  Знал бы сейчас парсинг сайтов - подтянул бы данные с сайта TA.

Несколько колонок имеют пробелы внутри названия и подлежат переименованию

In [11]:
df = clean_spaces(df)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine_Style      30717 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price_Range        26114 non-null  object 
 6   Number_of_Reviews  37457 non-null  float64
 7   Reviews            40000 non-null  object 
 8   Country_Code       40000 non-null  object 
 9   Population         40000 non-null  int64  
dtypes: float64(3), int64(1), object(6)
memory usage: 3.4+ MB


Несколько столбцов имеют явные пропуски значений -  . Убираю их самыми частыми или средними в наборе.

In [13]:
# Заполняю пропуски в Cuisine_Style, Price_Range,Number_of_Reviews не обращая внимания на город
df['Cuisine_Style'].fillna(df['Cuisine_Style'].value_counts().idxmax(), inplace=True)
df['Price_Range'].fillna(df['Price_Range'].value_counts().idxmax(), inplace=True)
df['Number_of_Reviews'].fillna(df['Number_of_Reviews'].median(),inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 0 to 39999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Restaurant_id      40000 non-null  object 
 1   City               40000 non-null  object 
 2   Cuisine_Style      40000 non-null  object 
 3   Ranking            40000 non-null  float64
 4   Rating             40000 non-null  float64
 5   Price_Range        40000 non-null  object 
 6   Number_of_Reviews  40000 non-null  float64
 7   Reviews            40000 non-null  object 
 8   Country_Code       40000 non-null  object 
 9   Population         40000 non-null  int64  
dtypes: float64(3), int64(1), object(6)
memory usage: 3.4+ MB


Превращаю номинативный столбец с названиями городов в набор столбцов датафрейма с названиями городов. Одновременно в зависимости от города заполняются пропуски в столбцах 'Cuisine_Style', 'Price_Range' модой для города и 'Number_of_Reviews' медианой для города.


In [14]:
# Разворачивание номинативного столбца в набор индексных
city_columns = pd.get_dummies(df['City'])
df = df.join(city_columns).drop('City',axis=1)
df.head(3)

Unnamed: 0,Restaurant_id,Cuisine_Style,Ranking,Rating,Price_Range,Number_of_Reviews,Reviews,Country_Code,Population,Amsterdam,...,Munich,Oporto,Oslo,Paris,Prague,Rome,Stockholm,Vienna,Warsaw,Zurich
0,id_5569,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",FRA,2140526,0,...,0,0,0,1,0,0,0,0,0,0
1,id_1535,['Italian'],1537.0,4.0,$$ - $$$,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",SWE,975551,0,...,0,0,0,0,0,0,1,0,0,0
2,id_352,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",GBR,9126366,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
country_columns = pd.get_dummies(df['Country_Code'])
df = df.join(country_columns).drop('Country_Code',axis=1)
df.head(3)

Unnamed: 0,Restaurant_id,Cuisine_Style,Ranking,Rating,Price_Range,Number_of_Reviews,Reviews,Population,Amsterdam,Athens,...,IRL,ITA,LUX,NLD,NOR,POL,PRT,SVK,SVN,SWE
0,id_5569,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",2140526,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_1535,['Italian'],1537.0,4.0,$$ - $$$,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",975551,0,0,...,0,0,0,0,0,0,0,0,0,1
2,id_352,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",9126366,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df['Log_Population']=np.log10(df.Population)
df = df.drop(['Population'],axis=1)
df.head(3)

Unnamed: 0,Restaurant_id,Cuisine_Style,Ranking,Rating,Price_Range,Number_of_Reviews,Reviews,Amsterdam,Athens,Barcelona,...,ITA,LUX,NLD,NOR,POL,PRT,SVK,SVN,SWE,Log_Population
0,id_5569,"['European', 'French', 'International']",5570.0,3.5,$$ - $$$,194.0,"[['Good food at your doorstep', 'A good hotel ...",0,0,0,...,0,0,0,0,0,0,0,0,0,6.330521
1,id_1535,['Italian'],1537.0,4.0,$$ - $$$,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",0,0,0,...,0,0,0,0,0,0,0,0,1,5.98925
2,id_352,"['Japanese', 'Sushi', 'Asian', 'Grill', 'Veget...",353.0,4.5,$$$$,688.0,"[['Catch up with friends', 'Not exceptional'],...",0,0,0,...,0,0,0,0,0,0,0,0,0,6.960298


In [17]:
# Превращаю поле Cuisine_Style в набор индексов и одну количественную переменную
# Вспомогательная функция для разбора текстового списка
def str2list(s):
    return [ l.strip().strip("'") for l in s.strip('][').split(',') ]
# ожидается громозкая операция. Сколько займет ее исполнение?
was = time.perf_counter()
# Вспомогательный столбец со списком кухонь
df['CStyles'] = df['Cuisine_Style'].apply(str2list)
# СStyles - вспомогательный датафрейм для использования explode
# В одной ячейке может содержаться несколько названий кухонь,  поэтому сначала делаю их список через explode
# а потом добавляю столбцы с названиями и превращаю их в индексы
CStyles = df['Cuisine_Style'].unique()
CStyles = pd.Series(CStyles)
CStyles = CStyles.apply(str2list)
CStyles = CStyles.explode()
# CNames - список кухонь
CNames = CStyles.unique()
# CNames - список названий кухонь, превращаю в набор индексных переменных столбцов
for st in CNames:
    df[st] = df['CStyles'].apply(lambda x: 1 if st in x else 0)
df['Num_Cuisines'] = df['CStyles'].apply(len) # это уже дамми переменная с количеством разных кухонь
print('Разбор занял',time.perf_counter()-was,'sec')

Разбор занял 2.652313497999998 sec


In [19]:
df = df.drop(['CStyles', 'Cuisine_Style'], axis=1)
df = clean_spaces(df) # в названиях кухонь встречаются пробелы
df.head(3)

KeyError: "['CStyles' 'Cuisine_Style'] not found in axis"

In [20]:
testdf=df.drop(['Price_Range','Reviews'],axis=1)
RFR_test(testdf)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s


Длительность: 11.109283132000002 sec
MAE: 0.2118685
MAE2: 0.182


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


In [21]:
testdf = df.drop(['Reviews'],axis=1)
testdf.head(3)

Unnamed: 0,Restaurant_id,Ranking,Rating,Price_Range,Number_of_Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,...,Salvadoran,Yunnan,Native_American,Canadian,Xinjiang,Burmese,Fujian,Welsh,Latvian,Num_Cuisines
0,id_5569,5570.0,3.5,$$ - $$$,194.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1,id_1535,1537.0,4.0,$$ - $$$,10.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,id_352,353.0,4.5,$$$$,688.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7


In [22]:
# Словарик и преобразование диапазона цен в набор индексов
PriceRanges = { '$':'Low', '$$$$': 'High', '$$ - $$$': 'Mid' }
testdf['PRange']= testdf['Price_Range'].apply(lambda c: PriceRanges[c])
pr_columns = pd.get_dummies(testdf['PRange'])
testdf = testdf.join(pr_columns).drop(['Price_Range','PRange'],axis=1)
testdf.head(3)

Unnamed: 0,Restaurant_id,Ranking,Rating,Number_of_Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,Brussels,...,Canadian,Xinjiang,Burmese,Fujian,Welsh,Latvian,Num_Cuisines,High,Low,Mid
0,id_5569,5570.0,3.5,194.0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,0,1
1,id_1535,1537.0,4.0,10.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,id_352,353.0,4.5,688.0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,1,0,0


In [23]:

RFR_test(testdf)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s


Длительность: 10.421321922000004 sec
MAE: 0.211796
MAE2: 0.1833


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


In [24]:
testdf = df.drop(['Reviews'],axis=1)
testdf.head(3)

Unnamed: 0,Restaurant_id,Ranking,Rating,Price_Range,Number_of_Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,...,Salvadoran,Yunnan,Native_American,Canadian,Xinjiang,Burmese,Fujian,Welsh,Latvian,Num_Cuisines
0,id_5569,5570.0,3.5,$$ - $$$,194.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
1,id_1535,1537.0,4.0,$$ - $$$,10.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,id_352,353.0,4.5,$$$$,688.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7


In [25]:
# Словарик и преобразование диапазона цен в набор значений
# Проверка показала что это более эффективное решение, чем one_hot кодирование. Быстрее и ошибки меньше.
PriceRanges = { '$':1.0, '$$$$': 3.0, '$$ - $$$': 2.0 }
df['PRange']= df['Price_Range'].apply(lambda c: PriceRanges[c])
df = df.drop(['Price_Range'],axis=1)
df.head(3)

Unnamed: 0,Restaurant_id,Ranking,Rating,Number_of_Reviews,Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,...,Yunnan,Native_American,Canadian,Xinjiang,Burmese,Fujian,Welsh,Latvian,Num_Cuisines,PRange
0,id_5569,5570.0,3.5,194.0,"[['Good food at your doorstep', 'A good hotel ...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2.0
1,id_1535,1537.0,4.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2.0
2,id_352,353.0,4.5,688.0,"[['Catch up with friends', 'Not exceptional'],...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,3.0


In [26]:
testdf=df.drop(['Reviews'],axis=1)
testdf.head(3)

Unnamed: 0,Restaurant_id,Ranking,Rating,Number_of_Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,Brussels,...,Yunnan,Native_American,Canadian,Xinjiang,Burmese,Fujian,Welsh,Latvian,Num_Cuisines,PRange
0,id_5569,5570.0,3.5,194.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2.0
1,id_1535,1537.0,4.0,10.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,2.0
2,id_352,353.0,4.5,688.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,3.0


In [27]:
RFR_test(testdf)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s


Длительность: 10.706079412999998 sec
MAE: 0.2113025
MAE2: 0.18235


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


Вывод - набор индексных переменных иногда хуже, чем единственная количественная упорядоченная переменная.

In [28]:
def strip(text):
    return text[1:-1]

In [29]:
def firstsplit(text):
    text = strip(text)
    # print(text)
    if '],' in text:
        pos = text.rfind('],')
        return strip(text[:pos+1]), strip(text[pos+3:])
    else:
        return '', ''

In [30]:
def nextsplit(text):
    if '", ' in text:
        pos = text.find('", ')
    elif "', " in text:
        pos = text.find("', ")
    else:
        return strip(text), ''
    return strip(text[:pos+1]), strip(text[pos+3:])

In [31]:
def split_review(text):
    t,d = firstsplit(text)
    t1, t2 = nextsplit(t); d1, d2 = nextsplit(d)
    nr = 2 if len(t1) > 0 and len(t2) > 0 else ( 1 if len(t1) > 0 else 0 )
    return [nr, t1, t2, d1, d2]

In [68]:
# Разбор колонки отзывов в два столбца со списком значений отзывов и
# с количеством отзывов для будущего использоваиния
df['RL'] = df['Reviews'].apply(split_review)
df['Num_Disp_Rev'] = df['RL'].apply(lambda x: x[0])
df.head(3)

Unnamed: 0,Restaurant_id,Ranking,Rating,Number_of_Reviews,Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,...,Num_Cuisines,PRange,Num_Disp_Rev,Review_polarity,Review_polarity_variation,Review_subjectivity,Review_subjectivity_variation,Recent_Review_date,Review_dates_span,RL
0,id_5569,5570.0,3.5,194.0,"[['Good food at your doorstep', 'A good hotel ...",0,0,0,0,0,...,3,2.0,2,0.7,0.0,0.6,0.0,736694.0,41,"[2, Good food at your doorstep, A good hotel r..."
1,id_1535,1537.0,4.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",0,0,0,0,0,...,1,2.0,2,0.6875,0.3125,1.0,0.0,736516.0,382,"[2, Unique cuisine, Delicious Nepalese food, 0..."
2,id_352,353.0,4.5,688.0,"[['Catch up with friends', 'Not exceptional'],...",0,0,0,0,0,...,7,3.0,2,-0.166667,0.166667,0.5,0.5,736702.0,2,"[2, Catch up with friends, Not exceptional, 01..."


In [244]:
# Вывод значений обзора и его даты не нужен для дальнейшего использования.
df['RT1']= df['RL'].apply(lambda x: x[1])     #Текст обзора 1
df['RT2']= df['RL'].apply(lambda x: x[2])     #Текст обзора 2
df['RDT1'] = df['RL'].apply(lambda x: x[3])   #Дата обзора 1
df['RDT2'] = df['RL'].apply(lambda x: x[4])   #Дата обзора 2

df.head(3)

Unnamed: 0,Restaurant_id,Ranking,Rating,Number_of_Reviews,Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,...,Welsh,Latvian,Num_Cuisines,PRange,RL,Num_Disp_Rev,RT1,RT2,RDT1,RDT2
0,id_5569,5570.0,3.5,194.0,"[['Good food at your doorstep', 'A good hotel ...",0,0,0,0,0,...,0,0,3,2.0,"[2, Good food at your doorstep, A good hotel r...",2,Good food at your doorstep,A good hotel restaurant,12/31/2017,11/20/2017
1,id_1535,1537.0,4.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",0,0,0,0,0,...,0,0,1,2.0,"[2, Unique cuisine, Delicious Nepalese food, 0...",2,Unique cuisine,Delicious Nepalese food,07/06/2017,06/19/2016
2,id_352,353.0,4.5,688.0,"[['Catch up with friends', 'Not exceptional'],...",0,0,0,0,0,...,0,0,7,3.0,"[2, Catch up with friends, Not exceptional, 01...",2,Catch up with friends,Not exceptional,01/08/2018,01/06/2018


In [250]:
df = df.drop(['RT1', 'RT2', 'RDT1', 'RDT2'] ,axis=1)
df.head(3)

Unnamed: 0,Restaurant_id,Ranking,Rating,Number_of_Reviews,Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,...,PRange,RL,Num_Disp_Rev,Review_polarity,Review_polarity_variation,Review_subjectivity,Review_subjectivity_variation,RDAL,Recent_Review_date,Review_dates_span
0,id_5569,5570.0,3.5,194.0,"[['Good food at your doorstep', 'A good hotel ...",0,0,0,0,0,...,2.0,"[2, Good food at your doorstep, A good hotel r...",2,0.7,0.0,0.6,0.0,"[736694, 41]",736694,41
1,id_1535,1537.0,4.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",0,0,0,0,0,...,2.0,"[2, Unique cuisine, Delicious Nepalese food, 0...",2,0.6875,0.3125,1.0,0.0,"[736516, 382]",736516,382
2,id_352,353.0,4.5,688.0,"[['Catch up with friends', 'Not exceptional'],...",0,0,0,0,0,...,3.0,"[2, Catch up with friends, Not exceptional, 01...",2,-0.166667,0.166667,0.5,0.5,"[736702, 2]",736702,2


In [33]:
# Анализ текста с целью выясвления отношения и субьективности написавшего.
# Использую усредненные значения по двум отзвам, либо единственное значение по одному, либо все по нулям.
# Еще считается характеристика разброса значений отношения и субьективности - если отзывов два.
def analyse_texts(RevList):
    # RevList[0] - количество опубликованных отзывов
    # RevList[1] - текст отзыва 1
    # RevList[2] - текст отзыва 2
    if RevList[0] == 2:
        tb1 = TextBlob(RevList[1])
        # TextBlob - simples method for sentiment analysis
        p1 = tb1.sentiment.polarity
        s1 = tb1.sentiment.subjectivity
        # The sentiment property returns a namedtuple of the form Sentiment(polarity, subjectivity).
        # The polarity score is a float within the range [-1.0, 1.0].
        # The subjectivity is a float within the range [0.0, 1.0]
        # where 0.0 is very objective and 1.0 is very subjective.
        # print(RevList[1],p1,s1)
        tb2 = TextBlob(RevList[2])
        p2 = tb2.sentiment.polarity
        s2 = tb2.sentiment.subjectivity
        # print(RevList[2],p2,s2)
        p = (p1+p2)/2      #средняя полярность отзывов
        dp = abs(p2-p1)/2  #разброс полярностей отзывов
        s = (s1+s2)/2      #средняя субьективность отзывов
        ds = abs(s2-s1)/2  #разброс субьективностей отзывов
        return [p, dp, s, ds]
    elif RevList[0] == 1:
        tb1 = TextBlob(RevList[1])
        return [tb1.sentiment.polarity, 0.0, tb1.sentiment.subjectivity, 0.0]
    else:
        return[0.0, 0.0, 0.0, 0.0]

In [34]:
df['TAL'] = df['RL'].apply(analyse_texts)
df['Review_polarity'] = df['TAL'].apply(lambda x: x[0])
df['Review_polarity_variation'] = df['TAL'].apply(lambda x: x[1])
df['Review_subjectivity'] = df['TAL'].apply(lambda x: x[2])
df['Review_subjectivity_variation'] = df['TAL'].apply(lambda x: x[3])
df = df.drop(['TAL'], axis=1)
df.head(3)

Unnamed: 0,Restaurant_id,Ranking,Rating,Number_of_Reviews,Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,...,Welsh,Latvian,Num_Cuisines,PRange,RL,Num_Disp_Rev,Review_polarity,Review_polarity_variation,Review_subjectivity,Review_subjectivity_variation
0,id_5569,5570.0,3.5,194.0,"[['Good food at your doorstep', 'A good hotel ...",0,0,0,0,0,...,0,0,3,2.0,"[2, Good food at your doorstep, A good hotel r...",2,0.7,0.0,0.6,0.0
1,id_1535,1537.0,4.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",0,0,0,0,0,...,0,0,1,2.0,"[2, Unique cuisine, Delicious Nepalese food, 0...",2,0.6875,0.3125,1.0,0.0
2,id_352,353.0,4.5,688.0,"[['Catch up with friends', 'Not exceptional'],...",0,0,0,0,0,...,0,0,7,3.0,"[2, Catch up with friends, Not exceptional, 01...",2,-0.166667,0.166667,0.5,0.5


In [75]:
def analyse_review_dates(RevList):
    # RevList[0] - количество опубликованных отзывов
    # RevList[3] - дата отзыва 1
    # RevList[4] - дата отзыва 2
    if RevList[3] == '':
        c1 = 0
    else:
        c1 = datetime.strptime(RevList[3],"%m/%d/%Y").date().toordinal()
    if RevList[4] == '':
        c2 = 0
    else:
        c2 = datetime.strptime(RevList[4],"%m/%d/%Y").date().toordinal()
    return [ (c2 if c2>c1 else c1), (abs(c2-c1) if c2>0 else 0) ]

In [76]:
def check_dates(RevList):
    if RevList[0] == 2:
        if (RevList[3]=='') or (RevList[4]==''):
            print(RevList)
            return 'Err'
        else:
            return 'Ok'
    elif (RevList[0] ==1) and (RevList[3]==''):
        print(RevList)
        return 'Err'
    return 'Ok'

In [89]:
df['RDAL'] = df['RL'].apply(analyse_review_dates)
df['Recent_Review_date'] = df['RDAL'].apply(lambda x: x[0])
df['Review_dates_span'] = df['RDAL'].apply(lambda x: x[1])
df = df.drop(['RDAL'],axis=1)
df.head(5)

Unnamed: 0,Restaurant_id,Ranking,Rating,Number_of_Reviews,Reviews,Amsterdam,Athens,Barcelona,Berlin,Bratislava,...,Num_Cuisines,PRange,Num_Disp_Rev,Review_polarity,Review_polarity_variation,Review_subjectivity,Review_subjectivity_variation,Recent_Review_date,Review_dates_span,RL
0,id_5569,5570.0,3.5,194.0,"[['Good food at your doorstep', 'A good hotel ...",0,0,0,0,0,...,3,2.0,2,0.7,0.0,0.6,0.0,736694,41,"[2, Good food at your doorstep, A good hotel r..."
1,id_1535,1537.0,4.0,10.0,"[['Unique cuisine', 'Delicious Nepalese food']...",0,0,0,0,0,...,1,2.0,2,0.6875,0.3125,1.0,0.0,736516,382,"[2, Unique cuisine, Delicious Nepalese food, 0..."
2,id_352,353.0,4.5,688.0,"[['Catch up with friends', 'Not exceptional'],...",0,0,0,0,0,...,7,3.0,2,-0.166667,0.166667,0.5,0.5,736702,2,"[2, Catch up with friends, Not exceptional, 01..."
3,id_3456,3458.0,5.0,3.0,"[[], []]",0,0,0,1,0,...,1,2.0,0,0.0,0.0,0.0,0.0,0,0,"[0, , , , ]"
4,id_615,621.0,4.0,84.0,"[['Best place to try a Bavarian food', 'Nice b...",0,0,0,0,0,...,3,2.0,2,0.8,0.2,0.65,0.35,736651,272,"[2, Best place to try a Bavarian food, Nice bu..."


In [91]:
RFR_test(df.drop(['Reviews','RL'],axis=1))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.9s


Длительность: 12.796834852999382 sec
MAE: 0.21311950000000002
MAE2: 0.181


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


In [92]:
#Удивительным образом заполнение пропусков даты обзора медианой а не нулем дает чуть лучшее значение.
# А вот нормализация значений заметным образом не сказывается на итоге.
Med_Rec_Rev_Date = df.loc[df.Recent_Review_date>0,'Recent_Review_date'].median()
df['Recent_Review_date'] = df['Recent_Review_date'].apply(lambda x: x if x > 0 else Med_Rec_Rev_Date)

In [93]:
RFR_test(df.drop(['Reviews','RL'],axis=1))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    5.3s


Длительность: 15.310309649000374 sec
MAE: 0.2133255
MAE2: 0.1807


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   15.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


In [100]:
RFR_test(df.drop(['Reviews','RL','Review_polarity_variation', 'Review_subjectivity_variation'],axis=1))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.5s


Длительность: 13.547159116999865 sec
MAE: 0.21265749999999997
MAE2: 0.17995


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [168]:
# Ваш код по очистке данных и генерации новых признаков
# При необходимости добавьте ячейки

## Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [169]:
# Х - данные с информацией о ресторанах, у - целевая переменная (рейтинги ресторанов)
X = df.drop(['Restaurant_id', 'Rating'], axis = 1)
y = df['Rating']

In [170]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [171]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.
# Для тестирования мы будем использовать 25% от исходного датасета.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=17 )

# Создаём, обучаем и тестируем модель

In [172]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [173]:
# время до начала обучения
was = time.perf_counter()
# Создаём модель
regr = RandomForestRegressor(n_estimators=100)

# Обучаем модель на тестовом наборе данных
regr.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = regr.predict(X_test)
print('Обучение заняло',time.perf_counter()-was,'sec')

Обучение заняло 33.36827244999586 sec


In [174]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.214652


In [175]:
y_pred2 = (y_pred*2).round()/2
print('MAE2:', metrics.mean_absolute_error(y_test, y_pred2) )

MAE2: 0.18695
