In [1]:
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
import numpy as np 


In [2]:
# Данные с https://www.researchgate.net/publication/308968574_TripAdvisor_Dataset
df = pd.read_csv('ratingswithcontextandmetadata.csv', sep='|', on_bad_lines='skip')

In [3]:
# В данных нет названий колонок, придумываем временные названия
cols = list(map(lambda x: 'c' + str(x), range(1, df.shape[1]+1)))
df.columns = cols
df.head()

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18
0,"Reviewed February 21, 2010",639E347CEF626E6B021161D0BD969ED2,Hotel_Anewandter-Villa_Ottone_South_Tyrol_Prov...,4 of 5 stars,"“Very nice food, great atmosphere, feels like ...",Martin and his staff are truely great! They ma...,"Stayed February 2010, traveled with family",Value: 5 of 5 stars; Location: 3 of 5 stars; S...,Brussels,Contributor,Villa Ottone,Hotel,Bar / Lounge; Beverage Selection; Business Cen...,$$,,25.0,,
1,Reviewed 4 days ago NEW,189A8701882507E21BA7B9F4A55D2E72,Hotel_Excelsior-San_Vigilio_Marebbe_South_Tyro...,5 of 5 stars,“Best Hotel on the Planet”,We have stayed at the Excelsior on numerous oc...,"Stayed February 2014, traveled with family",Value: 5 of 5 stars; Sleep Quality: 5 of 5 sta...,"Singapore, Singapore",Contributor,San Vigilio,Hotel,Bar / Lounge; Beverage Selection; Fitness Cent...,$$$$,,49.0,Our name says it all.Here on the peaceful outs...,
2,"Reviewed September 17, 2014",990C3D220B651C8309A6CF5E0EBFDC05,Hotel_Excelsior-San_Vigilio_Marebbe_South_Tyro...,5 of 5 stars,“Fantastic”,The great reviews are well deserved. Stayed he...,"Stayed August 2014, traveled as a couple",Location: 5 of 5 stars; Rooms: 5 of 5 stars; S...,"Burlington, Canada",Senior Contributor,San Vigilio,Hotel,Bar / Lounge; Beverage Selection; Fitness Cent...,$$$$,,49.0,Our name says it all.Here on the peaceful outs...,
3,"Reviewed July 27, 2014",C5976008DA681B0AA75B4DE8B73DC5A0,Hotel_Excelsior-San_Vigilio_Marebbe_South_Tyro...,5 of 5 stars,“It is all true!”,Not sure we can add anything to the accurate r...,"Stayed July 2014, traveled as a couple",Rooms: 5 of 5 stars; Cleanliness: 5 of 5 stars...,"Manchester, United Kingdom",Senior Reviewer,San Vigilio,Hotel,Bar / Lounge; Beverage Selection; Fitness Cent...,$$$$,,49.0,Our name says it all.Here on the peaceful outs...,
4,"Reviewed July 23, 2014",033BB72F0C8D163CDB816027D55E16CC,Hotel_Excelsior-San_Vigilio_Marebbe_South_Tyro...,5 of 5 stars,“Beyond our expectations”,Everything was a high level surprise: from the...,"Stayed July 2014, traveled with family",Sleep Quality: 5 of 5 stars; Rooms: 5 of 5 sta...,"Milan, Italy",Contributor,San Vigilio,Hotel,Bar / Lounge; Beverage Selection; Fitness Cent...,$$$$,,49.0,Our name says it all.Here on the peaceful outs...,


In [4]:
# После изучения данных сразу удаляем то, что не понадобится
df.drop(columns=['c1', 'c6', 'c7', 'c8', 'c9', 'c10', 'c13', 'c14', 'c15', 
                 'c16', 'c17'], inplace=True)
df.drop(index=df.index[df['c12'] == 'Activity'], inplace=True)

In [5]:
# Выделяем названия объектов
df['name']= df['c3'].apply(lambda x: x.split('-')[0].replace('_', ' '))
df.drop(columns=['c3'], inplace=True)

In [6]:
# Преобразуем рейтинги в числа
df['rating'] = df['c4'].apply(lambda x: int(x.split()[0]))
df.drop(columns=['c4'], inplace=True)

In [7]:
# Удаляем регионы с <5 отзывов
dft = df.groupby('c11', as_index=False).size()
df = pd.merge(df, dft, on='c11', how='left')
df.drop(index=df.index[df['size'] < 5], inplace=True)
df.drop(columns=['size'], inplace=True)

In [8]:
# Удаляем отзывы пользователей, которые писали только об отелях
dft = df.groupby('c2', as_index=False).size()
df = pd.merge(df, dft, on='c2', how='left')
df.drop(index=df.index[(df['size'] < 2) & (df['c12'] == 'Hotel')], inplace=True)

def filter_hotels(user):
    mask1 = df['c2'] == user
    mask2 = df['c12'] != 'Hotel'
    if df[mask1 & mask2].shape[0] > 0:
        return True
    else:
        return False
df['leave'] = df['c2'].apply(filter_hotels)
df.drop(index=df.index[df['leave'] == False], inplace=True)
df.drop(columns=['size', 'leave'], inplace=True)

In [9]:
# Удаляем записи с пустотами в характеристиках ресторана/достопримечательности
df.drop(index=df.index[df['c18'].isna() & (df['c12'] != 'Hotel')], inplace=True)

In [10]:
# Переименовываем колонки
df = df.rename(columns={'c2': 'user_id', 'c5': 'text', 'c11': 'location',
                         'c12': 'type'}) 

In [11]:
# Собираем информацию о том, сколько отзывов оставили гости отелей о ресторанах \
    # и достопримечательностях
def hotel_users(hotel):
    mask1 = df['name'] == hotel
    mask2 = df['type'] == 'Hotel'
    users = df[mask1 & mask2]['user_id'].unique()
    attractions = sum(list(map(lambda x: df[(df['user_id'] == x) & (df['type'] == 'Attraction')].shape[0], users)))
    restaurants = sum(list(map(lambda x: df[(df['user_id'] == x) & (df['type'] == 'Restaurant')].shape[0], users)))
    return pd.Series([restaurants, attractions])
    
df[['restaurant', 'attraction']] = df['name'].apply(hotel_users)
    

In [12]:
# Оставляем записи только о тех пользователях отелей, которые писали о \
    # ресторанах и достопримечательностях
dfh = df[df['type'] == 'Hotel'].copy()
dfh.drop(columns=['c18'], inplace=True)
dfh.drop(index= dfh.index[(dfh['restaurant'] == 0) | (dfh['attraction'] == 0)], inplace=True)
dfh.drop(columns=['restaurant', 'attraction'], inplace=True)

In [13]:
# Выделяем данные по ресторанам и достопримечательностям
dfr = df[df['type'] != 'Hotel'].copy()
dfr.drop(columns=['restaurant', 'attraction'], inplace=True)

In [14]:
# Выделяем признаки для анализа
new_cols = ['Cuisines', 'Description', 'Dining options', 'Fee',
            'Good for', 'Owner description', 'Price range',
            'Recommended length of visit','Type'
            ]

def one_hot_func(input_str, input_item):
    items = str(input_str).split(';')
    keys = list(map(lambda x: x.split(':')[0].strip(), items))
    values = list(map(lambda x: x.split(':')[1].strip() if len(x.split(':')) > 1 else '', items))
    if input_item in keys:
        ind = keys.index(input_item)
        value = values[ind]
        return value
    else:
        return ''
    
for item in new_cols:
    dfr[item] = dfr['c18'].apply(one_hot_func, input_item=item)


In [15]:
# Дополняем признак Type информацией о ресторанах
mask = dfr['type'] == 'Restaurant'
dfr.loc[mask, 'Type'] = 'Restaurants'

In [16]:
# Формируем новые признаки
categories = set([])

def join_func(input_str, to_set):
    items = str(input_str).split(',')
    to_set.update(items)
    
for item in dfr['Type'].tolist():
    join_func(item, categories)
    
cat_list = list(map(lambda x: x.strip(), categories))
categories = list(set(cat_list))

def one_hot(input_str, input_item):
    if input_item in input_str:
        return 1
    else:
        return 0
    
for item in categories:
    dfr[item] = dfr['Type'].apply(one_hot, input_item=item)

In [17]:
# Удаляем ненужные колонки
dfr.drop(columns=['c18', 'Cuisines', 'Description', 'Dining options', 'Fee',
                  'Good for', 'Owner description', 'Price range',
                  'Recommended length of visit', 'Type'], inplace=True)

In [18]:
# Количество уникальных объектов
dfr['name'].unique().size

558

In [19]:
# Собираем признаки уникальных объектов
unique_names = dfr['name'].unique()
data_dict = {'name': unique_names}

# Рейтинг
def get_rating(name):
    mask = dfr['name'] == name
    return round(dfr.loc[mask, 'rating'].mean(), 2)
# Место
def get_location(name):
    mask = dfr['name'] == name
    return dfr.loc[mask, 'location'].mode()[0]
# Колонки с true/false признаками
def cumulative_column(name, col):
    mask = dfr['name'] == name
    if dfr.loc[mask, col].sum() > 0:
        return 1
    else:
        return 0
# Списки отзывов    
def group_reviews(name):
    mask = dfr['name'] == name
    return dfr.loc[mask, 'text'].tolist()


data_dict['rating'] = list(map(get_rating, unique_names))
data_dict['location'] = list(map(get_location, unique_names))
data = pd.DataFrame(data_dict)   
for cat in categories:
    data[cat] = data['name'].apply(cumulative_column, col=cat)   
data['reviews'] = data['name'].apply(group_reviews)

df_dict = data.to_dict('list')

In [20]:
# Формируем список объектов (через словарь, так быстрее работает)
object_ids = df_dict['name']
ind_o = range(len(object_ids))
object_ids_dict=dict(zip(object_ids, ind_o))

In [21]:
# Формируем список пользователей (через словарь, так быстрее работает)
user_ids = dfr['user_id'].unique().tolist()
ind_u = range(len(user_ids))
user_ids_dict = dict(zip(user_ids, ind_u))

In [22]:
# Формируем матрицу взаимодействий и заполняем её рейтингами
interaction_matrix = csr_matrix((len(user_ids), len(object_ids)),  
                          dtype = np.int8).toarray() 

def record_interactions(user_id):
    mat_row_ind = user_ids_dict[user_id]
    mask = dfr['user_id'] == user_id
    tmp_df = dfr.loc[mask, ['name', 'rating']]
    for name in tmp_df['name'].tolist():
        mat_col_ind = object_ids_dict[name]
        mask2 = dfr['name'] == name
        mat_value = dfr[mask & mask2]['rating'].values[0]
        interaction_matrix[mat_row_ind, mat_col_ind] = mat_value
        
for usr in user_ids:
    record_interactions(usr)
    

In [23]:
# Собираем данные о пользователях, которые писали отзывы о каждой гостинице
hotel_dict = {}
def collect_hotels(name):
    mask = dfh['name'] == name
    return dfh.loc[mask, 'user_id'].tolist()

for hotel in dfh['name'].tolist():
    hotel_dict[hotel] = collect_hotels(hotel)


In [24]:
# Собираем все данные рекомендательной системе в одном словаре
rec = {}
rec['user_ids_dict'] = user_ids_dict
rec['object_ids_dict'] = object_ids_dict
rec['interaction_matrix'] = interaction_matrix
rec['object_metadata'] = df_dict
rec['hotels_users'] = hotel_dict

In [25]:
# Сохраняем данные рекомендательной системы
with open('../front/recommender.pkl', 'wb') as f:
    pickle.dump(rec, f)