In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [260]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
users = pd.read_csv('./data/users.csv')
reviews = pd.read_csv('./data/reviews.csv', low_memory=False)
aspects = pd.read_csv('./data/aspects.csv', low_memory=False)
features = pd.read_csv('./data/features.csv', low_memory=False)
orgs = pd.read_csv('./data/organisations.csv', low_memory=False)
rubrics = pd.read_csv('./data/rubrics.csv', low_memory=False)

* reviews - В этом файле дана информация об отзывах и оценках, оставленных некоторым множеством жителей Москвы и Санкт-Петерубрга в течение обучающего периода
* organisations - Информация об организациях
* users - Информация о городе проживания пользователя
* aspects - Описание извлекаемых из отзывов аспектов. Множество аспектов извлекается из отзыва с помощью NLP-алгоритма и может быть неточным.
* features - Описание особенностей организаций. Как правило, множество особенностей организации заполняется ее владельцем и может быть неточным.
* rubrics - Описание рубрик организаций
* test_users - Множество пользователей, для которых необходимо сделать предсказание

In [3]:
to_list = lambda rubrics: [int(rubric) for rubric in str(rubrics).split(' ')]
def apply_to_columns(df, columns, func=to_list):
    for column in columns:
        df.loc[~df[column].isnull(), column] = df.loc[~df[column].isnull(), column].apply(func)

In [4]:
cols = ['rubrics_id', 'features_id']
apply_to_columns(orgs, cols)
orgs.head()

Unnamed: 0,org_id,city,average_bill,rating,rubrics_id,features_id
0,16848414477362211020,spb,1000.0,4.479702,"[30776, 31375]","[1018, 1509, 11177, 11617, 11629, 11704, 11867..."
1,1430604733320164116,spb,1000.0,4.514509,"[30776, 30770]","[246, 1018, 11617, 11629, 11704, 11867, 20422,..."
2,9880309324224147401,spb,1000.0,3.884615,"[30770, 30774]","[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
3,5617879987171966456,spb,1000.0,,"[30774, 30775]","[1018, 1509, 10596, 11177, 11629, 11634, 11704..."
4,5241461680470612149,spb,1000.0,4.532468,[30776],"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."


In [5]:
reviews = reviews.merge(users, on='user_id')
reviews = reviews.rename({'city': 'user_city'}, axis=1)
reviews = reviews.merge(orgs[['org_id', 'city']], on='org_id')
reviews = reviews.rename({'city': 'org_city'}, axis=1)
reviews

Unnamed: 0,user_id,org_id,rating,ts,aspects,user_city,org_city
0,16998268288908323644,7184895086928047809,2.0,105,,msk,msk
1,3121447338909258868,7184895086928047809,5.0,464,,msk,msk
2,1970649778250883025,7184895086928047809,3.0,789,,msk,msk
3,7554889464530643866,7184895086928047809,4.0,936,,msk,msk
4,15907910894057053620,7184895086928047809,1.0,1143,,msk,msk
...,...,...,...,...,...,...,...
3640830,16504916268155591133,11379950099553543298,1.0,1138,,spb,spb
3640831,6729633349339708345,4127027708972853576,5.0,984,,msk,msk
3640832,12811636719149152603,1870939193149876281,5.0,389,,msk,msk
3640833,16479336894539955366,9457835296761142609,5.0,1068,,msk,msk


In [6]:
columns = ['aspects']
apply_to_columns(reviews, columns)

In [7]:
reviews = reviews.merge(orgs, on='org_id', how='left')\
    .rename(columns={'rating_x': 'rating', 'rating_y':'avg_rating'})\
    .drop(['city'], axis=1)

## Заполним средний рейтинг заведения с помощью оценок поставленными пользователями

## Топ 20 по рейтингу

In [394]:
new_rev = reviews[reviews['ts'] >= 1217 - 321]
#new_rev = reviews

In [395]:
# набор отзывов только от туристов
tourist_reviews = new_rev[new_rev['user_city'] != new_rev['org_city']]

In [396]:
def new_rating(row):
    rating = tourist_reviews[tourist_reviews['org_id'] == row['org_id']]['rating'].mean()
    return rating

In [397]:
new_ratings = pd.DataFrame()
new_ratings['org_id'] = tourist_reviews['org_id'].unique()
new_ratings['new_rat'] = new_ratings.progress_apply(new_rating, axis=1)

100%|█████████████████████████████████████████████████████████████████████████████| 9456/9456 [00:06<00:00, 1533.36it/s]


In [398]:
new_ratings = new_ratings.merge(orgs, on='org_id', how='left')

In [399]:
new_ratings

Unnamed: 0,org_id,new_rat,city,average_bill,rating,rubrics_id,features_id
0,7184895086928047809,4.0,msk,500.0,4.038688,[30771],"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
1,3586531106767589885,4.0,msk,500.0,4.015789,[30771],"[1018, 11177, 11617, 11704, 11867, 20422, 2734..."
2,17298320470833172098,3.6,msk,500.0,4.201081,"[30774, 30771]","[1018, 11177, 11629, 11704, 11867, 20422, 2734..."
3,7724108979321350255,4.0,msk,,4.314879,[30771],
4,11993738663105455885,5.0,spb,,4.078873,"[30774, 30776]","[246, 1018, 1509, 11177, 11617, 11629, 11704, ..."
...,...,...,...,...,...,...,...
9451,10348333762840611082,5.0,msk,,5.000000,[30774],"[1018, 11617, 3501744275]"
9452,7922808237733169753,5.0,msk,,5.000000,[30519],
9453,17547747826502012687,5.0,spb,,,[30519],
9454,5066641126417821568,5.0,msk,500.0,4.000000,[3501750896],"[11741, 11839, 20422, 3501744275, 3501754799, ..."


In [400]:
msk_orgs = new_ratings[new_ratings['city'] == 'msk']
msk_orgs = msk_orgs.sort_values(by=['new_rat'], ascending=False)['org_id'][:20].to_list()

spb_orgs = new_ratings[new_ratings['city'] == 'spb']
spb_orgs = spb_orgs.sort_values(by=['new_rat'], ascending=False)['org_id'][:20].to_list()

In [401]:
msk_orgs = str(' '.join(map(str, msk_orgs)))
spb_orgs = str(' '.join(map(str, spb_orgs)))

test_users = pd.read_csv('data/test_users.csv')
test_users['city'] = test_users.merge(users, on='user_id')['city']

choose = lambda x: spb_orgs if x['city'] == 'msk' else msk_orgs
target = test_users.apply(choose, axis=1)

predictions = test_users[['user_id']]
predictions['target'] = target

predictions

Unnamed: 0,user_id,target
0,3545210947248911048,11993738663105455885 6850408702453191431 16276...
1,15271987121288045390,11993738663105455885 6850408702453191431 16276...
2,15016858616184265932,11993738663105455885 6850408702453191431 16276...
3,12457244142928722989,11993738663105455885 6850408702453191431 16276...
4,13339684649926251468,7654933538845556915 9426893302961976671 939632...
...,...,...
16962,1191875913294598364,11993738663105455885 6850408702453191431 16276...
16963,3866507700167344338,11993738663105455885 6850408702453191431 16276...
16964,11434952144484188987,11993738663105455885 6850408702453191431 16276...
16965,7010426792722803474,7654933538845556915 9426893302961976671 939632...


In [402]:
predictions.to_csv('top_rating.csv', index=False)

#### 321 дней - оценка 0.01
#### все 1217 дней - оценка 0.01
#### Предлагать заведения по рейтингу не лучшая идея

## Топ 20 популярных

In [281]:
# лучший результат ts_rev[(ts_rev['rating'] > 3.5) & (ts_rev['avg_rating'] >= 3.5)]
# ts_rev = reviews[reviews['ts'] >= 1217 - 321]
ts_rev = reviews[reviews['ts'] >= 1217 - 321]
valid_reviews = ts_rev[(ts_rev['rating'] > 4.0) & (ts_rev['avg_rating'] > 3.3)]

In [282]:
tourist_reviews = valid_reviews[valid_reviews['user_city'] != valid_reviews['org_city']]

In [283]:
tourist_reviews

Unnamed: 0,user_id,org_id,rating,ts,aspects,user_city,org_city,average_bill,avg_rating,rubrics_id,features_id
955,5107236457896918760,17298320470833172098,5.0,994,,spb,msk,500.0,4.201081,"[30774, 30771]","[1018, 11177, 11629, 11704, 11867, 20422, 2734..."
2956,17294933385885458939,11993738663105455885,5.0,966,,msk,spb,,4.078873,"[30774, 30776]","[246, 1018, 1509, 11177, 11617, 11629, 11704, ..."
3130,15496676827602359704,13351482607452884539,5.0,982,,msk,spb,1500.0,4.469604,"[30774, 30776]","[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
3133,1888400425248850783,13351482607452884539,5.0,1053,,msk,spb,1500.0,4.469604,"[30774, 30776]","[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
3134,888845741213298786,13351482607452884539,5.0,945,[247],msk,spb,1500.0,4.469604,"[30774, 30776]","[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
...,...,...,...,...,...,...,...,...,...,...,...
3640400,12892510069730104362,6265546713202831245,5.0,1035,,spb,msk,1000.0,4.636364,"[30774, 31401, 30776]","[1018, 11629, 11704, 11867, 20422, 273469383]"
3640434,5918627482974993111,5866496663401226334,5.0,944,,spb,msk,,5.000000,[30774],
3640445,9132217621649323137,10348333762840611082,5.0,1091,,spb,msk,,5.000000,[30774],"[1018, 11617, 3501744275]"
3640481,1333019719573909064,7922808237733169753,5.0,1182,,spb,msk,,5.000000,[30519],


In [284]:
# выбираем самые популярные места среди туристов из Москвы и Питера
msk_orgs_popul = tourist_reviews[tourist_reviews['org_city'] == 'msk']['org_id']
msk_orgs_popul = msk_orgs_popul.value_counts().index.to_list()

spb_orgs_popul = tourist_reviews[tourist_reviews['org_city'] == 'spb']['org_id']
spb_orgs_popul = spb_orgs_popul.value_counts().index.to_list()

In [285]:
def get_recommendation_by_popul(row):
    orgs = list()
    if row['city'] == 'msk':
        for i in spb_orgs_popul:
            if len(orgs) == 20:
                return str(' '.join(map(str, orgs)))
            if (i not in orgs) & (tourist_reviews[(tourist_reviews['user_id'] == row['user_id']) & 
                                                  (tourist_reviews['org_id'] == i)].empty):
                orgs.append(i)
    else:
        for i in msk_orgs_popul:
            if len(orgs) == 20:
                return str(' '.join(map(str, orgs)))
            if (i not in orgs) & (tourist_reviews[tourist_reviews['user_id'] == row['user_id']& 
                                                  (tourist_reviews['org_id'] == i)].empty):
                orgs.append(i)
    return str(' '.join(map(str, orgs)))

In [286]:
#msk_orgs_popul = str(' '.join(map(str, msk_orgs_popul)))
#spb_orgs_popul = str(' '.join(map(str, spb_orgs_popul)))

test_users = pd.read_csv('data/test_users.csv')
test_users['city'] = test_users.merge(users, on='user_id')['city']
test_users['target'] = test_users.parallel_apply(get_recommendation_by_popul, axis=1)

predictions = test_users[['user_id', 'target']]

predictions

Unnamed: 0,user_id,target
0,3545210947248911048,12046097390037935713 2070377783033138991 68382...
1,15271987121288045390,12046097390037935713 2070377783033138991 68382...
2,15016858616184265932,12046097390037935713 2070377783033138991 68382...
3,12457244142928722989,12046097390037935713 2070377783033138991 68382...
4,13339684649926251468,15250345250621165867 15684663803879321952 9104...
...,...,...
16962,1191875913294598364,12046097390037935713 2070377783033138991 68382...
16963,3866507700167344338,12046097390037935713 2070377783033138991 68382...
16964,11434952144484188987,12046097390037935713 2070377783033138991 68382...
16965,7010426792722803474,15250345250621165867 15684663803879321952 9104...


In [287]:
predictions.to_csv('top_popul.csv', index=False)

## 1-19, 20-21 = 5.19
## 1-15, 20-25 = 5.16
## 1-10, 15-25 = 5.15
## 1-5, 10-25 = 4.86


## Топ рубрик по популярности

In [411]:
ts_rev = reviews[reviews['ts'] > 1217 - 1217]
valid_reviews = ts_rev[(ts_rev['rating'] > 3.5) & (ts_rev['avg_rating'] > 3.5)]

In [412]:
valid_reviews.head()

Unnamed: 0,user_id,org_id,rating,ts,aspects,user_city,org_city,average_bill,avg_rating,rubrics_id,features_id
1,3121447338909258868,7184895086928047809,5.0,464,,msk,msk,500.0,4.038688,[30771],"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
3,7554889464530643866,7184895086928047809,4.0,936,,msk,msk,500.0,4.038688,[30771],"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
6,6883287208417294150,7184895086928047809,5.0,671,,msk,msk,500.0,4.038688,[30771],"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
7,1346687288170684286,7184895086928047809,5.0,311,,msk,msk,500.0,4.038688,[30771],"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
8,6798903121839364544,7184895086928047809,4.0,562,,msk,msk,500.0,4.038688,[30771],"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."


In [413]:
tourist_reviews = valid_reviews[valid_reviews['user_city'] != valid_reviews['org_city']]

In [162]:
def extract_top_by_rubrics(reviews, N):
    '''
    extract_top_by_rubrics(reviews, N)
        Набирает самые популярные организации по рубрикам, сохраняя распределение.
        
        Parameters
        ----------
        reviews : pd.DataFrame
            Отзывы пользователей для рекомендации.
            
        N : int
            Число рекомендаций.
        
        Returns
        -------
        orgs_list : list
            Список отобранных организаций.
    '''
    
    # извлечение популярных рубрик
    #reviews = reviews.merge(orgs, on='org_id')[['org_id', 'rubrics_id']]
    
    rubrics = reviews.explode('rubrics_id').groupby('rubrics_id').size()
    rubrics = (rubrics / rubrics.sum() * N).apply(round).sort_values(ascending=False)

    # вывод списка рубрик по убыванию популярности
#     print(
#         pd.read_csv('data/rubrics.csv')
#         .merge(rubrics.reset_index(), left_index=True, right_on='rubrics_id')
#         .sort_values(by=0, ascending=False)[['rubric_id', 0]]
#     )
    
    # извлечение популярных организаций
    train_orgs = reviews.groupby('org_id').size().reset_index(name='count').merge(orgs, on='org_id')
    train_orgs = train_orgs[['org_id', 'count', 'rubrics_id']]

    most_popular_rubric = lambda rubrics_id: max(rubrics_id, key=lambda rubric_id: rubrics[rubric_id])
    train_orgs['rubrics_id'] = train_orgs['rubrics_id'].apply(most_popular_rubric)
    
    orgs_by_rubrics = train_orgs.sort_values(by='count', ascending=False).groupby('rubrics_id')['org_id'].apply(list)
    
    # соберём самые популярные организации в рубриках в один список
    
    orgs_list = []

    for rubric_id, count in zip(rubrics.index, rubrics):
        if rubric_id not in orgs_by_rubrics:
            continue

        orgs_list.extend(orgs_by_rubrics[rubric_id][:count])
    
    return orgs_list

In [163]:
msk_rubrics = extract_top_by_rubrics(valid_reviews[valid_reviews['user_city'] == 'msk'], 20)
spb_rubrics = extract_top_by_rubrics(valid_reviews[valid_reviews['user_city'] == 'spb'], 20)

In [164]:
test = pd.read_csv('data/test_users.csv')
test['city'] = test.merge(users, on='user_id')['city']

In [165]:
msk_rubrics = str(' '.join(map(str, msk_rubrics)))
spb_rubrics = str(' '.join(map(str, spb_rubrics)))

test_users = pd.read_csv('data/test_users.csv')
test_users['city'] = test_users.merge(users, on='user_id')['city']

choose = lambda x: spb_rubrics if x['city'] == 'msk' else msk_rubrics
target = test_users.apply(choose, axis=1)

predictions = test_users[['user_id']]
predictions['target'] = target

predictions.head(5)

Unnamed: 0,user_id,target
0,3545210947248911048,12046097390037935713 5710441047385192800 29567...
1,15271987121288045390,12046097390037935713 5710441047385192800 29567...
2,15016858616184265932,12046097390037935713 5710441047385192800 29567...
3,12457244142928722989,12046097390037935713 5710441047385192800 29567...
4,13339684649926251468,15250345250621165867 1703593138705417941 12046...


In [166]:
predictions.to_csv('top_rubrics.csv', index=False)

## Популярные заведения по рубрикам у туристов

In [8]:
reviews_rub_exp = reviews.explode('rubrics_id').reset_index(drop=True)

In [9]:
reviews_rub_exp.head()

Unnamed: 0,user_id,org_id,rating,ts,aspects,user_city,org_city,average_bill,avg_rating,rubrics_id,features_id
0,16998268288908323644,7184895086928047809,2.0,105,,msk,msk,500.0,4.038688,30771,"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
1,3121447338909258868,7184895086928047809,5.0,464,,msk,msk,500.0,4.038688,30771,"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
2,1970649778250883025,7184895086928047809,3.0,789,,msk,msk,500.0,4.038688,30771,"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
3,7554889464530643866,7184895086928047809,4.0,936,,msk,msk,500.0,4.038688,30771,"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
4,15907910894057053620,7184895086928047809,1.0,1143,,msk,msk,500.0,4.038688,30771,"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."


In [10]:
tourist_reviews = reviews_rub_exp[(reviews_rub_exp['org_city'] != reviews_rub_exp['user_city']) & 
                                  (reviews_rub_exp['rating'] > 3.5)]

In [11]:
tourist_reviews.head()

Unnamed: 0,user_id,org_id,rating,ts,aspects,user_city,org_city,average_bill,avg_rating,rubrics_id,features_id
337,6777707737383161518,7184895086928047809,4.0,1064,,spb,msk,500.0,4.038688,30771,"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
408,16897840124443804835,7184895086928047809,5.0,238,,spb,msk,500.0,4.038688,30771,"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
481,6643037495508673176,7184895086928047809,4.0,643,,spb,msk,500.0,4.038688,30771,"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
595,15204947921117401344,7184895086928047809,5.0,450,,spb,msk,500.0,4.038688,30771,"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
649,5220998832496570562,3586531106767589885,4.0,1206,,spb,msk,500.0,4.015789,30771,"[1018, 11177, 11617, 11704, 11867, 20422, 2734..."


In [12]:
def popul_orgs_by_rubric(row, **kwargs):
    org = tourist_reviews[(tourist_reviews['rubrics_id'] == row['rubric_id']) & 
            (tourist_reviews['org_city'] == kwargs['city'])]['org_id'].value_counts()
    if org.empty:
        return 0
    else:
        return org.index[0]

In [13]:
# популярные у питербуржцев московские заведения по рубрикам
rubrics['msk_popul'] = rubrics.progress_apply(popul_orgs_by_rubric, city='msk', axis=1)
# популярные у москвичей питерские заведения по рубрикам
rubrics['spb_popul'] = rubrics.progress_apply(popul_orgs_by_rubric, city='spb', axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 32.26it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 32.97it/s]


In [14]:
rubrics

Unnamed: 0,rubric_id,rubric_name,msk_popul,spb_popul
0,30519,"Булочная, пекарня",1234262776875653988,8216081591569833367
1,30770,"Бар, паб",1852569111777211913,18082340557576449859
2,30771,Быстрое питание,1121107137659123731,11418579788008867475
3,30774,Кафе,15250345250621165867,2070377783033138991
4,30775,Пиццерия,2570579361797950626,2364294527108415395
5,30776,Ресторан,15250345250621165867,12046097390037935713
6,30777,Столовая,6031819690910058773,3136373007305862292
7,31286,Спортбар,12767429980258144066,12061949658169072132
8,31350,Кондитерская,480638827201745698,14814427257061788801
9,31375,Суши-бар,12995304735213142696,826906323277325942


### Узнать любимую рубрику пользователя -> 
### Добавить заведение ->
### Оставшиеся 19 позици заполнить популярными

#### Любимая рубрика по не родному городу

In [91]:
test_users = pd.read_csv('data/test_users.csv')
test_users['city'] = test_users.merge(users, on='user_id')['city']

In [92]:
tourist_reviews = reviews_rub_exp[((reviews_rub_exp['org_city'] != reviews_rub_exp['user_city']) & 
                                  (reviews_rub_exp['rating'] > 3.5)) & (reviews_rub_exp['avg_rating'] > 3.5)]
#tourist_reviews = tourist_reviews[tourist_reviews['ts'] > 1217 - 321]

In [93]:
def fav_rubric(row):
    rubric = tourist_reviews[tourist_reviews['user_id'] == row['user_id']]['rubrics_id'].value_counts()
    if rubric.empty:
        return None
    else:
        return list(rubric.index[:3])

In [94]:
test_users['fav_rubric_tourist'] = test_users.progress_apply(fav_rubric, axis=1)

100%|████████████████████████████████████████████████████████████████████████████| 16967/16967 [00:16<00:00, 999.00it/s]


In [95]:
test_users['fav_rubric_tourist'].isna().value_counts()

True     12175
False     4792
Name: fav_rubric_tourist, dtype: int64

In [96]:
test_users

Unnamed: 0,user_id,city,fav_rubric_tourist
0,3545210947248911048,msk,
1,15271987121288045390,msk,
2,15016858616184265932,msk,
3,12457244142928722989,msk,
4,13339684649926251468,spb,"[30776, 30774, 30519]"
...,...,...,...
16962,1191875913294598364,msk,
16963,3866507700167344338,msk,
16964,11434952144484188987,msk,
16965,7010426792722803474,spb,


#### Составление рекомендации

In [156]:
ts_rev = reviews[reviews['ts'] > 1217 - 321]
valid_reviews = ts_rev[(ts_rev['rating'] > 3.5) & (ts_rev['avg_rating'] > 3.5)]

In [157]:
tourist_reviews = valid_reviews[valid_reviews['user_city'] != valid_reviews['org_city']]

In [158]:
msk_orgs_popul = tourist_reviews[tourist_reviews['org_city'] == 'msk']['org_id']
msk_orgs_popul = msk_orgs_popul.value_counts().index[:30].to_list()

spb_orgs_popul = tourist_reviews[tourist_reviews['org_city'] == 'spb']['org_id']
spb_orgs_popul = spb_orgs_popul.value_counts().index[:30].to_list()

#msk_orgs_popul = list(map(str, msk_orgs_popul))
#spb_orgs_popul = list(map(str, spb_orgs_popul))

In [159]:
test_users

Unnamed: 0,user_id,city,fav_rubric_tourist,target
0,3545210947248911048,msk,,12046097390037935713 6838233943148091808 20703...
1,15271987121288045390,msk,,12046097390037935713 6838233943148091808 20703...
2,15016858616184265932,msk,,12046097390037935713 6838233943148091808 20703...
3,12457244142928722989,msk,,12046097390037935713 6838233943148091808 20703...
4,13339684649926251468,spb,"[30776, 30774, 30519]",15250345250621165867 1234262776875653988 15250...
...,...,...,...,...
16962,1191875913294598364,msk,,12046097390037935713 6838233943148091808 20703...
16963,3866507700167344338,msk,,12046097390037935713 6838233943148091808 20703...
16964,11434952144484188987,msk,,12046097390037935713 6838233943148091808 20703...
16965,7010426792722803474,spb,,15250345250621165867 15684663803879321952 9104...


In [160]:
rubrics

Unnamed: 0,rubric_id,rubric_name,msk_popul,spb_popul
0,30519,"Булочная, пекарня",1234262776875653988,8216081591569833367
1,30770,"Бар, паб",1852569111777211913,18082340557576449859
2,30771,Быстрое питание,1121107137659123731,11418579788008867475
3,30774,Кафе,15250345250621165867,2070377783033138991
4,30775,Пиццерия,2570579361797950626,2364294527108415395
5,30776,Ресторан,15250345250621165867,12046097390037935713
6,30777,Столовая,6031819690910058773,3136373007305862292
7,31286,Спортбар,12767429980258144066,12061949658169072132
8,31350,Кондитерская,480638827201745698,14814427257061788801
9,31375,Суши-бар,12995304735213142696,826906323277325942


In [162]:
def get_recommendation_by_fav_rubric(row):
    orgs = list()
    if row['city'] == 'msk':
        if type(row['fav_rubric_tourist']) != list:
            for i in spb_orgs_popul:
                if len(orgs) == 20:
                    return str(' '.join(map(str, orgs)))
                if (i not in orgs) & (tourist_reviews[tourist_reviews['user_id'] == int(i)].empty):
                    orgs.append(i)
            return str(' '.join(map(str, orgs)))
        for i in row['fav_rubric_tourist']:
            if (rubrics[rubrics['rubric_id'] == i]['spb_popul'].values[0] != 0) & (rubrics[rubrics['rubric_id'] == i]['spb_popul'].values[0] not in orgs):
                orgs.append(rubrics[rubrics['rubric_id'] == i]['spb_popul'].values[0])
        for i in spb_orgs_popul:
            if len(orgs) == 20:
                return str(' '.join(map(str, orgs)))
            if (i not in orgs) & (tourist_reviews[tourist_reviews['user_id'] == int(i)].empty):
                orgs.append(i)
    else:
        if type(row['fav_rubric_tourist']) != list:
            for i in msk_orgs_popul:
                if len(orgs) == 20:
                    return str(' '.join(map(str, orgs)))
                if (i not in orgs) & (tourist_reviews[tourist_reviews['user_id'] == int(i)].empty):
                    orgs.append(i)
            return str(' '.join(map(str, orgs)))
        for i in row['fav_rubric_tourist']:
            if (rubrics[rubrics['rubric_id'] == i]['msk_popul'].values[0] != 0) & (rubrics[rubrics['rubric_id'] == i]['msk_popul'].values[0] not in orgs):
                orgs.append(rubrics[rubrics['rubric_id'] == i]['msk_popul'].values[0])
        for i in msk_orgs_popul:
            if len(orgs) == 20:
                return str(' '.join(map(str, orgs)))
            if (i not in orgs) & (tourist_reviews[tourist_reviews['user_id'] == int(i)].empty):
                orgs.append(i)
    return str(' '.join(map(str, orgs)))

In [163]:
test_users['target'] = test_users.progress_apply(get_recommendation_by_fav_rubric, axis=1)

100%|████████████████████████████████████████████████████████████████████████████| 16967/16967 [02:16<00:00, 124.49it/s]


In [164]:
test_users

Unnamed: 0,user_id,city,fav_rubric_tourist,target
0,3545210947248911048,msk,,12046097390037935713 6838233943148091808 20703...
1,15271987121288045390,msk,,12046097390037935713 6838233943148091808 20703...
2,15016858616184265932,msk,,12046097390037935713 6838233943148091808 20703...
3,12457244142928722989,msk,,12046097390037935713 6838233943148091808 20703...
4,13339684649926251468,spb,"[30776, 30774, 30519]",15250345250621165867 1234262776875653988 15684...
...,...,...,...,...
16962,1191875913294598364,msk,,12046097390037935713 6838233943148091808 20703...
16963,3866507700167344338,msk,,12046097390037935713 6838233943148091808 20703...
16964,11434952144484188987,msk,,12046097390037935713 6838233943148091808 20703...
16965,7010426792722803474,spb,,15250345250621165867 15684663803879321952 9104...


In [165]:
prediction = test_users[['user_id', 'target']]

In [166]:
prediction

Unnamed: 0,user_id,target
0,3545210947248911048,12046097390037935713 6838233943148091808 20703...
1,15271987121288045390,12046097390037935713 6838233943148091808 20703...
2,15016858616184265932,12046097390037935713 6838233943148091808 20703...
3,12457244142928722989,12046097390037935713 6838233943148091808 20703...
4,13339684649926251468,15250345250621165867 1234262776875653988 15684...
...,...,...
16962,1191875913294598364,12046097390037935713 6838233943148091808 20703...
16963,3866507700167344338,12046097390037935713 6838233943148091808 20703...
16964,11434952144484188987,12046097390037935713 6838233943148091808 20703...
16965,7010426792722803474,15250345250621165867 15684663803879321952 9104...


In [167]:
prediction.to_csv('fav_rubric_plus_popul.csv', index=False)

### Добавить каждому по одному заведению из популярных по рубрике - оценка 5.16
### Добавить только питербуржцам - оценка 5.16
### Добавить только москвичам - оценка 5.19
### Добавить по заведению с трех любимых рубрик - оценка 5.16

## Связь между понравившимися заведениями

In [148]:
test = pd.read_csv('data/test_users.csv')
test['city'] = test.merge(users, on='user_id')['city']

In [22]:
ts_rev = reviews[reviews['ts'] > 1217 - 321]
valid_reviews = ts_rev[(ts_rev['rating'] > 3.5) & (ts_rev['avg_rating'] > 3.5)]

In [149]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [253]:
def get_recommendation_by_relation(row):
    if row['city'] == 'msk':
        
                                                                                         """
    берем список московских организаций которым пользователь ставил оценку 4 или выше"""
        
        msk_orgs = list(valid_reviews.loc[(valid_reviews['user_id'] == row['user_id']) & (valid_reviews['org_city'] == 'msk'), 'org_id'])
        
                                                                                                """
    берем список пользователей из питера которые ставили оценку 4 или выше этим организациям"""

        spb_users = list(valid_reviews.loc[(valid_reviews['org_id'].isin(msk_orgs)) & (valid_reviews['user_city'] == 'spb'), 'user_id'])
       
                                                                                                                          """
    составляем список топ 20 популярных питерских орагнизаций которым пользователи из питера ставили оценку 4 или выше"""
        
        orgs = valid_reviews.loc[(valid_reviews['user_id'].isin(spb_users)) & (valid_reviews['org_city'] == 'spb'), 'org_id'].value_counts().index[:20].to_list()
    else:
        spb_orgs = list(valid_reviews.loc[(valid_reviews['user_id'] == row['user_id']) & (valid_reviews['org_city'] == 'spb'), 'org_id'])
        msk_users = list(valid_reviews.loc[(valid_reviews['org_id'].isin(spb_orgs)) & (valid_reviews['user_city'] == 'msk'), 'user_id'])
        orgs = valid_reviews.loc[(valid_reviews['user_id'].isin(msk_users)) & (valid_reviews['org_city'] == 'msk'), 'org_id'].value_counts().index[:20].to_list()
    return orgs

In [27]:
test['orgs'] = test.parallel_apply(get_recommendation_by_relation, axis=1)

In [28]:
test.to_csv('almost_answer_w_ts.csv', index=False)

In [29]:
test = pd.read_csv('almost_answer_w_ts.csv')

In [31]:
test['orgs'] = test['orgs'].apply(lambda x: None if x[1:-1] == '' else x[1:-1].split(', ')[:1])

In [32]:
test

Unnamed: 0,user_id,city,orgs
0,3545210947248911048,msk,[2116549028631632220]
1,15271987121288045390,msk,[12046097390037935713]
2,15016858616184265932,msk,
3,12457244142928722989,msk,
4,13339684649926251468,spb,[1625971115460696067]
...,...,...,...
16962,1191875913294598364,msk,
16963,3866507700167344338,msk,
16964,11434952144484188987,msk,
16965,7010426792722803474,spb,[16785419493666881395]


In [33]:
msk_orgs_popul = tourist_reviews[tourist_reviews['org_city'] == 'msk']['org_id']
msk_orgs_popul = msk_orgs_popul.value_counts().index.to_list()

spb_orgs_popul = tourist_reviews[tourist_reviews['org_city'] == 'spb']['org_id']
spb_orgs_popul = spb_orgs_popul.value_counts().index.to_list()

msk_orgs_popul = list(map(str, msk_orgs_popul))
spb_orgs_popul = list(map(str, spb_orgs_popul))

In [300]:
def fill_empty(row):
    orgs = row['orgs']
    if row['city'] == 'msk':
        if orgs is None:
            return str(' '.join(map(str, spb_orgs_popul[:20])))
        for i in spb_orgs_popul:
            if len(orgs) == 20:
                return str(' '.join(map(str, orgs)))
            if i not in orgs:
                orgs.append(i)
    else:
        if orgs is None:
            return str(' '.join(map(str, msk_orgs_popul[:20])))
        for i in msk_orgs_popul:
            if len(orgs) == 20:
                return str(' '.join(map(str, orgs)))
            if i not in orgs:
                orgs.append(i)
    return str(' '.join(map(str, orgs)))

In [35]:
test['target'] = test.progress_apply(fill_empty, axis=1)

100%|██████████████████████████████████████████████████████████████████████████| 16967/16967 [00:00<00:00, 36390.67it/s]


In [36]:
test = test[['user_id', 'target']]

In [37]:
test

Unnamed: 0,user_id,target
0,3545210947248911048,2116549028631632220 12046097390037935713 68382...
1,15271987121288045390,12046097390037935713 6838233943148091808 20703...
2,15016858616184265932,12046097390037935713 6838233943148091808 20703...
3,12457244142928722989,12046097390037935713 6838233943148091808 20703...
4,13339684649926251468,1625971115460696067 15250345250621165867 15684...
...,...,...
16962,1191875913294598364,12046097390037935713 6838233943148091808 20703...
16963,3866507700167344338,12046097390037935713 6838233943148091808 20703...
16964,11434952144484188987,12046097390037935713 6838233943148091808 20703...
16965,7010426792722803474,16785419493666881395 15250345250621165867 1568...


In [38]:
test.to_csv('answers.csv', index=False)

### Связь между понравившимися заведениями оценка без ts - 3.02
### Связь между понравившимися заведениями оценка с ts, топ 5 - 4.09
### Связь между понравившимися заведениями оценка с ts, топ 3 - 4.2
### Связь между понравившимися заведениями оценка с ts, топ 1 - 4.45

### С заполнением среднего рейтинга оценка - 4.76
### Без заполнения оценка - 4.81
### Выборка за последние 107 дней - оценка 5.02
### Выборка за последние 214 дней - оценка 5.17
### Выборка за последние 321 дня - оценка 5.18

# Финальное решение

In [517]:
# лучший результат
# ts_rev = reviews[reviews['ts'] >= 1217 - 321]
# valid_reviews = ts_rev[ts_rev['rating'] > 4.0]
ts_rev = reviews[reviews['ts'] > 1217 - 321]
valid_reviews = ts_rev[ts_rev['rating'] > 4.0]

In [518]:
tourist_reviews = valid_reviews[valid_reviews['user_city'] != valid_reviews['org_city']]

In [519]:
df_to_check = reviews[reviews['org_city'] != reviews['user_city']]

In [520]:
tourist_reviews.isna().sum()

user_id             0
org_id              0
rating              0
ts                  0
aspects         28652
user_city           0
org_city            0
average_bill     9988
avg_rating        515
rubrics_id          0
features_id       708
dtype: int64

In [521]:
# выбираем самые популярные места среди туристов из Москвы и Питера
msk_orgs_popul = tourist_reviews[tourist_reviews['org_city'] == 'msk']['org_id']
msk_orgs_popul = msk_orgs_popul.value_counts().index.to_list()

spb_orgs_popul = tourist_reviews[tourist_reviews['org_city'] == 'spb']['org_id']
spb_orgs_popul = spb_orgs_popul.value_counts().index.to_list()

In [522]:
def get_recommendation_by_popul(row):
    orgs = list()
    if row['city'] == 'msk':
        for i in spb_orgs_popul:
            if len(orgs) == 20:
                return str(' '.join(map(str, orgs)))
            if (i not in orgs) & (df_to_check[(df_to_check['user_id'] == row['user_id']) & 
                                                  (df_to_check['org_id'] == i)].empty):
                orgs.append(i)
    else:
        for i in msk_orgs_popul:
            if len(orgs) == 20:
                return str(' '.join(map(str, orgs)))
            if (i not in orgs) & (df_to_check[(df_to_check['user_id'] == row['user_id']) & 
                                                  (df_to_check['org_id'] == i)].empty):
                orgs.append(i)
    return str(' '.join(map(str, orgs)))

In [523]:
#msk_orgs_popul = str(' '.join(map(str, msk_orgs_popul)))
#spb_orgs_popul = str(' '.join(map(str, spb_orgs_popul)))

test_users = pd.read_csv('data/test_users.csv')
test_users['city'] = test_users.merge(users, on='user_id')['city']
test_users['target'] = test_users.parallel_apply(get_recommendation_by_popul, axis=1)

predictions = test_users[['user_id', 'target']]

predictions

Unnamed: 0,user_id,target
0,3545210947248911048,12046097390037935713 2070377783033138991 68382...
1,15271987121288045390,12046097390037935713 2070377783033138991 68382...
2,15016858616184265932,12046097390037935713 2070377783033138991 68382...
3,12457244142928722989,12046097390037935713 2070377783033138991 68382...
4,13339684649926251468,15250345250621165867 15684663803879321952 9104...
...,...,...
16962,1191875913294598364,12046097390037935713 2070377783033138991 68382...
16963,3866507700167344338,12046097390037935713 2070377783033138991 68382...
16964,11434952144484188987,12046097390037935713 2070377783033138991 68382...
16965,7010426792722803474,15250345250621165867 15684663803879321952 9104...


In [524]:
predictions.to_csv('final_submission.csv', index=False)