In [1]:
import MySQLdb
import pandas as pd 
import numpy as np

# Load venues with status 'active', 'active_without_sea' or 'scrapped' from
# the local MySQL database into a DataFrame.
conn = MySQLdb.connect(host="127.0.0.1", user="root", db="privateaser_dev")

try:
    SQL_Query = pd.read_sql_query("SELECT id, nom, code_postal, quartier, description, infos_ambiance, universe, rental_room_category from etablissements WHERE etablissements.status IN ('active', 'active_without_sea', 'scrapped')", conn)
finally:
    # Release the connection even when the query raises, so a failed cell
    # does not leave an open MySQL session behind.
    conn.close()

# Keep only the columns listed below. `universe` is fetched by the query but
# is not part of this list, so it is absent from df.
# NOTE(review): confirm whether `universe` was meant to be kept.
df = pd.DataFrame(SQL_Query, columns=['id', 'nom', 'code_postal', 'quartier', 'description', 'infos_ambiance', 'rental_room_category'])

print(df.head(3))



   id                    nom  code_postal        quartier  \
0   3             L'Antidote        75005  Quartier Latin   
1   6                L'Adada        75014    Montparnasse   
2   8  La Mercerie Oberkampf        75011       Oberkampf   

                                         description  \
0  Localisé dans le [quartier Latin](https://www....   
1  Attention, pas d'arrivée après 19h30 dans cet ...   
2  Situé dans la trépidante [Rue Oberkampf](https...   

                                      infos_ambiance rental_room_category  
0  Rock à l'étage, electro, funk, house et éclect...                 None  
1                   Bonne ambiance, musique générale                 None  
2                                                                    None  


In [15]:
# Build the text string ("soup") for each venue; it is vectorized in a later cell
def create_soup(x):
    """Concatenate a venue's text fields into a single string.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must support ``x[key]`` for the keys 'quartier', 'description',
        'infos_ambiance' and 'rental_room_category'.

    Returns
    -------
    str
        A string starting with one space, followed by the 'quartier' value
        (if present) and then each remaining present field preceded by a
        single space. Missing fields contribute nothing.

    Notes
    -----
    A field counts as missing when it is ``None`` or NaN/NA. Checking only
    for ``None`` is not enough: a NaN in a text column would reach the
    string concatenation and raise ``TypeError``.
    """
    string = " "

    quartier = x['quartier']
    if not pd.isna(quartier):
        string += quartier

    # The remaining fields are all appended the same way: space + value.
    for field in ('description', 'infos_ambiance', 'rental_room_category'):
        value = x[field]
        if not pd.isna(value):
            string += ' ' + value

    return string


# Run create_soup on every row (axis=1) and store the text in a 'soup' column.
df['soup'] = df.apply(func=create_soup, axis=1)

print(df.loc[:, 'soup'].head(3))

0     Quartier Latin Localisé dans le [quartier Lat...
1     Montparnasse Attention, pas d'arrivée après 1...
2     Oberkampf Situé dans la trépidante [Rue Oberk...
Name: soup, dtype: object


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer that ignores a short list of French function words.
count = CountVectorizer(
    stop_words=[
        "un", "une", "les", "de", "le",
        "la", "du", "au", "il", "à",
    ],
)

# Learn the vocabulary from the soup column and build the term-count matrix.
count_matrix = count.fit_transform(df['soup'])

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every venue's count vector and every
# other venue's; the result has one row and one column per venue.
cosine_sim = cosine_similarity(X=count_matrix, Y=count_matrix)

In [18]:
# Lookup table mapping a venue id to its row label in df.
indices = pd.Series(df.index, index=df['id'])

# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it would never remove a repeated id. De-duplicate on the
# id labels instead, keeping the first occurrence, so that indices[id] always
# yields a single value.
indices = indices[~indices.index.duplicated(keep='first')]
print(indices)

id
3            0
6            1
8            2
11           3
15           4
18           5
23           6
26           7
31           8
32           9
38          10
46          11
47          12
48          13
53          14
58          15
61          16
66          17
68          18
70          19
71          20
72          21
73          22
74          23
77          24
78          25
79          26
87          27
89          28
90          29
         ...  
16317    10517
16318    10518
16320    10519
16321    10520
16327    10521
16328    10522
16329    10523
16331    10524
16332    10525
16333    10526
16334    10527
16335    10528
16359    10529
16361    10530
16370    10531
16379    10532
16384    10533
16387    10534
16389    10535
16399    10536
16400    10537
16411    10538
16418    10539
16426    10540
16442    10541
16444    10542
16446    10543
16450    10544
16451    10545
16452    10546
Length: 10547, dtype: int64


In [19]:
def get_recommendations(id, cosine_sim=cosine_sim, top_n=5):
    """Return the names of the venues most similar to the given venue.

    Parameters
    ----------
    id : venue id
        Looked up in the module-level ``indices`` Series to find the
        venue's row; an id that is not a label there fails on that lookup.
    cosine_sim : 2-D similarity matrix, optional
        ``cosine_sim[i][j]`` is the similarity between rows i and j.
        Defaults to the module-level matrix bound when this function was
        defined.
    top_n : int, optional
        Number of venues to return (default 5, the previous fixed value).

    Returns
    -------
    pandas.Series
        The 'nom' values of the ``top_n`` most similar venues, most similar
        first, indexed by their row label in ``df``.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Get the row of the venue that matches the id
    idx = indices[id]

    # Pair every *other* venue with its similarity to this one. The queried
    # venue is excluded by position rather than by assuming it sorts first:
    # with tied scores (e.g. identical text) or an all-zero vector it may not
    # be first, and would then be returned as its own recommendation.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort by similarity, highest first (stable, so ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the row positions of the top_n most similar venues
    venue_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the names of those venues
    return df['nom'].iloc[venue_indices]

In [22]:
# Display the venues most similar to the venue with id 252.
get_recommendations(id=252, cosine_sim=cosine_sim)

6118          La Revanche - Bistrot ludique
525                               L'Apérock
1515    Sofitel Paris Arc de Triomphe - Bar
723                         L'Armagnac Café
1432                             Le HasBeen
Name: nom, dtype: object