In [1]:
import MySQLdb
import pandas as pd 
import numpy as np

# Columns kept for the recommender.
# NOTE(review): the query also selects `universe`, but it is not kept here and
# is never used afterwards -- confirm whether it should feed the soup or be
# removed from the SELECT.
VENUE_COLUMNS = ['id', 'nom', 'code_postal', 'quartier', 'description', 'infos_ambiance', 'rental_room_category']

# Same query text as before, only split across lines for readability.
VENUE_QUERY = (
    "SELECT id, nom, code_postal, quartier, description, infos_ambiance, "
    "universe, rental_room_category "
    "from etablissements "
    "WHERE etablissements.status IN ('active', 'active_without_sea', 'scrapped')"
)

conn = MySQLdb.connect(host="127.0.0.1", user="root", db="privateaser_dev")
try:
    # read_sql_query already returns a DataFrame.
    SQL_Query = pd.read_sql_query(VENUE_QUERY, conn)
finally:
    # Always release the connection, even when the query fails.
    conn.close()

# Independent copy so that columns added later do not touch the raw result.
df = SQL_Query[VENUE_COLUMNS].copy()

print(df.head(3))



   id                    nom  code_postal        quartier  \
0   3             L'Antidote      75005.0  Quartier Latin   
1   6                L'Adada      75014.0    Montparnasse   
2   8  La Mercerie Oberkampf      75011.0       Oberkampf   

                                         description  \
0  Localisé dans le [quartier Latin](https://www....   
1  Attention, pas d'arrivée après 19h30 dans cet ...   
2  Situé dans la trépidante [Rue Oberkampf](https...   

                                      infos_ambiance rental_room_category  
0  Rock à l'étage, electro, funk, house et éclect...                 None  
1                   Bonne ambiance, musique générale                 None  
2                                                                    None  


In [2]:
# Create the text "soup" that is vectorized later
def create_soup(x):
    """Concatenate the descriptive text fields of one venue into one string.

    Parameters
    ----------
    x : mapping-like row (e.g. a ``pandas.Series`` or a dict)
        Must provide the keys ``'quartier'``, ``'description'``,
        ``'infos_ambiance'`` and ``'rental_room_category'``.

    Returns
    -------
    str
        A single space, followed by ``quartier`` (if present), followed by
        each remaining present field prefixed with a space. The leading space
        is kept so that rows made only of strings yield exactly the same
        output as before.

    Notes
    -----
    A field is skipped when it is missing in the pandas sense (``None``,
    ``NaN``, ``pd.NA``, ``NaT``), not only when it is ``None``: concatenating
    a float ``NaN`` to a string would raise ``TypeError``. Present values are
    passed through ``str`` so non-string values cannot break concatenation.
    """
    def _text(field):
        # Text of the field, or None when the value is missing.
        value = x[field]
        return None if pd.isna(value) else str(value)

    string = " "

    # The first field is appended without an extra separator.
    quartier = _text('quartier')
    if quartier is not None:
        string += quartier

    for field in ('description', 'infos_ambiance', 'rental_room_category'):
        text = _text(field)
        if text is not None:
            string += ' ' + text

    return string


# Build the soup for every venue, one row at a time, and store it on the frame.
df['soup'] = df.apply(func=create_soup, axis="columns")

# Preview the first three soups.
soup_preview = df['soup'].head(3)
print(soup_preview)

0     Quartier Latin Localisé dans le [quartier Lat...
1     Montparnasse Attention, pas d'arrivée après 1...
2     Oberkampf Situé dans la trépidante [Rue Oberk...
Name: soup, dtype: object


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Words removed from every soup before counting.
FRENCH_STOP_WORDS = ["un", "une", "les", "de", "le", "la", "du", "au", "il", "à"]

# Learn the vocabulary from the soups and count term occurrences per venue.
count = CountVectorizer(stop_words=FRENCH_STOP_WORDS)
count_matrix = count.fit_transform(df['soup'])

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of venues. With the second
# argument omitted, the rows are compared against themselves.
cosine_sim = cosine_similarity(count_matrix)

In [9]:
# Lookup table: venue id -> row position in df, which is also the row/column
# position in cosine_sim. Positions come from range(len(df)) rather than
# df.index so the mapping stays positional whatever labels df carries.
indices = pd.Series(range(len(df)), index=df['id'])

# Series.drop_duplicates() compares the values (the positions), which are
# unique by construction, so it never removed anything. De-duplicate on the id
# labels instead, keeping the first row of each id, so that indices[id] is
# always a single position.
indices = indices[~indices.index.duplicated(keep='first')]
print(indices)

id
3            0
6            1
8            2
11           3
18           4
         ...  
16650    10360
16658    10361
16663    10362
16666    10363
16679    10364
Length: 10365, dtype: int64


In [10]:
def get_recommendations(id, cosine_sim=cosine_sim, top_n=5):
    """Return the names of the venues most similar to a given venue.

    Parameters
    ----------
    id : venue identifier
        Looked up in the module-level ``indices`` Series. The name shadows the
        builtin ``id`` but is kept so existing keyword calls still work.
    cosine_sim : square matrix of pairwise similarity scores
        Indexed by row position; defaults to the matrix that exists when this
        function is defined.
    top_n : int, optional
        Number of venues to return (default 5, the previous fixed size).

    Returns
    -------
    pandas.Series
        Values of ``df['nom']`` for the selected venues, most similar first.
        The queried venue itself is never part of the result.

    Raises
    ------
    KeyError
        If ``id`` is not present in ``indices``.
    """
    # Get the row position of the venue that matches the id
    idx = indices[id]
    if isinstance(idx, pd.Series):
        # A repeated id yields several positions; use the first one.
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every venue position with its similarity to the requested venue
    sim_scores = enumerate(cosine_sim[idx])

    # Sort the venues by similarity, highest first (ties keep row order)
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

    # Drop the requested venue explicitly rather than assuming it sorted
    # first: with tied scores (identical soups) or an all-zero row it may not
    # be at position 0, and slicing [1:] would then return the venue itself.
    venue_indices = [position for position, _ in sim_scores if position != idx][:top_n]

    # Return the names of the most similar venues
    return df['nom'].iloc[venue_indices]

In [12]:
# Example: venues most similar to venue id 252.
get_recommendations(id=252, cosine_sim=cosine_sim)

5892                     Pop Pop Le Bistrot
5811          La Revanche - Bistrot ludique
530                               L'Apérock
1521    Sofitel Paris Arc de Triomphe - Bar
732                         L'Armagnac Café
Name: nom, dtype: object