In [None]:
# Mount Google Drive at /content/drive/ (Colab only); the CSV and audio paths
# used later in this notebook live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# DATASET

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import logging
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Named logger used by DataModule for progress messages. No handler or level is
# configured in this notebook, so INFO records may not be shown — TODO confirm.
logger = logging.getLogger('resyc')

In [None]:
class DataModule():
  """Load the restaurant CSV files, give every restaurant name a shared integer
  ID and clean the review columns that later cells rely on.

  Calling ``label_encoding()`` runs the whole pipeline (read -> encode -> clean)
  and returns ``(restaurant_info_df, res_review_df)``.
  """

  def __init__(self):
    # config
    self.FOLDER_PATH = "/content/drive/MyDrive/Recommendation"
    self.RESTAURANT_INFO_PATH = "/content/drive/MyDrive/Recommendation/Restaurant name and related info.csv"
    self.RESTAURANT_META_PATH = "/content/drive/MyDrive/Recommendation/Restaurant_Review.csv"

    # A single encoder is fitted on the names from both files so that the same
    # restaurant receives the same ID in both frames.
    self.res_label = LabelEncoder()

  def read_csv(self):
    """Read both CSV files into ``self.restaurant_info_df`` and ``self.res_review_df``."""
    self.restaurant_info_df = pd.read_csv(self.RESTAURANT_INFO_PATH)
    self.res_review_df = pd.read_csv(self.RESTAURANT_META_PATH)

  def label_encoding(self):
    """Read the data, add an integer ``ID`` column to both frames and clean them.

    Returns:
      The ``(restaurant_info_df, res_review_df)`` tuple produced by ``processing()``.
    """
    # Encoding restaurant name as unique id
    self.read_csv()
    info_names = self.restaurant_info_df['Name']
    review_names = self.res_review_df['Restaurant']
    self.res_label.fit(list(set(info_names.values) | set(review_names.values)))
    self.restaurant_info_df['ID'] = self.res_label.transform(info_names)
    self.res_review_df['ID'] = self.res_label.transform(review_names)
    logger.info("Lable encoding of data is completed!")
    return self.processing()

  def processing(self) -> tuple:
    """Clean the ``Rating`` and ``Metadata`` columns of the review frame.

    * ``Rating``: every value that cannot be parsed as a number, and every
      missing value, becomes ``0``; the column is then stored as float. This
      replaces the former hard-coded ``.iloc[7601] = 0`` patch, which was a
      chained assignment (it may write to a temporary copy) and raised
      ``IndexError`` on files with fewer rows.
    * ``Metadata``: missing values become ``'0 Review , 0 Followers'``.

    Whole columns are re-assigned instead of calling ``fillna(inplace=True)``
    on a selected column, which is not guaranteed to modify the frame.

    Returns:
      ``(restaurant_info_df, res_review_df)``.
    """
    reviews = self.res_review_df

    # converting dtype to float for feature creating purpose
    reviews['Rating'] = pd.to_numeric(reviews['Rating'], errors='coerce').fillna(0).astype('float')

    # replacing nan values
    reviews['Metadata'] = reviews['Metadata'].fillna('0 Review , 0 Followers')
    logger.info("pre-processing of data is completed!")

    return self.restaurant_info_df, reviews


In [None]:
# Run the full DataModule pipeline (read CSVs -> encode IDs -> clean columns).
# df1 is the restaurant info frame, df2 the review frame; both names are used
# as notebook-level globals by the cells below.
data_model = DataModule()
df1,df2 = data_model.label_encoding()
print(df1.shape)

(105, 7)


In [None]:
# Preview the review frame, including the encoded ID column.
df2.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,ID
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5.0,"1 Review , 2 Followers",5/25/2019 15:54,0,17
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5.0,"3 Reviews , 2 Followers",5/25/2019 14:20,0,17
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5.0,"2 Reviews , 3 Followers",5/24/2019 22:54,0,17
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5.0,"1 Review , 1 Follower",5/24/2019 22:11,0,17
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5.0,"3 Reviews , 2 Followers",5/24/2019 21:37,0,17


### 1. Feature Store

In [None]:
# getting sentiment of the review using textblob
!pip install -q textblob

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from textblob import TextBlob

In [None]:
class FeatureStore:
  """Per-restaurant aggregate features and per-review sentiment scoring."""

  def __init__(self):
    pass

  def feature_eng(self, df=None):
    """Build one row of aggregate features per restaurant.

    Args:
      df: review frame with 'Restaurant', 'Reviewer' and 'Rating' columns.
        When omitted, the notebook-level ``df2`` is used (the previous behaviour).

    Returns:
      DataFrame with columns 'Restaurant', 'Number_of_review' (count of
      non-null reviewers) and 'Avg_rating' (mean rating). The aggregates were
      previously computed and then discarded, so the method returned ``None``.
    """
    reviews = df2 if df is None else df
    by_restaurant = reviews.groupby('Restaurant')

    # Number of reviews for a restaurant
    review_counts = by_restaurant["Reviewer"].count().reset_index(name='Number_of_review')

    # Average restaurant rating
    avg_ratings = by_restaurant["Rating"].mean().reset_index(name='Avg_rating')

    return review_counts.merge(avg_ratings, on='Restaurant')

  def get_sentiment(self,row):
    """Return the mean TextBlob polarity over the sentences of ``row``, rounded to 2 decimals.

    ``row`` is converted with ``str()`` first. Text that yields no sentences
    (for example an empty string) scores ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    sentences = TextBlob(str(row)).sentences
    if not sentences:
      return 0.0
    return round(sum(sentence.sentiment.polarity for sentence in sentences)/len(sentences),2)

#### 1. Review sentiment (the number of reviews per restaurant is computed in `FeatureStore.feature_eng`)

In [None]:
# Score every review text with FeatureStore.get_sentiment and store the result
# in a new 'sentiment' column of the review frame.
feature_model = FeatureStore()
df2['sentiment'] = df2['Review'].apply(feature_model.get_sentiment)

In [None]:
# Preview the review frame with the new 'sentiment' column.
df2.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,ID,sentiment
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5.0,"1 Review , 2 Followers",5/25/2019 15:54,0,17,0.52
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5.0,"3 Reviews , 2 Followers",5/25/2019 14:20,0,17,0.46
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5.0,"2 Reviews , 3 Followers",5/24/2019 22:54,0,17,0.39
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5.0,"1 Review , 1 Follower",5/24/2019 22:11,0,17,0.57
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5.0,"3 Reviews , 2 Followers",5/24/2019 21:37,0,17,0.38


### 2. Review ranking Module

In [None]:
def _parse_reviewer_metadata(meta_data):
  """Return ``(number_of_reviews, number_of_followers)`` parsed from a Metadata string."""
  if "," in meta_data:
    # e.g. "3 Reviews , 2 Followers": the leading token of each part is the count
    review_part, follower_part = meta_data.split(",")[0], meta_data.split(",")[1]
    return float(review_part.strip().split(" ")[0]), float(follower_part.strip().split(" ")[0])

  count = float(meta_data.split(" ")[0])
  if "Review" in meta_data:
    # only the review count is present; followers default to 1
    return count, 1.0
  # only the follower count is present; reviews default to 1
  return 1.0, count


def review_ranking(df:pd.DataFrame) ->pd.DataFrame:
  """ Ranking the review for restaurant based on reviewer weighted score

  For every row the weight is
  ``(Rating + reviewer_reviews + reviewer_followers + sentiment) / n_reviews``
  where ``n_reviews`` is the number of rows sharing that row's 'Restaurant'.

  Args:
    df: review frame with 'Restaurant', 'Rating', 'Metadata' and 'sentiment'
      columns. The frame passed in is used; it was previously ignored in
      favour of the global ``df2``.

  Returns:
    The same ``df`` object, modified in place with a new 'Review_weight' column.

  Raises:
    ValueError: if a 'Metadata' value cannot be parsed. The old code printed
      the error and stopped the loop, which then failed with an unrelated
      length-mismatch ValueError when assigning the column.
  """
  # Count the rows per restaurant once instead of filtering the frame per row.
  reviews_per_restaurant = df['Restaurant'].map(df['Restaurant'].value_counts())

  review_weight_list = []
  rows = zip(df['Rating'], df['Metadata'], df['sentiment'], reviews_per_restaurant)
  for position, (rating, meta_data, sentiment, n_reviews) in enumerate(rows):
    try:
      reviewer_reviews, reviewer_followers = _parse_reviewer_metadata(meta_data)
    except (TypeError, ValueError, IndexError, AttributeError) as exc:
      raise ValueError(f"Cannot parse reviewer metadata {meta_data!r} at row position {position}") from exc

    review_weight = (float(rating)+reviewer_reviews+reviewer_followers+float(sentiment))/n_reviews
    review_weight_list.append(review_weight)

  df['Review_weight'] = review_weight_list

  return df


In [None]:

def utils(df:pd.DataFrame) -> dict:
  """Create a mapping of restaurant ID -> average rating.

  Args:
    df: frame with 'ID' and 'Rating' columns. The frame passed in is used; it
      was previously ignored in favour of the global ``df2``.

  Returns:
    dict with one entry per distinct ID, holding the mean of that ID's ratings.
    The mean is computed once per restaurant with ``groupby`` instead of once
    per review row. Note that ``mean`` skips NaN ratings.
  """
  return df.groupby('ID')['Rating'].mean().to_dict()


# Attach to every review row the average rating of its restaurant (keyed by ID).
df2['Score'] = df2['ID'].map(utils(df2)) 

## 3. Transformer similarities

In [None]:
!pip install -U -q sentence-transformers

[K     |████████████████████████████████| 85 kB 3.6 MB/s 
[K     |████████████████████████████████| 5.3 MB 69.7 MB/s 
[K     |████████████████████████████████| 1.3 MB 54.9 MB/s 
[K     |████████████████████████████████| 163 kB 70.5 MB/s 
[K     |████████████████████████████████| 7.6 MB 67.7 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from sentence_transformers import util

model = SentenceTransformer('paraphrase-distilroberta-base-v1')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
def get_top_restaurant(text: str = "Best hygiene north indian food") ->pd.DataFrame:
  """Rank restaurants by how similar their collections + cuisines text is to ``text``.

  Reads the notebook-level ``df1`` (restaurant info), ``df2`` (reviews with a
  'Score' column), ``model`` (sentence encoder) and ``util``.

  Args:
    text: free-text search query.

  Returns:
    One-column DataFrame ('Name'), best match first: ordered by cosine
    similarity, then by the restaurant's average rating, both descending. The
    index holds each restaurant's row position in ``df1``.

  Note:
    The final sort used ``ascending=['False','False']``. Non-empty strings are
    truthy, so the rows came back worst-first (or pandas rejected the
    argument, depending on version). Real booleans are passed now.
  """
  # generating embedding similarities between query and data
  # The query embedding is the same for every restaurant, so encode it once.
  query = model.encode(text)

  # Rows are collected in a list and turned into a frame once;
  # DataFrame.append was removed in pandas 2.0 and copied the frame per call.
  rows = []
  for pos in tqdm(range(len(df1))):
    candidate = model.encode(str(df1['Collections'].iloc[pos])+str(df1['Cuisines'].iloc[pos]))
    similarity = util.cos_sim(query, candidate).cpu().numpy()[0][0]
    rows.append({'ID':df1['ID'].iloc[pos],'score':similarity})
  score_df = pd.DataFrame(rows, columns=['ID','score'])

  # Name of the first df1 row carrying each ID
  name_by_id = df1.drop_duplicates('ID').set_index('ID')['Name'].to_dict()
  score_df['Name'] = score_df['ID'].map(name_by_id)

  # creating mapping dict of ID:score on inference based on query similarities
  # Restaurants without any review (or a missing 'Score' column) fall back to 0.
  if 'Score' in df2.columns:
    rating_by_id = df2.drop_duplicates('ID').set_index('ID')['Score'].to_dict()
  else:
    rating_by_id = {}
  score_df['res_avg_rating_score'] = score_df['ID'].map(lambda res_id: rating_by_id.get(res_id, 0))

  score_df = score_df.sort_values(by=['score','res_avg_rating_score'],ascending=[False,False])

  return pd.DataFrame(score_df['Name'])
 

In [None]:
!pip install git+https://github.com/openai/whisper.git -q

  Building wheel for whisper (setup.py) ... [?25l[?25hdone


In [None]:
import whisper
#To know more about whisper click here -> https://github.com/openai/whisper/blob/main/whisper/

#initializing speech to text model object
model_whisper = whisper.load_model("large")

100%|█████████████████████████████████████| 2.87G/2.87G [00:36<00:00, 84.3MiB/s]


## Input Audio for search

In [None]:
!pip install -q ffmpeg-python

In [None]:
# NOTE: Below code is taken from stackoverflow 
"""
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# HTML/JavaScript snippet rendered by get_audio(). It adds a button to the page,
# starts a MediaRecorder on the microphone stream as soon as access is granted,
# and defines a `data` promise. Clicking the button stops the recording; two
# seconds later the promise resolves to the recording as a base64 data URL
# (or to "0" if nothing has been read yet), which get_audio() fetches via eval_js.
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };            
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {            
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data); 
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});
      
</script>
"""

def get_audio():
  """Record audio in the browser (Colab) and return it as ``(audio, sr)``.

  Renders ``AUDIO_HTML``, waits for the page's ``data`` promise, decodes the
  base64 payload, converts it to WAV with ffmpeg over pipes and parses the
  result with ``scipy.io.wavfile.read``.

  Returns:
    ``(audio, sr)``: the sample array and the sample rate read from the WAV data.

  Raises:
    RuntimeError: if the page returned no recording (the promise resolves to
      "0" when nothing was captured in time) or if ffmpeg did not produce
      usable WAV output.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # A recording arrives as a data URL ("<header>,<base64 payload>").
  if not isinstance(data, str) or "," not in data:
    raise RuntimeError("No audio was captured from the browser recorder")
  binary = b64decode(data.split(',')[1])

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)

  # Surface conversion failures here instead of as a parse error in wav_read.
  if process.returncode != 0 or len(output) < 8:
    details = err.decode(errors='replace') if err else ''
    raise RuntimeError("ffmpeg failed to convert the recording to WAV: " + details)

  # Replace bytes 4:8 of the WAV data with the actual size of the RIFF chunk
  # (total length minus the 8-byte chunk header), stored little-endian.
  # NOTE(review): presumably needed because ffmpeg cannot rewrite the header
  # when writing to a pipe — confirm.
  riff_chunk_size = len(output) - 8
  riff = output[:4] + riff_chunk_size.to_bytes(4, 'little') + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

## Speech to Text

In [None]:
# converting speech to text
# Transcribes a previously saved sample recording from Drive; the result is
# indexed with 'text' to obtain the recognised sentence.
query_text = model_whisper.transcribe("/content/drive/MyDrive/Recommendation/example.wav",)
print(query_text['text'])

 Favorite North Indian food.


## Recommendation

In [None]:
# Rank the restaurants against the transcribed query and preview the first rows.
recommended_restaurant_df = get_top_restaurant(str(query_text['text']))
recommended_restaurant_df.head()

  0%|          | 0/105 [00:00<?, ?it/s]

Unnamed: 0,Name
90,Arena Eleven
55,Cafe Eclat
28,Behrouz Biryani
7,Shah Ghouse Spl Shawarma
85,Momos Delight


## Topic modeling 

Extracting key features from restaurant reviews.
<br>
<b>Note:</b> The topic-modeling feature needs improvement and will be updated in the next version.

In [None]:
!pip install -q keybert

In [None]:
from keybert import KeyBERT

kw_model = KeyBERT(model='all-MiniLM-L6-v2')

In [None]:
def get_topics(df2):
  """Add a 'topics' column holding key phrases extracted from each restaurant's reviews.

  All reviews of a restaurant are joined into one text, ``kw_model`` (KeyBERT)
  extracts up to 10 one- or two-word key phrases from it, and the phrases are
  stored space-separated for every row with that restaurant's ID.

  Args:
    df2: review frame with 'Restaurant', 'Review' and 'ID' columns.

  Returns:
    The same frame, modified in place with the new 'topics' column.

  Note:
    The previous ``set(x[0]) + " "`` raised ``TypeError`` (a set cannot be
    added to a str), so this function could not complete. Reviews are now also
    separated by a space, and newlines become spaces, so that neighbouring
    words are not glued together.
  """
  restaurant_topics = {}
  for _, restaurant_reviews in df2.groupby('Restaurant'):
    text = " ".join(str(review) for review in restaurant_reviews['Review']).replace("\n", " ")

    topics = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words=None,top_n=10)
    # each entry is indexed as (phrase, ...); only the phrase itself is kept
    topics_str = " ".join(str(topic[0]) for topic in topics)
    restaurant_topics[restaurant_reviews['ID'].iloc[0]] = topics_str

  df2['topics'] = df2['ID'].map(restaurant_topics)

  return df2

In [None]:
# Add the per-restaurant 'topics' column to the review frame.
df2 = get_topics(df2)

In [None]:
def get_top_restaurant_plus_review(text: str = "Best hygiene north indian food") ->pd.DataFrame:
  """Rank restaurants by query similarity averaged with their average rating.

  Each restaurant is described by its collections + cuisines text, extended
  with its review topics when available. Reads the notebook-level ``df1``,
  ``df2``, ``model`` and ``util``.

  Args:
    text: free-text search query.

  Returns:
    DataFrame with columns 'Name', 'ID', 'weighted_score',
    'res_avg_rating_score' and 'score', sorted by 'weighted_score' descending.
    ``weighted_score`` is ``(score + res_avg_rating_score) / 2``.

  Note:
    The sort used ``ascending='False'``; a non-empty string is truthy, so the
    rows came back worst-first (or pandas rejected the argument, depending on
    version). A real boolean is passed now.
  """
  # generating embedding similarities between query and data
  # The query embedding is the same for every restaurant, so encode it once.
  query = model.encode(text)

  # Review topics are optional: a restaurant without reviews, or a frame
  # without the 'topics' column, is described by collections + cuisines only.
  if 'topics' in df2.columns:
    topics_by_id = df2.drop_duplicates('ID').set_index('ID')['topics'].to_dict()
  else:
    topics_by_id = {}

  # Rows are collected in a list and turned into a frame once;
  # DataFrame.append was removed in pandas 2.0 and copied the frame per call.
  rows = []
  for pos in tqdm(range(len(df1))):
    restaurant_id = df1['ID'].iloc[pos]
    description = str(df1['Collections'].iloc[pos])+str(df1['Cuisines'].iloc[pos])
    topics = topics_by_id.get(restaurant_id)
    if isinstance(topics, str):
      description += topics.strip()

    candidate = model.encode(description)
    similarity = util.cos_sim(query, candidate).cpu().numpy()[0][0]
    rows.append({'ID':restaurant_id,'score':similarity})
  score_df = pd.DataFrame(rows, columns=['ID','score'])

  # Name of the first df1 row carrying each ID
  name_by_id = df1.drop_duplicates('ID').set_index('ID')['Name'].to_dict()
  score_df['Name'] = score_df['ID'].map(name_by_id)

  # creating mapping dict of ID:score on inference based on query similarities
  # Restaurants without any review (or a missing 'Score' column) fall back to 0.
  if 'Score' in df2.columns:
    rating_by_id = df2.drop_duplicates('ID').set_index('ID')['Score'].to_dict()
  else:
    rating_by_id = {}
  score_df['res_avg_rating_score'] = score_df['ID'].map(lambda res_id: rating_by_id.get(res_id, 0))

  score_df['weighted_score'] = (score_df['score']+score_df['res_avg_rating_score'])/2
  score_df = score_df.sort_values(by='weighted_score',ascending=False)

  return pd.DataFrame(score_df[['Name','ID','weighted_score','res_avg_rating_score','score']])
 

In [None]:
def get_top_restaurant_based_weighted_score(text: str = "Best hygiene north indian food") ->pd.DataFrame:
  """Rank restaurants by the mean of three query similarities and the average rating.

  Per restaurant the query is compared with
    * score1: collections + cuisines text,
    * score2: cuisines text,
    * score3: review topics (0 when the restaurant has no topics),
  and ``weighted_score = (score1 + score2 + score3 + res_avg_rating_score) / 4``.
  Reads the notebook-level ``df1``, ``df2``, ``model`` and ``util``.

  Args:
    text: free-text search query.

  Returns:
    DataFrame with columns 'Name', 'ID', 'weighted_score',
    'res_avg_rating_score', 'score1', 'score2' and 'score3', sorted by
    'weighted_score' descending.

  Note:
    Two defects are fixed. When the topic lookup failed, score1 and score2
    were never computed for that restaurant, so the previous restaurant's
    values were reused (or a NameError was raised on the first row). The sort
    used ``ascending='False'``, a truthy string, so the order was worst-first
    (or pandas rejected the argument, depending on version).
  """
  # generating embedding similarities between query and data
  # The query embedding is the same for every restaurant, so encode it once.
  query = model.encode(text)

  # Review topics are optional: a restaurant without reviews, or a frame
  # without the 'topics' column, gets a topic similarity of 0.
  if 'topics' in df2.columns:
    topics_by_id = df2.drop_duplicates('ID').set_index('ID')['topics'].to_dict()
  else:
    topics_by_id = {}

  # Rows are collected in a list and turned into a frame once;
  # DataFrame.append was removed in pandas 2.0 and copied the frame per call.
  rows = []
  for pos in tqdm(range(len(df1))):
    restaurant_id = df1['ID'].iloc[pos]
    cuisines = str(df1['Cuisines'].iloc[pos])

    # score1 and score2 never depend on the reviews, so always compute them.
    collection_embedding = model.encode(str(df1['Collections'].iloc[pos])+cuisines)
    cuisine_embedding = model.encode(cuisines)
    query_collections_result = util.cos_sim(query, collection_embedding).cpu().numpy()[0][0]
    query_cuisine_result = util.cos_sim(query, cuisine_embedding).cpu().numpy()[0][0]

    topics = topics_by_id.get(restaurant_id)
    if isinstance(topics, str):
      review_topic_embedding = model.encode(topics.strip())
      query_topic_result = util.cos_sim(query, review_topic_embedding).cpu().numpy()[0][0]
    else:
      query_topic_result = 0.0

    rows.append({'ID':restaurant_id,
                 'score1':query_collections_result,
                 'score2':query_cuisine_result,
                 'score3':query_topic_result})
  score_df = pd.DataFrame(rows, columns=['ID','score1','score2','score3'])

  # Name of the first df1 row carrying each ID
  name_by_id = df1.drop_duplicates('ID').set_index('ID')['Name'].to_dict()
  score_df['Name'] = score_df['ID'].map(name_by_id)

  # creating mapping dict of ID:score on inference based on query similarities
  # Restaurants without any review (or a missing 'Score' column) fall back to 0.
  if 'Score' in df2.columns:
    rating_by_id = df2.drop_duplicates('ID').set_index('ID')['Score'].to_dict()
  else:
    rating_by_id = {}
  score_df['res_avg_rating_score'] = score_df['ID'].map(lambda res_id: rating_by_id.get(res_id, 0))

  score_df['weighted_score'] = (score_df['score1'] + score_df['score2'] + \
                          score_df['score3']+score_df['res_avg_rating_score'])/4
  score_df = score_df.sort_values(by='weighted_score',ascending=False)

  return pd.DataFrame(score_df[['Name','ID','weighted_score','res_avg_rating_score','score1','score2','score3']])
 

In [None]:
from scipy.io.wavfile import write
from pprint import pprint


# audio input: record from the browser; `sr` is the sample rate of the samples
audio, sr = get_audio()

# NOTE: One more feature need to be added multi-lang to english conversion 
#      to handle non-english language

# Save the recording with the sample rate reported by get_audio(). Writing a
# hard-coded 44100 Hz header mislabels audio captured at any other rate, so the
# file would play (and be transcribed) at the wrong speed.
samplerate = sr
fs = 100

audio_write_path = "/content/drive/MyDrive/Recommendation/example2.wav"
write(audio_write_path,samplerate,audio) #saving audio 

# converting speech to text
query_text = model_whisper.transcribe(audio_write_path)
print(query_text['text'])

# Rank the restaurants for the spoken query, best weighted score first.
recommended_restaurant_df = get_top_restaurant_based_weighted_score(query_text['text'])
recommended_restaurant_df = recommended_restaurant_df.sort_values(by='weighted_score',ascending=False)
print(recommended_restaurant_df.head())


# checking the meta data of the top-ranked restaurant and its first review
top_restaurant_id = recommended_restaurant_df['ID'].iloc[0]
pprint(df1[df1['ID']==top_restaurant_id].to_dict())
pprint(df2[df2['ID']==top_restaurant_id].iloc[0].to_dict())

  0%|          | 0/105 [00:00<?, ?it/s]

                              Name    ID  weighted_score  \
14       AB's - Absolute Barbecues   3.0        1.468716   
2                         Flechazo  36.0        1.445551   
27  3B's - Buddies, Bar & Barbecue   2.0        1.444952   
16             NorFest - The Dhaba  64.0        1.438583   
1                         Paradise  69.0        1.417285   

    res_avg_rating_score    score1    score2    score3  
14                  4.88  0.414590  0.401389  0.178887  
2                   4.66  0.422141  0.470109  0.229954  
27                  4.76  0.349949  0.423566  0.246296  
16                  4.03  0.608812  0.661637  0.453884  
1                   4.70  0.354172  0.367762  0.247206  
{'Collections': {14: 'Barbecue & Grill, Great Buffets, Corporate Favorites, '
                     "Hyderabad's Hottest"},
 'Cost': {14: '1,500'},
 'Cuisines': {14: 'European, Mediterranean, North Indian'},
 'ID': {14: 3},
 'Links': {14: 'https://www.zomato.com/hyderabad/abs-absolute-barbecues-ga