In [33]:
#libraries
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [34]:
# Load the restaurant dataset; the path is relative to the notebook's working directory
dataset_path = "preprocessed_resturants.csv"
df = pd.read_csv(dataset_path)

In [35]:
# Columns removed before the cleaning steps that follow
columns_to_drop = ['lat', 'lng', 'likes', 'photos', 'tips', 'ratingSignals']

# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone, and without it pandas would raise KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

# Preview the modified DataFrame
df.head()

Unnamed: 0,name,categories,address,price,rating
0,Dunkin',Donut Shop,"الرياض, المملكة العربية السعودية",Cheap,8.9
1,Moroccan Taste (المذاق المغربي),Café,"Makkah Al Mukarramah Rd, الرياض, المملكة العرب...",Cheap,8.6
2,كرك جدي,Café,"الرياض, المملكة العربية السعودية",Cheap,7.3
3,دانكن دونت طريق المزاحميه,"Donut Shop, Coffee Shop","الرياض, المملكة العربية السعودية",Cheap,8.3
4,Starbucks,Coffee Shop,"الرياض 14723, المملكة العربية السعودية",Cheap,8.1


In [36]:
# Number of rows in df at this point (before the Arabic-name filter below)
len(df)

7411

In [37]:
# Matches a run of characters from the Unicode range U+0600-U+06FF.
# Compiled once here instead of on every call.
_ARABIC_PATTERN = re.compile(r'[\u0600-\u06FF]+')


def has_arabic(text):
    """Return True if `text` contains at least one character in U+0600-U+06FF.

    Parameters
    ----------
    text : object
        Value to inspect. Anything that is not a `str` (for example the float
        NaN pandas uses for a missing cell, or None) is reported as False
        instead of raising TypeError.

    Returns
    -------
    bool
    """
    if not isinstance(text, str):
        return False
    return _ARABIC_PATTERN.search(text) is not None

# Keep only the rows whose 'name' has no Arabic characters, renumbering the index from 0
df_cleaned = df.loc[~df['name'].apply(has_arabic)].reset_index(drop=True)

# Display the cleaned DataFrame
print(df_cleaned)

                                    name            categories  \
0                                Dunkin'            Donut Shop   
1                              Starbucks           Coffee Shop   
2                            Burger King  Fast Food Restaurant   
3                              Starbucks           Coffee Shop   
4                                    Tsc            Restaurant   
...                                  ...                   ...   
2202                      Domino's Pizza           Pizza Place   
2203                         Upper Crust        Sandwich Place   
2204                           Java Time           Coffee Shop   
2205  Upper Crust- KKIA “Riyadh Airport”                  Café   
2206                      MammaRoti cafe                  Café   

                                                address     price  rating  
0                      الرياض, المملكة العربية السعودية     Cheap     8.9  
1                الرياض 14723, المملكة العربية السعودية

In [38]:
# Lowercase every name, keep only the first row for each distinct name,
# then renumber the index from 0
df_cleaned = (
    df_cleaned
    .assign(name=lambda frame: frame['name'].str.lower())
    .drop_duplicates(subset=['name'], keep='first')
    .reset_index(drop=True)
)

# Display the cleaned DataFrame
print(df_cleaned)

                                    name            categories  \
0                                dunkin'            Donut Shop   
1                              starbucks           Coffee Shop   
2                            burger king  Fast Food Restaurant   
3                                    tsc            Restaurant   
4                         domino's pizza           Pizza Place   
...                                  ...                   ...   
1466                 arabica star coffee           Coffee Shop   
1467              butlers chocolate café                  Café   
1468                         upper crust        Sandwich Place   
1469  upper crust- kkia “riyadh airport”                  Café   
1470                      mammaroti cafe                  Café   

                                                address     price  rating  
0                      الرياض, المملكة العربية السعودية     Cheap     8.9  
1                الرياض 14723, المملكة العربية السعودية

In [39]:
# Preview the first rows after lowercasing and de-duplicating the names
df_cleaned.head()

Unnamed: 0,name,categories,address,price,rating
0,dunkin',Donut Shop,"الرياض, المملكة العربية السعودية",Cheap,8.9
1,starbucks,Coffee Shop,"الرياض 14723, المملكة العربية السعودية",Cheap,8.1
2,burger king,Fast Food Restaurant,"prince mamdooh bin abdulaziz, Riyadh 12241, ال...",Cheap,6.7
3,tsc,Restaurant,"alshifa, الرياض 14725, المملكة العربية السعودية",Moderate,7.6
4,domino's pizza,Pizza Place,"Alshifaa Plaza (Imam Muslim), الرياض 14725, ال...",Cheap,7.3


In [40]:
# Split the comma-separated 'categories' text into separate cuisine parts.
# The separator is the regular expression r"\s*,\s*", so the blanks around each
# comma are dropped too: values read "Fried Chicken Joint" rather than
# " Fried Chicken Joint".
cuisine_parts = (
    df_cleaned["categories"]     # from the categories feature
    .str.strip()                 # drop blanks at either end of the whole text
    .str.split(r"\s*,\s*", expand=True)  # one column per comma-separated part
)

# expand=True yields only as many columns as the longest row needs. Pad up to
# three so the assignment below also works when no row lists three categories.
# (More than three parts still raises, as before.)
for missing_position in range(cuisine_parts.shape[1], 3):
    cuisine_parts[missing_position] = None

df_cleaned[["cuisine1", "cuisine2", "cuisine3"]] = cuisine_parts
df_cleaned.head()

Unnamed: 0,name,categories,address,price,rating,cuisine1,cuisine2,cuisine3
0,dunkin',Donut Shop,"الرياض, المملكة العربية السعودية",Cheap,8.9,Donut Shop,,
1,starbucks,Coffee Shop,"الرياض 14723, المملكة العربية السعودية",Cheap,8.1,Coffee Shop,,
2,burger king,Fast Food Restaurant,"prince mamdooh bin abdulaziz, Riyadh 12241, ال...",Cheap,6.7,Fast Food Restaurant,,
3,tsc,Restaurant,"alshifa, الرياض 14725, المملكة العربية السعودية",Moderate,7.6,Restaurant,,
4,domino's pizza,Pizza Place,"Alshifaa Plaza (Imam Muslim), الرياض 14725, ال...",Cheap,7.3,Pizza Place,,


In [41]:
# Names of the columns that hold the individual cuisine labels
features = ["cuisine1", "cuisine2", "cuisine3"]

In [42]:
# 'temp' counts the empty cuisine slots in each row;
# 'no_Of_cusines' counts the filled ones (the three cuisine columns minus the empty slots)
df_cleaned["temp"] = df_cleaned[features].isna().sum(axis=1)
df_cleaned["no_Of_cusines"] = df_cleaned["temp"].rsub(3)

In [43]:
# Preview the first rows, now including the 'temp' and 'no_Of_cusines' columns
df_cleaned.head()

Unnamed: 0,name,categories,address,price,rating,cuisine1,cuisine2,cuisine3,temp,no_Of_cusines
0,dunkin',Donut Shop,"الرياض, المملكة العربية السعودية",Cheap,8.9,Donut Shop,,,2,1
1,starbucks,Coffee Shop,"الرياض 14723, المملكة العربية السعودية",Cheap,8.1,Coffee Shop,,,2,1
2,burger king,Fast Food Restaurant,"prince mamdooh bin abdulaziz, Riyadh 12241, ال...",Cheap,6.7,Fast Food Restaurant,,,2,1
3,tsc,Restaurant,"alshifa, الرياض 14725, المملكة العربية السعودية",Moderate,7.6,Restaurant,,,2,1
4,domino's pizza,Pizza Place,"Alshifaa Plaza (Imam Muslim), الرياض 14725, ال...",Cheap,7.3,Pizza Place,,,2,1


In [44]:
# Perform the search using boolean indexing
search_result = df_cleaned[df_cleaned['name'] == "kfc"]

# Print the search result
print(search_result)

  name                                 categories                   address  \
6  kfc  Fast Food Restaurant, Fried Chicken Joint  المملكة العربية السعودية   

   price  rating              cuisine1              cuisine2 cuisine3  temp  \
6  Cheap     6.6  Fast Food Restaurant   Fried Chicken Joint     None     1   

   no_Of_cusines  
6              2  


In [45]:
# Print the whole frame as plain text (pandas abbreviates the middle rows with "...")
print(df_cleaned)

                                    name            categories  \
0                                dunkin'            Donut Shop   
1                              starbucks           Coffee Shop   
2                            burger king  Fast Food Restaurant   
3                                    tsc            Restaurant   
4                         domino's pizza           Pizza Place   
...                                  ...                   ...   
1466                 arabica star coffee           Coffee Shop   
1467              butlers chocolate café                  Café   
1468                         upper crust        Sandwich Place   
1469  upper crust- kkia “riyadh airport”                  Café   
1470                      mammaroti cafe                  Café   

                                                address     price  rating  \
0                      الرياض, المملكة العربية السعودية     Cheap     8.9   
1                الرياض 14723, المملكة العربية السعود

In [46]:
# Save the cleaned frame to CSV without the index column.
# Every column currently in df_cleaned is written, including the helper columns added above.
df_cleaned.to_csv("resturants.csv", index=False)

In [31]:
# Build one text document per restaurant by joining its three cuisine columns.
# Missing cuisines are replaced with "" first; otherwise astype(str) would turn
# them into the literal words "None" / "nan" and feed those to the vectorizer.
# NOTE: this rebinds `features` (earlier a list of column names) to a list of strings.
features = (
    df_cleaned[['cuisine1', 'cuisine2', 'cuisine3']]
    .fillna('')
    .astype(str)
    .apply(lambda row: ' '.join(row), axis=1)
    .tolist()
)

# TF-IDF over the cuisine text, using scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(features)

# Pairwise cosine similarity between every pair of restaurants
similarity = cosine_similarity(tfidf_matrix)

# Map each restaurant name to its index label in df_cleaned
indices = pd.Series(df_cleaned.index, index=df_cleaned['name']).to_dict()

# Recommend restaurants whose cuisine text is most similar to a given restaurant
def restaurant_recommendation(name, similarity_matrix=similarity, indices=indices, top_n=10):
    """Return the rows of `df_cleaned` most similar to the restaurant called `name`.

    Parameters
    ----------
    name : str
        Restaurant name; it is lowercased before the lookup.
    similarity_matrix : 2-D array-like
        Pairwise similarity scores; row i holds the scores of restaurant i
        against every restaurant.
    indices : dict
        Maps a lowercase restaurant name to its row number.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `df_cleaned`, best match first. The queried
        restaurant is never included. If `name` is unknown, an error line is
        printed and an empty frame that still has the columns of `df_cleaned`
        is returned, so callers can select columns from it safely.

    Notes
    -----
    The row number from `indices` is used positionally (`iloc`), so this
    assumes `df_cleaned` has a 0..n-1 index in the same order as the matrix.
    """
    name = name.lower()

    if name not in indices:
        print(f"Error: {name} not found in the indices dictionary.")
        return df_cleaned.iloc[0:0]

    index = indices[name]

    # Drop the queried restaurant by position. Removing "the first element
    # after sorting" is unreliable: other rows can tie with it on the top
    # score and sort ahead of it.
    similarity_scores = [
        (position, score)
        for position, score in enumerate(similarity_matrix[index])
        if position != index
    ]

    # Stable sort: rows with equal scores keep their original order
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    similarity_scores = similarity_scores[:max(top_n, 0)]

    restaurant_indices = [position for position, _ in similarity_scores]

    print(f"Similarity Scores: {similarity_scores}")

    return df_cleaned.iloc[restaurant_indices]


# Example usage
name_to_recommend = "Kfc"
recommendations = restaurant_recommendation(name_to_recommend)

# Define the desired columns
columns_to_display = ['name', 'address', 'price', 'rating', 'cuisine1', 'cuisine2', 'cuisine3']

# Keep only the relevant columns. reindex is used instead of
# recommendations[columns_to_display] so that an empty result (for example an
# unknown name) gives an empty table rather than raising KeyError.
formatted_recommendations = recommendations.reindex(columns=columns_to_display)

# Print the custom-formatted table
for _, row in formatted_recommendations.iterrows():
    print(f"Name: {row['name']}")
    print(f"Address: {row['address']}")
    print(f"Price: {row['price']}")
    print(f"Rating: {row['rating']}")
    print(f"Cuisine 1: {row['cuisine1']}")
    print(f"Cuisine 2: {row['cuisine2']}")
    print(f"Cuisine 3: {row['cuisine3']}")
    print('-' * 10)  # Separator for better readability

Similarity Scores: [(100, 1.0000000000000002), (635, 1.0000000000000002), (390, 0.8716060539559102), (84, 0.8228131921947772), (583, 0.8228131921947772), (648, 0.8228131921947772), (729, 0.8228131921947772), (810, 0.8228131921947772), (1082, 0.8228131921947772), (1300, 0.8228131921947772)]
Name: shallal restaurant
Address: Mansoura District (Kharj Road), الرياض, المملكة العربية السعودية
Price: Cheap
Rating: 8.2
Cuisine 1: Fried Chicken Joint
Cuisine 2:  Fast Food Restaurant
Cuisine 3: None
----------
Name: texas chicken
Address: King Abdullah Rd., الرياض, المملكة العربية السعودية
Price: Cheap
Rating: 6.4
Cuisine 1: Fast Food Restaurant
Cuisine 2:  Fried Chicken Joint
Cuisine 3: None
----------
Name: japang
Address: الرياض 12221, المملكة العربية السعودية
Price: Cheap
Rating: 4.8
Cuisine 1: Fried Chicken Joint
Cuisine 2:  Japanese Restaurant
Cuisine 3:  Fast Food Restaurant
----------
Name: broasted al najah
Address: As Suwaidi Al Am. (Kaab Bin Zuhair St.), الرياض, المملكة العربية السعود