In [1]:
import pandas as pd



In [5]:
# Read the restaurant dataset from disk and preview its first five rows.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
data = pd.read_csv(
    filepath_or_buffer="/home/hgidea/Desktop/Coding/Python/internship/cognifz/Dataset .csv"
)
data.head(n=5)

In [6]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

In [7]:
# Tally the NaN entries column by column and print them under a heading
null_counts = data.isna().sum()
print("Number of missing values in each column:", null_counts, sep="\n")

In [8]:
# Fill missing values in the 'Cuisines' column with zeros.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the column selection: that chained in-place form is deprecated in pandas 2.x
# and leaves `data` unchanged once Copy-on-Write is enabled.
# NOTE(review): 0 is a numeric placeholder in a text column; a string sentinel
# may be clearer. Left as 0 here so downstream results do not change.
data['Cuisines'] = data['Cuisines'].fillna(0)


In [9]:
# List the column labels of the loaded frame.
data.columns

In [10]:
# Keep only the columns used in the rest of the notebook. The explicit .copy()
# makes `df` an independent frame, so the in-place edits and column
# assignments performed on it later do not act on a selection of `data`
# (which can raise SettingWithCopyWarning).
df = data[['Restaurant ID', 'Restaurant Name', 'Cuisines', 'Aggregate rating', 'Votes']].copy()

In [11]:
# Display the five-column working frame selected from `data`.
df

In [12]:
def data_desc(df, n_samples=2, random_state=None):
    """Summarise every column of a DataFrame in a one-row-per-column table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    n_samples : int, default 2
        Maximum number of distinct values to show per column. Columns with
        fewer distinct values show all of them instead of raising.
    random_state : int or numpy random generator, optional
        Passed to ``Series.sample``; set it to make 'Unique Sample'
        reproducible. With the default ``None`` the sample varies per call.

    Returns
    -------
    pandas.DataFrame
        One row per input column with the fields 'Column', 'Data Type',
        'Missing Value', 'Pct Missing Value', 'Num Unique' and
        'Unique Sample'.
    """
    n_rows = len(df)
    columns_info = []

    for column in df.columns:
        series = df[column]
        n_missing = series.isna().sum()
        distinct = series.drop_duplicates()

        # Never ask for more values than exist: sampling without replacement
        # from a smaller population raises ValueError.
        sample_size = min(n_samples, len(distinct))
        if sample_size > 0:
            sampled = distinct.sample(sample_size, random_state=random_state)
        else:
            sampled = distinct.iloc[:0]

        columns_info.append({
            'Column': column,
            'Data Type': series.dtype,
            'Missing Value': n_missing,
            # An empty frame has nothing missing; avoid dividing by zero.
            'Pct Missing Value': round(n_missing / n_rows * 100, 2) if n_rows else 0.0,
            'Num Unique': series.nunique(),
            'Unique Sample': list(sampled.values),
        })

    # Build the summary table in one step from the collected records
    return pd.DataFrame(columns_info)



In [13]:
# Per-column summary of the working frame: dtype, missing counts, distinct
# counts and a couple of sample values.
data_desc(df)

In [14]:
# Dimensions of the summary table (one row per column of `df`).
data_desc(df).shape


In [15]:
# Count rows that are exact duplicates of an earlier row across all columns
df.duplicated().sum()

In [16]:
# Display the working frame again before checking duplicate restaurant names.
df

In [17]:
# Count rows whose restaurant name already appeared in an earlier row.
df['Restaurant Name'].duplicated().sum()

In [18]:
# Number of rows per restaurant name.
df['Restaurant Name'].value_counts()

In [19]:
# De-duplicate on the restaurant name alone, retaining one row per name
df_unique = df.drop_duplicates(subset=['Restaurant Name'])

# Report how many distinct restaurant names remain
num_unique_restaurants = df_unique.shape[0]
print("Number of unique restaurants:", num_unique_restaurants)


In [20]:
# Find columns, other than 'Restaurant Name', that have more distinct values
# than there are distinct names. Such columns may carry per-row information
# that would be lost by de-duplicating on the name alone.
name_cardinality = df['Restaurant Name'].nunique()  # computed once, not per column
unique_cols = [
    col for col in df.columns
    if col != 'Restaurant Name' and df[col].nunique() > name_cardinality
]

if not unique_cols:
    # No other column is likely to differentiate restaurants, so dropping duplicates might be safe.
    print("Dropping all but one row for each duplicate restaurant name...")
    # Rebind instead of inplace=True: `df` is derived from `data`, and mutating
    # a derived frame in place can raise SettingWithCopyWarning.
    df = df.drop_duplicates(subset='Restaurant Name')
else:
    print(f"Warning: Columns like {unique_cols} might contain unique information for each restaurant. Dropping duplicates based on name alone might lead to data loss.")



In [21]:
# Re-check rows per restaurant name after the conditional de-duplication step.
df['Restaurant Name'].value_counts()

In [22]:
# Build a name-deduplicated copy, explicitly retaining the first row seen for each name
df_unique2 = df.drop_duplicates(subset=['Restaurant Name'], keep='first')

# Report how many rows (one per restaurant name) are left
num_unique_restaurants2 = df_unique2.shape[0]
print("Number of unique restaurants:", num_unique_restaurants2)


In [23]:
# Rows per restaurant name in `df` itself (the de-duplicated copies above did not modify it).
df['Restaurant Name'].value_counts()

In [24]:
# Replace the working frame with its name-deduplicated version (first row per name wins)
df = df.drop_duplicates(subset=['Restaurant Name'], keep='first')

# Report the number of rows, i.e. distinct restaurant names, that remain
num_unique_restaurants = len(df.index)
print("Number of unique restaurants:", num_unique_restaurants)

# Every name should now have a count of exactly one
print(df['Restaurant Name'].value_counts())


In [25]:
import matplotlib.pyplot as plt

# Histogram of aggregate ratings across the name-deduplicated restaurants,
# drawn through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df_unique['Aggregate rating'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [26]:
# Keep only restaurants whose aggregate rating is strictly above 4
highly_rated_restaurants = df_unique.loc[df_unique['Aggregate rating'].gt(4)]

# Print just the name and rating columns for those restaurants
print(highly_rated_restaurants.loc[:, ['Restaurant Name', 'Aggregate rating']])


In [27]:
# Number of rows at each distinct aggregate rating value.
df['Aggregate rating'].value_counts()

In [28]:
# Display the de-duplicated working frame before the cuisine column is reshaped.
df

In [29]:
# Turn each comma-separated cuisine string into a list of cuisine names.
# Non-list entries are stringified first (e.g. the 0 placeholder used for
# missing cuisines). Values that are already lists are left alone, so
# re-running this cell does not stringify and re-split them.
# assign() builds a new frame instead of writing into one produced by
# filtering, which can raise SettingWithCopyWarning.
df = df.assign(
    Cuisines=df['Cuisines'].map(
        lambda entry: entry if isinstance(entry, list) else str(entry).split(', ')
    )
)

# Display the DataFrame
df

In [30]:
# Give every cuisine in each list its own row; the other columns are repeated
df = df.explode(column='Cuisines')

# Show the reshaped frame
df

In [31]:
# Cuisines check: number of rows for each individual cuisine after exploding
df['Cuisines'].value_counts()

In [32]:
# List the column labels of the working frame.
df.columns

In [61]:
# Display the working frame.
# NOTE(review): this cell's execution count (61) is out of sequence with its
# neighbours, so the saved output may not reflect the state at this point —
# re-run the notebook top to bottom to confirm.
df

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Collapse the exploded rows: every row of a restaurant receives that
# restaurant's cuisines joined into one space-separated string
df['Cuisines'] = df.groupby('Restaurant ID')['Cuisines'].transform(' '.join)

# All rows of a restaurant are now identical in 'Cuisines'; keep one per ID
df = df.drop_duplicates(subset='Restaurant ID')

# Fit a default TF-IDF vectorizer on the cuisine text and encode each
# restaurant as one row of the resulting matrix
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Cuisines'])


In [56]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all restaurants' TF-IDF vectors
# (with a single argument the matrix is compared against itself).
# NOTE(review): the result is an n x n array, so memory grows quadratically
# with the number of restaurants.
cosine_sim = cosine_similarity(tfidf_matrix)

# Map restaurant name -> row position in tfidf_matrix / cosine_sim.
# Positions are used rather than df.index labels because the labels are no
# longer 0..n-1 after the de-duplication and explode steps, so they do not
# identify rows of the similarity matrix.
indices = pd.Series(range(len(df)), index=df['Restaurant Name'])
# If a name occurs more than once, keep only its first position.
indices = indices[~indices.index.duplicated(keep='first')]


In [57]:
def get_recommendations(user_cuisine, user_rating, cosine_sim=cosine_sim):
    """Recommend up to five restaurants similar to one serving a given cuisine.

    The restaurants in the notebook-level ``df`` are restricted to a rating
    band. The first restaurant in that band whose 'Cuisines' text contains
    ``user_cuisine`` becomes the seed, and the remaining restaurants in the
    band are ranked by TF-IDF cosine similarity to the seed, using the
    already-fitted notebook-level ``tfidf`` vectorizer.

    Parameters
    ----------
    user_cuisine : str
        Cuisine to look for; matched as a case-insensitive substring.
    user_rating : str
        Rating band: 'cheap' (rating < 3), 'moderate' (3 <= rating < 4) or
        'shine' (rating >= 4). Case and surrounding whitespace are ignored.
    cosine_sim : array-like, optional
        Not used by the computation; kept so existing calls that pass it
        continue to work.

    Returns
    -------
    pandas.DataFrame or str
        The 'Restaurant Name', 'Cuisines', 'Aggregate rating' and 'Votes'
        columns of the most similar restaurants (seed excluded), or a message
        string when the rating band is unknown or nothing matches.
    """
    ratings = df['Aggregate rating']
    rating_bands = {
        'cheap': ratings < 3,
        'moderate': (ratings >= 3) & (ratings < 4),
        'shine': ratings >= 4,
    }

    # An unrecognised band previously left the filter undefined and crashed;
    # report it the same way as the other "nothing found" cases.
    band = str(user_rating).strip().lower()
    if band not in rating_bands:
        return "Unknown rating category. Choose one of: cheap, moderate, shine."

    filtered_df = df[rating_bands[band]]

    # Check if there are restaurants matching the criteria
    if filtered_df.empty:
        return "No restaurants found matching your criteria."

    # Position (within filtered_df) of the first restaurant offering the cuisine
    wanted = str(user_cuisine).strip().lower()
    has_cuisine = filtered_df['Cuisines'].str.lower().str.contains(
        wanted, regex=False, na=False
    )
    if not has_cuisine.any():
        return "No restaurants found matching your cuisine preference."
    idx = int(has_cuisine.to_numpy().argmax())  # argmax of booleans = first True

    # Compare the seed only against the filtered restaurants, so that every
    # score position is a valid row position in filtered_df. Comparing against
    # the full matrix yields positions in `df`, which select the wrong rows
    # (or fall out of range) when used with filtered_df.iloc.
    tfidf_matrix_filtered = tfidf.transform(filtered_df['Cuisines'])
    seed_scores = cosine_similarity(
        tfidf_matrix_filtered[idx:idx + 1], tfidf_matrix_filtered
    ).ravel()

    # Rank everything except the seed itself. Dropping the seed by position is
    # reliable even when other restaurants tie with it at similarity 1.0.
    ranked = sorted(
        (pos for pos in range(len(seed_scores)) if pos != idx),
        key=lambda pos: seed_scores[pos],
        reverse=True,
    )
    restaurant_indices = ranked[:5]  # Top 5 recommendations

    # Return the top 5 most similar restaurants
    return filtered_df.iloc[restaurant_indices][['Restaurant Name', 'Cuisines', 'Aggregate rating', 'Votes']]



In [60]:
# Example usage: ask for a cuisine and a rating band, then show the recommendations
user_cuisine = input("Enter your preferred cuisine: ")
user_rating = input("Enter your preferred rating category (cheap, moderate, shine): ")
recommendations = get_recommendations(user_cuisine=user_cuisine, user_rating=user_rating)
recommendations
