https://www.kaggle.com/datasets/abhijitdahatonde/zomato-restaurants-dataset

In [None]:
import pandas as pd
# Load the Zomato restaurants dataset (Kaggle link at the top of this notebook).
# NOTE(review): '/content/...' is an absolute, Colab-style path - adjust when running elsewhere.
df = pd.read_csv('/content/zomato.csv')

1. Dropping missing values

In [None]:
# Per-column count of missing (NaN/None) entries in the raw frame
missing_values = df.isna().sum()
print(missing_values)

Unnamed: 0.1              0
Unnamed: 0                0
restaurant name           0
restaurant type           0
rate (out of 5)          68
num of ratings            0
avg cost (two people)    57
online_order              0
table booking             0
cuisines type             0
area                      0
local address             0
dtype: int64


In [None]:
# Keep only the rows that have no missing value in any column; `df` itself is left untouched
new_df = df.dropna(axis=0, how='any')

In [None]:
# Confirm that no missing values remain after dropping incomplete rows
print(new_df.isna().sum())

Unnamed: 0.1             0
Unnamed: 0               0
restaurant name          0
restaurant type          0
rate (out of 5)          0
num of ratings           0
avg cost (two people)    0
online_order             0
table booking            0
cuisines type            0
area                     0
local address            0
dtype: int64


2. Filling missing values with the median

In [None]:
# Re-inspect the original frame: `df` still holds its missing values,
# because the earlier dropna() result was stored in a separate variable
missing_values = df.isna().sum()
print(missing_values)

Unnamed: 0.1              0
Unnamed: 0                0
restaurant name           0
restaurant type           0
rate (out of 5)          68
num of ratings            0
avg cost (two people)    57
online_order              0
table booking             0
cuisines type             0
area                      0
local address             0
dtype: int64


In [None]:
# Impute missing ratings with the median of the rating column.
# NOTE(review): `new_df2` is a Series containing only the rating column, not a full DataFrame.
median_rate = df['rate (out of 5)'].median()
new_df2 = df['rate (out of 5)'].fillna(value=median_rate)

In [None]:
# Number of ratings still missing after the median fill
print(new_df2.isna().sum())

0


3. Standardizing Text

In [None]:
# Lowercase the restaurant name and cuisines columns so values differing only by case match
df['restaurant name'] = df['restaurant name'].str.lower()
df['cuisines type'] = df['cuisines type'].str.lower()

# Remove special characters (anything that is neither a word character nor whitespace)
# from restaurant type. regex=True is required: since pandas 2.0 str.replace treats the
# pattern as a literal string by default, which would leave the column unchanged.
# The raw string avoids the invalid-escape warning for '\w' / '\s'.
df['restaurant type'] = df['restaurant type'].str.replace(r'[^\w\s]', '', regex=True)

# Trim leading and trailing whitespace
df['restaurant type'] = df['restaurant type'].str.strip()

4. Removing Duplicates

In [None]:
# The 'Unnamed: ...' columns appear to be row indices saved into the CSV (0, 1, 2, ... in
# the preview). Comparing on them would make every row unique, so duplicates are detected
# on the remaining columns only. Falls back to all columns if nothing else is left.
content_columns = [col for col in df.columns if not str(col).startswith('Unnamed')] or None

# Check for duplicate rows
duplicate_rows = df.duplicated(subset=content_columns).sum()
print("Number of duplicate rows:", duplicate_rows)

# Remove duplicate rows (the first occurrence is kept; all columns are retained)
newdf3 = df.drop_duplicates(subset=content_columns)


Number of duplicate rows: 0


5. Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale 'avg cost (two people)' and store the result in a new
# 'Avg Cost Normalized' column; the scaler is fitted on this same column.
scaler = MinMaxScaler()
df['Avg Cost Normalized'] = scaler.fit(df[['avg cost (two people)']]).transform(
    df[['avg cost (two people)']]
)


In [None]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,restaurant name,restaurant type,rate (out of 5),num of ratings,avg cost (two people),online_order,table booking,cuisines type,...,"Cuisine_tibetan, nepalese","Cuisine_tibetan, nepalese, momos","Cuisine_turkish, chinese","Cuisine_turkish, desserts","Cuisine_turkish, fast food, biryani, chinese","Cuisine_turkish, rolls",Cuisine_vietnamese,"Cuisine_vietnamese, salad","Cuisine_vietnamese, thai, burmese, japanese",Avg Cost Normalized
0,0,0,#feeltheroll,Quick Bites,3.4,7,200.0,No,No,fast food,...,False,False,False,False,False,False,False,False,False,0.026846
1,1,1,#l-81 cafe,Quick Bites,3.9,48,400.0,Yes,No,"fast food, beverages",...,False,False,False,False,False,False,False,False,False,0.060403
2,2,2,#refuel,Cafe,3.7,37,400.0,Yes,No,"cafe, beverages",...,False,False,False,False,False,False,False,False,False,0.060403
3,3,3,'@ biryani central,Casual Dining,2.7,135,550.0,Yes,No,"biryani, mughlai, chinese",...,False,False,False,False,False,False,False,False,False,0.08557
4,4,4,'@ the bbq,Casual Dining,2.8,40,700.0,Yes,No,"bbq, continental, north indian, chinese, bever...",...,False,False,False,False,False,False,False,False,False,0.110738


6. Dropping unnecessary columns

In [None]:
# Remove the 'cuisines type' column (assuming it's not needed anymore).
# The column name is all lowercase in this dataset; 'Cuisines Type' raises a KeyError.
df.drop(columns=['cuisines type'], inplace=True)

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus that preprocess_text reads via stopwords.words('english')
nltk.download('stopwords')

def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Normalize a piece of free text.

    Steps, in order: lowercase, remove every character that is neither a word
    character nor whitespace, remove digit runs, drop English stop words, and
    join the remaining tokens with single spaces.

    Args:
        text: Raw text. Non-string values (for example None or NaN coming from
            a missing cell) are treated as empty text instead of raising.

    Returns:
        The cleaned text; '' if the input is not a string or nothing remains.
    """
    if not isinstance(text, str):
        return ''
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove stopwords; the set is cached so it is not rebuilt for every row.
    # split()/join also collapses any run of whitespace into a single space.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)


In [None]:
# Apply preprocess_text function to 'review' column.
# NOTE(review): the column listing printed earlier in this notebook shows no 'review'
# column - confirm which text column is intended before running this cell.
df['clean_review'] = df['review'].apply(preprocess_text)

# Display the raw and preprocessed text side by side.
# Uses `df`: the previously referenced `flipkart_data` is never defined in this notebook.
print(df[['review', 'clean_review']].head())