In [None]:
import pandas as pd
import re

In [None]:
# Relative locations of the two source CSV files.
# Forward slashes are used because '\z' and '\y' are not valid escape sequences
# in a normal string literal, and '/' is accepted as a separator on every platform.
zomato_data = 'restaurants1/zomato.csv'
yelp_data = 'restaurants1/yelp.csv'

In [None]:
# Read each source CSV into its own DataFrame using pandas' default parsing options.
zomato_df = pd.read_csv(filepath_or_buffer=zomato_data)
yelp_df = pd.read_csv(filepath_or_buffer=yelp_data)

In [None]:
###  Remove duplicate entries (places with the same values in the NAME and ADDRESS fields).
# drop_duplicates keeps the first row of every (NAME, ADDRESS) pair by default.
# The row count is printed before and after each removal so the effect is visible.
print(len(zomato_df), 'before removing zomato duplicates')
zomato_df.drop_duplicates(subset=['NAME', 'ADDRESS'], inplace=True)
print(len(zomato_df), 'after removing zomato duplicates')
print(len(yelp_df), 'before removing yelp duplicates')
yelp_df.drop_duplicates(subset=['NAME', 'ADDRESS'], inplace=True)
print(len(yelp_df), 'after removing yelp duplicates')

In [None]:
###  Remove restaurants with "review of" in their name

# The filtered frames are assigned back; a bare filter expression only displays
# its result and leaves the original frame unchanged.
# na=False makes rows with a missing NAME count as "no match", so they are kept
# and the boolean mask can be negated safely.
# .copy() gives each name an independent frame for the column assignments that follow.
zomato_is_review = zomato_df['NAME'].str.contains('review of', case=False, na=False)
zomato_df = zomato_df[~zomato_is_review].copy()

yelp_is_review = yelp_df['NAME'].str.contains('review of', case=False, na=False)
yelp_df = yelp_df[~yelp_is_review].copy()

In [None]:
### TODO: Fix special characters (not implemented yet)

In [None]:
### TODO: Remove entries with numbers instead of names (not implemented yet)

In [None]:
### For places with a '-' in their name, keep only the characters before the first '-'.

for places in (zomato_df, yelp_df):
    # Only the piece before the first '-' is used, so a single split is enough.
    places['NAME'] = places['NAME'].str.split('-', n=1).str.get(0)

In [None]:
###  Remove places sharing the same phone number (the first occurrence is kept).

def _drop_repeated_phone_numbers(places):
    """Return a copy of ``places`` without rows that repeat an earlier phone number.

    Phone numbers are compared on their digits and '+' signs only, so two
    spellings of the same number (e.g. with and without dashes) count as equal.
    Rows whose PHONENUMBER is missing are always kept: a missing value is not a
    shared phone number, whereas a plain ``drop_duplicates`` would treat all
    missing values as equal and keep only the first such row.

    Parameters
    ----------
    places : pandas.DataFrame
        Frame with a ``PHONENUMBER`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the PHONENUMBER values themselves are left as they were.
    """
    phones = places['PHONENUMBER']
    has_phone = phones.notna()
    # Comparison key: formatting stripped, missing values left missing.
    comparable = phones.astype(str).str.replace(r'[^\d+]', '', regex=True).where(has_phone)
    is_repeat = has_phone & comparable.duplicated()
    return places[~is_repeat].copy()

zomato_df = _drop_repeated_phone_numbers(zomato_df)
yelp_df = _drop_repeated_phone_numbers(yelp_df)

In [None]:
def clean_phone_number(phone_number):
    """Strip every character except digits and '+' from a phone number.

    Parameters
    ----------
    phone_number : str or missing value
        Raw phone number text, e.g. ``'(608) 555-0100'``.

    Returns
    -------
    str or missing value
        The digits and '+' signs of ``phone_number`` in their original order.
        A missing value (``None`` or NaN) is returned unchanged, because
        ``re.sub`` only accepts strings and would raise ``TypeError`` on it.
    """
    if pd.isna(phone_number):
        return phone_number

    # Inside a character class '+' is a literal, so this removes anything that
    # is neither a digit nor a plus sign.
    pattern = r'[^\d+]'
    return re.sub(pattern, '', phone_number)

# Run clean_phone_number over every PHONENUMBER value of both frames.
zomato_df['PHONENUMBER'] = zomato_df['PHONENUMBER'].map(clean_phone_number)
yelp_df['PHONENUMBER'] = yelp_df['PHONENUMBER'].map(clean_phone_number)

In [None]:
###  Fix NaN values in ratings.

def _fill_missing_ratings(places, fallback):
    """Return the RATING column of ``places`` with the gaps filled in.

    Rules, in order of precedence:
    1. a place with ``NO_OF_REVIEWS == 0`` gets a rating of 0;
    2. otherwise a missing rating is replaced by ``fallback``;
    3. otherwise the existing rating is kept.

    Parameters
    ----------
    places : pandas.DataFrame
        Frame with ``RATING`` and ``NO_OF_REVIEWS`` columns.
    fallback : float
        Value used for missing ratings.

    Returns
    -------
    pandas.Series
        The adjusted ratings, indexed like ``places``.
    """
    filled = places['RATING'].fillna(fallback)
    # Applied last so the "no reviews" rule wins over both other rules.
    return filled.mask(places['NO_OF_REVIEWS'] == 0, 0)

# Each mean is taken before the column is modified; pandas skips missing values.
mean_rating = zomato_df['RATING'].mean()
zomato_df['RATING'] = _fill_missing_ratings(zomato_df, mean_rating)

mean_rating = yelp_df['RATING'].mean()
yelp_df['RATING'] = _fill_missing_ratings(yelp_df, mean_rating)

# Show a small sample rather than the whole frame.
yelp_df.head()

In [None]:
### Strip leading and trailing whitespace from restaurant names.

zomato_names_trimmed = zomato_df['NAME'].str.strip()
yelp_names_trimmed = yelp_df['NAME'].str.strip()

zomato_df['NAME'] = zomato_names_trimmed
yelp_df['NAME'] = yelp_names_trimmed

In [None]:
# Derive CITY / STATE (and ZIP CODE for Yelp) from the ', '-separated ADDRESS text.

# Zomato: split from the right into at most three pieces; piece 1 is stored as
# CITY and piece 2 as STATE.
split_address = zomato_df['ADDRESS'].str.rsplit(', ', n=2, expand=True)
zomato_df['CITY'] = split_address[1]
zomato_df['STATE'] = split_address[2]

# Yelp: split from the right into at most four pieces; piece 1 is stored as CITY
# and piece 2 is split again into STATE and ZIP CODE.
# NOTE(review): this presumes every Yelp address has the same number of ', '
# separators - confirm against the data.
split_address = yelp_df['ADDRESS'].str.rsplit(', ', n=3, expand=True)
yelp_df['CITY'] = split_address[1]

# Split only at the LAST space (n=1) so there are never more than two pieces.
# With n=2, a value containing two spaces produced three columns and the
# two-column assignment failed. Positional access yields a missing value when a
# piece is absent instead of raising.
state_and_zip = split_address[2].str.rsplit(' ', n=1)
yelp_df['STATE'] = state_and_zip.str[0]
yelp_df['ZIP CODE'] = state_and_zip.str[1]

In [None]:
# Attach to every row the number of rows in the same frame that share its CITY.

# Zomato: count rows per city, then look each row's city up in those counts.
city_frequency_zomato = zomato_df.loc[:, 'CITY'].value_counts()
zomato_df['CITY_FREQUENCY'] = zomato_df.loc[:, 'CITY'].map(city_frequency_zomato)

# Yelp: same two steps on the Yelp frame.
city_frequency_yelp = yelp_df.loc[:, 'CITY'].value_counts()
yelp_df['CITY_FREQUENCY'] = yelp_df.loc[:, 'CITY'].map(city_frequency_yelp)

In [None]:
# Flag restaurant chains: a NAME that occurs more often than the threshold.

chain_threshold = 3  # a name must appear MORE than this many times to be flagged

zomato_name_counts = zomato_df['NAME'].value_counts()
yelp_name_counts = yelp_df['NAME'].value_counts()

# Look up how often each row's name occurs in its own frame and compare with the threshold.
zomato_df['IS_CHAIN'] = zomato_df['NAME'].map(zomato_name_counts).gt(chain_threshold)
yelp_df['IS_CHAIN'] = yelp_df['NAME'].map(yelp_name_counts).gt(chain_threshold)

# Display the Zomato rows that were flagged.
zomato_df.loc[zomato_df['IS_CHAIN']]