In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Raw string: the Windows path contains backslash sequences that are not valid
# escapes in a normal string literal; the runtime value is unchanged.
ZOMATO_CSV_PATH = r"D:\Msc Data Science and Analytics\Practise Datasets\zomato.csv"


def read_data_from_csv(path=None):
    """Read the Zomato dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. When omitted, ``ZOMATO_CSV_PATH`` (the
        location the notebook was originally written against) is used.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pd.read_csv``.
    """
    csv_path = ZOMATO_CSV_PATH if path is None else path
    return pd.read_csv(csv_path)

In [3]:
def remove_unwanted_columns():
    """Load the raw dataset and return it without the 'address' and 'phone' columns."""
    # read_data_from_csv() supplies the unmodified frame for this stage
    raw_hotels = read_data_from_csv()
    return raw_hotels.drop(columns=['address', 'phone'])

In [4]:
# 56252 rows
def rename_columns():
    """Return the column-pruned dataset with its columns renamed.

    The frame comes from remove_unwanted_columns(); the mapping below lists
    every old -> new column name applied here.
    """
    column_aliases = {
        'name': 'name',  # unchanged; listed so the mapping covers every renamed field
        'rate': 'rating',
        'approx_cost(for two people)': 'approx_cost',
        'listed_in(type)': 'type',
    }
    return remove_unwanted_columns().rename(columns=column_aliases)

In [5]:
#task3: handle null values of each column (56236 records after running this fn)
def null_value_check(hotels=None):
    """Drop rows without a name and fill the remaining nulls column by column.

    Parameters
    ----------
    hotels : pandas.DataFrame, optional
        Frame to clean. When omitted, the frame produced by rename_columns()
        is used, which is how the notebook pipeline calls this function.

    Returns
    -------
    pandas.DataFrame
        A new frame in which rows with a null 'name' are removed, the text
        columns are filled with the string "NA", and 'rating', 'votes' and
        'approx_cost' are filled with 0. The input frame is not modified.
    """
    if hotels is None:
        # call rename_columns() function to get dataframe
        hotels = rename_columns()

    text_columns = [
        "online_order", "book_table", "location", "rest_type",
        "dish_liked", "cuisines", "type",
    ]
    zero_filled_columns = ["rating", "votes", "approx_cost"]

    fill_values = {column: "NA" for column in text_columns}
    fill_values.update({column: 0 for column in zero_filled_columns})

    # deleting null values of name column
    hotels = hotels.dropna(subset=['name'])

    # One DataFrame-level fillna instead of `hotels[col].fillna(..., inplace=True)`:
    # the in-place call on a selected column is chained assignment, which does
    # not update the frame when pandas Copy-on-Write is active.
    return hotels.fillna(fill_values)

In [6]:
#task4: find duplicates in the dataset (38413 records after running this fn)
def find_duplicates():
    """Return the null-handled dataset with fully duplicated rows removed."""
    # null_value_check() provides the frame produced by the previous stage
    filled_hotels = null_value_check()
    # keep='first' retains the first occurrence of each duplicated row
    return filled_hotels.drop_duplicates(keep='first')

In [7]:
#task5 removing irrelevant text from all the columns
def removing_irrelevant_text():
    """Drop every row in which any scanned column contains 'RATED' or 'Rated'.

    The frame comes from find_duplicates(); the columns are filtered one after
    another in the order listed below.
    """
    hotels = find_duplicates()

    columns_to_scan = (
        'name', 'online_order', 'book_table', 'rating', 'votes', 'location',
        'rest_type', 'dish_liked', 'cuisines', 'approx_cost', 'type',
    )
    for column in columns_to_scan:
        has_marker = hotels[column].str.contains('RATED|Rated')
        # The comparison is deliberately `== False` rather than `~`.
        # NOTE(review): for non-string entries `.str.contains` appears to yield
        # NaN, which is not equal to False, so those rows are dropped as well -
        # confirm this is intended.
        hotels = hotels[has_marker == False]

    return hotels

In [8]:
#task6: check for unique values in each column and handle the irrelevant values
def _rating_to_score(value):
    """Convert one raw 'rating' entry to its score part, or 0 when unrated."""
    if isinstance(value, str):
        if value in ('NEW', '-'):
            # placeholders used for restaurants without a rating
            return 0
        # 'score/scale' text: keep only the part before the slash
        return value.split('/')[0]
    if value is None or pd.isna(value):
        return 0
    # already numeric (e.g. the 0 written when nulls were filled): keep as is
    return value


def check_for_unique_values(hotels=None):
    """Keep only rows with a Yes/No 'online_order' and normalise 'rating'.

    Parameters
    ----------
    hotels : pandas.DataFrame, optional
        Frame to process. When omitted, the frame produced by
        removing_irrelevant_text() is used, which is how the notebook pipeline
        calls this function.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only rows whose 'online_order' is 'Yes' or
        'No'. In 'rating', the values 'NEW' and '-' and missing values become
        0, other strings are cut at the first '/', and numeric values are left
        untouched. The input frame is not modified.
    """
    if hotels is None:
        # call removing_irrelevant_text() function to get dataframe
        hotels = removing_irrelevant_text()

    hotels = hotels[hotels['online_order'].isin(['Yes', 'No'])]

    # assign() builds a new frame, so the result does not rely on an in-place
    # replace applied to a column selected from a filtered frame.
    return hotels.assign(rating=hotels['rating'].map(_rating_to_score))

In [9]:
#task7: remove the unknown character from the dataset and export it to "zomatocleaned.csv"
def remove_the_unknown_character():
    """Strip stray characters from the data and export it to 'zomatocleaned.csv'.

    The frame comes from check_for_unique_values(). The cleaned frame is
    written to 'zomatocleaned.csv' and also returned.
    """
    dataframe = check_for_unique_values()

    # In 'name': delete each 'Ã' or 'x' together with the run of non-letter
    # characters that immediately follows it.
    cleaned_names = dataframe['name'].str.replace('[Ãx][^A-Za-z]+', '', regex=True)
    # In 'approx_cost': delete commas.
    cleaned_costs = dataframe['approx_cost'].str.replace(',', '', regex=True)

    dataframe['name'] = cleaned_names
    dataframe['approx_cost'] = cleaned_costs

    # export cleaned dataset to the new csv file named "zomatocleaned.csv"
    dataframe.to_csv('zomatocleaned.csv')

    return dataframe

In [10]:
# Run the whole cleaning pipeline: each stage function calls the previous one,
# and this last stage also writes 'zomatocleaned.csv'.
df = remove_the_unknown_character()

In [11]:
# Display the cleaned frame; the original note records its size as 30336 x 11.
df

Unnamed: 0,name,online_order,book_table,rating,votes,location,rest_type,dish_liked,cuisines,approx_cost,type
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,Buffet
1,Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,Buffet
2,San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,Buffet
3,Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,Buffet
4,Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,Buffet
...,...,...,...,...,...,...,...,...,...,...,...
56244,The Farm House Bar n Grill,No,No,3.7,34,Whitefield,"Casual Dining, Bar",,"North Indian, Continental",800,Pubs and bars
56246,Bhagini,No,No,2.5,81,Whitefield,"Casual Dining, Bar","Biryani, Andhra Meal","Andhra, South Indian, Chinese, North Indian",800,Pubs and bars
56247,Best Brews - Four Points by Sheraton Bengaluru...,No,No,3.6,27,Whitefield,Bar,,Continental,1500,Pubs and bars
56250,Chime - Sheraton Grand Bengaluru Whitefield Ho...,No,Yes,4.3,236,"ITPL Main Road, Whitefield",Bar,"Cocktails, Pizza, Buttermilk",Finger Food,2500,Pubs and bars
