In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
class DataPreprocessing:
    """Load, explore, clean and summarise a restaurant listings dataset.

    Every method works on a pandas DataFrame and reads only the columns it
    indexes explicitly (for example ``online_order``, ``book_table``,
    ``rate``, ``location``, ``cuisines`` and ``rest_type``).
    """

    def loadDataSet(self, path):
        """Read the CSV file at ``path`` and return it as a DataFrame.

        Args:
            path: Anything accepted by ``pandas.read_csv`` as its first
                argument.

        Returns:
            The DataFrame produced by ``pandas.read_csv``.
        """
        print("Loading DataSet...")
        dataFrame = pd.read_csv(path)
        print("DataSet loaded successfully...")
        return dataFrame

    def preprocessingPhase(self, dataSet):
        """Plot the exploratory charts, then return the cleaned data.

        Args:
            dataSet: Raw DataFrame. It is plotted first and cleaned afterwards.

        Returns:
            The DataFrame returned by ``dataCleaningToRemoveDublicateData``.
        """
        self.plotsToUnderstandData(dataSet)
        return self.dataCleaningToRemoveDublicateData(dataSet)

    def _plotValueCounts(self, dataSet, column, xLabel, yLabel, figSize):
        """Show a bar chart of how often each value of ``column`` occurs."""
        sns.set(rc={'figure.figsize': figSize})
        dataSet[column].value_counts().plot(kind='bar')
        plt.margins(0.02)
        # The distinct values sit on the x axis; their frequencies on the y axis.
        plt.xlabel(xLabel)
        plt.ylabel(yLabel)
        plt.show()

    def plotsToUnderstandData(self, dataSet):
        """Display value-count bar charts for four columns of ``dataSet``.

        Charts are shown for ``online_order``, ``book_table``, ``rate`` and
        ``location``, in that order. Nothing is returned.

        Args:
            dataSet: DataFrame containing the four columns listed above.
        """
        print("Graph of number of people who order online on Zomato")
        self._plotValueCounts(
            dataSet, 'online_order', 'Online Order', 'Count', (13.7, 9.27))

        print("Graph of number of people who book table on Zomato")
        self._plotValueCounts(
            dataSet, 'book_table', 'Book Table on Zomato', 'Count',
            (13.7, 9.27))

        print("Graph of ratings of people vs count")
        self._plotValueCounts(dataSet, 'rate', 'Rating', 'Count', (18.7, 14.27))

        print("Graph of number of restaurants in a particular location")
        self._plotValueCounts(
            dataSet, 'location', 'Location', 'Count', (18.7, 14.27))

        print("Conclusions drawn from the graph are:")
        print("TODO and analyze graphs in more different ways")

    def dataCleaningToRemoveDublicateData(self, dataSet):
        """Return a copy of ``dataSet`` without duplicated or incomplete rows.

        Rows count as duplicates when they share the same ``address``,
        ``listed_in(type)`` and ``listed_in(city)``. Rows holding a missing
        value in any column are removed afterwards. The input is not modified.

        Args:
            dataSet: DataFrame containing the three columns named above.

        Returns:
            A new, filtered DataFrame.
        """
        # NOTE(review): keep=False drops *every* row of a duplicated group,
        # including its first occurrence. Confirm that discarding all copies
        # (rather than keeping one of them) is really intended.
        deduplicated = dataSet.drop_duplicates(
            subset=['address', 'listed_in(type)', 'listed_in(city)'],
            keep=False)
        # Non-inplace dropna avoids mutating a frame derived from the input.
        return deduplicated.dropna()

    @staticmethod
    def _parseRating(rawRate):
        """Return the numeric part of a rating such as ``"4.1/5"``.

        Returns ``None`` when the text before the first ``/`` is not a
        number (or is NaN), so the caller can skip the row.
        """
        try:
            ratingValue = float(str(rawRate).split('/')[0])
        except ValueError:
            return None
        return None if pd.isna(ratingValue) else ratingValue

    def determineNeighbourWithHighestRating(self, cleanedDataSet):
        """Print and return the location with the highest mean rating.

        Each row's ``rate`` is parsed as the number before the first ``/``.
        Rows whose rating cannot be parsed are ignored. For the winning
        location the collected ``cuisines`` and ``rest_type`` values of its
        rated rows are printed as well. When several locations share the
        best mean, the one appearing first in the data wins.

        Args:
            cleanedDataSet: DataFrame with the columns ``location``, ``rate``,
                ``cuisines`` and ``rest_type``.

        Returns:
            The winning location, or ``None`` when no row has a parseable
            rating.
        """
        print("Calculating the neightbourhood with the highest rating...")

        ratingTotals = {}
        ratingCounts = {}
        cuisinesByLocation = {}
        restTypesByLocation = {}

        relevantColumns = cleanedDataSet[
            ['location', 'rate', 'cuisines', 'rest_type']]
        for location, rate, cuisines, restType in relevantColumns.itertuples(
                index=False, name=None):
            ratingValue = self._parseRating(rate)
            if ratingValue is None:
                continue
            # Track sum and count separately so a true mean can be computed;
            # a running total would simply favour the busiest location.
            ratingTotals[location] = ratingTotals.get(location, 0.0) + ratingValue
            ratingCounts[location] = ratingCounts.get(location, 0) + 1
            # Only text values are collected, so a missing cell cannot break
            # the joins below.
            if isinstance(cuisines, str):
                cuisinesByLocation.setdefault(location, []).append(cuisines)
            if isinstance(restType, str):
                restTypesByLocation.setdefault(location, []).append(restType)

        if not ratingTotals:
            print("No valid ratings found; "
                  "cannot determine the highest rating neighbourhood.")
            return None

        averageByLocation = {
            location: ratingTotals[location] / ratingCounts[location]
            for location in ratingTotals
        }
        # max() returns the first maximal key, i.e. first-seen wins on ties.
        highestRatingNeighbourhood = max(
            averageByLocation, key=averageByLocation.get)

        print("Highest Rating Neighbourhood is "
              + str(highestRatingNeighbourhood))
        print("-----------------------------------------------------------")
        print("Cuisines famous in the area are: "
              + ", ".join(cuisinesByLocation.get(highestRatingNeighbourhood, [])))
        print("-----------------------------------------------------------")
        print("Type of restaurants famous in the area are: "
              + " , ".join(
                  restTypesByLocation.get(highestRatingNeighbourhood, [])))
        return highestRatingNeighbourhood