In [59]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found underneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [60]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [61]:
# Load the supermarket sales sheet into a DataFrame
csv_path = '../input/supermarket-sales/supermarket_sales - Sheet1.csv'
data = pd.read_csv(csv_path)

## Basic checkups

In [62]:
# Preview the first few rows to get a feel for the columns and values
data.head()

In [63]:
# Count the missing values in every column
null_counts = data.isnull().sum()
null_counts

In [64]:
# Dimensions of the frame as (rows, columns)
data.shape

In [65]:
# Check the columns, their non-null counts and their datatypes
data.info()

There are no null values. Notice that the datatype of the 'Date' column is object, not datetime, so we need to convert it.
Also, the column names contain blank spaces. I prefer '_' over blank spaces.

## Data Cleaning

In [66]:
# Drop duplicate rows, if any.
# drop_duplicates() returns a NEW frame and leaves `data` untouched, so the result
# has to be assigned back; otherwise the duplicates are never actually removed.
data = data.drop_duplicates()

# Show the resulting dimensions instead of dumping the whole frame
data.shape

In [67]:
# Parse the 'Date' column into pandas datetimes and store it back under the same name
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates

In [68]:
# Swap every blank space in the column names for an underscore
data.columns = [column_name.replace(' ', '_') for column_name in data.columns]

In [69]:
# Inspect the columns and datatypes again after the cleaning steps
data.info()

## EDA

I'll divide this section into three main categories: 1) Branch performance, 2) Customer types and Gender, and 3) Payment methods and Product lines. I'll also try to see whether members generally spend more and whether they are happier.

#### Branch performance

In [70]:
# Rank the branches by the sum of 'Total' (revenue?), highest first
branch_total = data.groupby('Branch')['Total'].sum()
branch_total.sort_values(ascending=False)

In [71]:
# Rank the branches by their mean rating, highest first
branch_mean_rating = data.groupby('Branch')['Rating'].mean()
branch_mean_rating.sort_values(ascending=False)

In [72]:
# Mean rating per branch, looking only at rows whose Customer_type is 'Member'
is_member = data['Customer_type'] == 'Member'
member_rating = data[is_member]
member_rating.groupby('Branch')['Rating'].mean().sort_values(ascending=False)

In [73]:
# Mean rating per branch across all customers, best-rated branch first
overall_rating = data.groupby('Branch')['Rating'].mean()
overall_rating.sort_values(ascending=False)

In [74]:
# Bar chart of the average 'Total' per branch
sns.barplot(data=data, x='Branch', y='Total')

In [76]:
# Total sales for every (Branch, Month) pair.
# The month is derived from 'Date' right here rather than read from a 'Month'
# column, because that column is only added to `data` by a cell further down;
# on a fresh kernel run top-to-bottom it does not exist yet and the original
# groupby would raise a KeyError. assign() works on a copy, so `data` itself
# is not modified by this cell.
(
    data.assign(Month=data['Date'].dt.month)
    .groupby(['Branch', 'Month'])['Total']
    .sum()
    .to_frame()
)

In [75]:
# Compare sales across the branches, with one bar per month within each branch
data['Month'] = data['Date'].dt.month

sns.barplot(data=data, x='Branch', y='Total', hue='Month')


Branch C seems to be performing best, both in terms of total sales and ratings.

#### Customer types and Gender

In [77]:
# Mean of the 'Rating' column over the whole dataset
overall_mean_rating = data.Rating.mean()
overall_mean_rating

In [78]:
# Mean rating for each customer type
by_customer_type = data.groupby('Customer_type')
by_customer_type['Rating'].mean()

In [79]:
# Bar chart of the average 'Total' for each customer type
sns.barplot(data=data, x='Customer_type', y='Total')

Members on average spend a little more than non-members, but their ratings are no better. It appears that the membership program has not been very successful.

In [80]:
# Number of rows per gender, split by customer type, drawn on a 12x8 figure
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=data, x='Gender', hue='Customer_type', ax=ax)

In [81]:
# Mean rating for each gender
by_gender = data.groupby('Gender')
by_gender['Rating'].mean()

In [82]:
# Bar chart of the average 'Total' for each gender
sns.barplot(data=data, x='Gender', y='Total')

While the ratings are similar, female customers on average tend to spend more than their male counterparts.

Also, the number of female members is greater than the number of male members.

#### Payment methods and Product line

In [83]:
# Mean 'Total' per payment method, shown as a one-column frame
payment_mean_total = data.groupby('Payment')['Total'].mean()
payment_mean_total.to_frame()

In [84]:
# Mean rating per payment method
payment_groups = data.groupby('Payment')
payment_groups['Rating'].mean()

In [85]:
# Number of non-null invoice IDs recorded for each payment method
invoice_ids_by_payment = data.groupby('Payment')['Invoice_ID']
invoice_ids_by_payment.count()

Customers seem to prefer cash and Ewallet payments over credit card. They don't mind paying high amounts through cash. The ratings are similar for each.

In [86]:
# Mean rating of each product line, best-rated first, shown as a one-column frame
product_line_rating = data.groupby('Product_line')['Rating'].mean()
product_line_rating.sort_values(ascending=False).to_frame()

In [87]:
# Number of rows per product line, drawn on a 12x8 figure
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=data, x='Product_line', ax=ax)

In [88]:
# The supermarket's average rating is approx. 7, so keep only the rows rated 8 or
# above and count how many of them fall into each product line.
highly_rated = data['Rating'] >= 8
best_prod = data[highly_rated]
best_prod.groupby('Product_line')['Rating'].count().sort_values(ascending=False)

Food and beverages and Fashion accessories are the top two product lines, both in terms of the number of high ratings (8 or above) and the number of transactions recorded.

### Conclusion
This concludes my first EDA in Python.
There is still much more to uncover in this dataset, but I will stop here. Feel free to leave any feedback.
Thank you!