# Task 3 : Performing EDA On Sample Superstore Dataset
# Submitted by : Jainil Shah
# You are a business manager. As a business manager, you want to find the weak areas where you can work to make more profit.
## 1. Importing Libraries

In [None]:
# used for working with arrays
import numpy as np 
# it offers data structures and operations for manipulating numerical tables and time series
import pandas as pd 
#  data visualization and graphical plotting
import matplotlib.pyplot as plt
# provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
# plotly.express is a high-level Python visualization library: a wrapper for Plotly.py that exposes a simple syntax for complex charts.
# NOTE: it is described here but not imported in this cell.

# 2. Importing the dataset

In [None]:
# Read the Superstore CSV from its local path into a DataFrame
csv_path = 'D://Machine Learning//Task 3 - Dataset.csv'
df = pd.read_csv(csv_path)
# Preview the first rows as the cell output
df.head()

# 3.Shape of the dataset

In [None]:
# df.shape is a (rows, columns) tuple; unpack it once and report both counts
n_rows, n_cols = df.shape
print('Number of rows: ', n_rows)
print('Number of columns: ', n_cols)

# 4.column names of table

In [None]:
# Column labels of the DataFrame (a pandas Index), shown as the cell output
df.columns

# 5. Check the attribute types

In [None]:
# Data type (dtype) of every column/attribute, returned as a Series indexed by column name
df.dtypes

# 6.Check the missing values

In [None]:
# Count the missing (NaN/None) entries in each column
df.isna().sum()

# 7.Check the dataset for duplicate

In [None]:
# Boolean mask marking every row that repeats an earlier row
duplicate_mask = df.duplicated()
# Summing the mask gives the number of duplicate rows
duplicate_mask.sum()

# 8. drop duplicates

In [None]:
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result must be assigned back for the duplicates to actually be removed.
df = df.drop_duplicates()
# Show the de-duplicated frame as the cell output
df

# 9.Checking the unique values in columns

In [None]:
# Number of distinct values in each column (missing values are not counted by default)
df.nunique()

# 10. Correlation

In [None]:
# Pairwise correlation between the numeric columns of the dataframe.
# Text columns are excluded explicitly: correlation is undefined for them, and
# pandas >= 2.0 raises instead of silently dropping non-numeric columns.
df.select_dtypes(include='number').corr()

In [None]:
# A correlation heatmap uses colored cells,
# typically in a monochromatic scale,
# to show a 2D correlation matrix (table) between two discrete dimensions.
fig, axes = plt.subplots(1, 1, figsize=(9, 6))
# Only numeric columns can be correlated; pandas >= 2.0 raises on text columns
corr_matrix = df.select_dtypes(include='number').corr()
# annot=True writes each coefficient inside its cell
sns.heatmap(corr_matrix, annot=True, ax=axes)
plt.show()

# 11.Covariance

In [None]:
# Covariance measures how two variables vary together: positive when they tend
# to move in the same direction, negative when they move in opposite directions.
# Compute the pairwise covariance of the numeric columns;
# the returned DataFrame is the covariance matrix of those columns.
# Text columns are excluded explicitly because pandas >= 2.0 raises on them.
df.select_dtypes(include='number').cov()

In [None]:
# Heatmap of the covariance matrix of the numeric columns
fig, axes = plt.subplots(1, 1, figsize=(9, 6))
# Only numeric columns have a covariance; pandas >= 2.0 raises on text columns
cov_matrix = df.select_dtypes(include='number').cov()
# annot=True writes each covariance value inside its cell
sns.heatmap(cov_matrix, annot=True, ax=axes)
plt.show()

# 12.Find the Series containing counts of unique values

In [None]:
# flatten: return a copy of the array collapsed into one dimension,
# so every cell of the table is counted regardless of its column.
# The top-level pd.value_counts() helper is deprecated; wrapping the values in a
# Series and calling .value_counts() gives the same counts on every pandas version.
pd.Series(df.values.flatten()).value_counts()

# 13.Deleting the Variable

In [None]:
# Columns to remove from the analysis copy
col = ['Postal Code']
# drop() returns a new frame; df itself keeps the column
df1 = df.drop(columns=col)

# 14.Visualizing the dataset

In [None]:
# A bar chart presents categorical data with rectangular bars whose heights or
# lengths are proportional to the values they represent.
# The bars can be plotted vertically or horizontally.
# NOTE(review): the bar heights here are the 'Category' labels rather than a
# numeric measure - confirm this is the intended chart.
plt.figure(figsize=(16, 8))
plt.bar(x='Sub-Category', height='Category', data=df)
plt.show()

In [None]:
# Number of rows recorded for each state, most frequent first
state_counts = df1['State'].value_counts()
print(state_counts)

In [None]:
# Bar per state showing how many observations fall in it
plt.figure(figsize=(15, 8))
sns.countplot(x='State', data=df1)
# Rotate the state names so they do not overlap
plt.xticks(rotation=80)
plt.show()


In [None]:
# Number of rows recorded for each sub-category, most frequent first
subcategory_counts = df['Sub-Category'].value_counts()
print(subcategory_counts)

In [None]:
# Bar per sub-category showing how many observations fall in it
plt.figure(figsize=(12, 6))
sns.countplot(x='Sub-Category', data=df)
# Rotate the labels so they do not overlap
plt.xticks(rotation=80)
plt.show()

In [None]:
# Mean Profit per Sub-Category (barplot aggregates with the mean by default)
plt.figure(figsize=(40, 30))
# ci=None is the documented way to switch the error bars off; ci=False is
# treated as a 0-width confidence level, so it still runs the bootstrap.
sns.barplot(x=df['Sub-Category'], y=df['Profit'], ci=None)

In [None]:
# Number of observations in each customer segment
sns.countplot(x='Segment', data=df)

In [None]:
# Number of observations in each region
sns.countplot(x='Region', data=df)

In [None]:
# Line plot of Profit against Discount; rows sharing the same Discount are
# aggregated (mean by default) into a single point on the line.
plt.figure(figsize=(10, 4))
# x and y are passed by keyword: they are keyword-only in seaborn >= 0.12, where
# the positional form fails. ci=None is the documented way to disable the
# confidence band (ci=False is treated as a 0-width interval).
sns.lineplot(x='Discount', y='Profit', data=df, color='r', label='Discount', ci=None)
plt.legend()

In [None]:
# One histogram per numeric column of df1.
# bins is the number of bars drawn in each histogram.
hist_options = {'bins': 50, 'figsize': (20, 15)}
df1.hist(**hist_options)
plt.show()

In [None]:
# pairplot draws the pairwise relationships in a dataset: it creates a grid of
# Axes in which each variable is shared on the y-axis across a single row
# and on the x-axis across a single column.
# NOTE(review): this assignment is not passed to pairplot, so it has no effect
# on the figure size.
figsize = (30, 30)
# Points are colored by their Sub-Category
sns.pairplot(data=df1, hue='Sub-Category')

# 16. Now, group the data by ship mode, segment, category, sub-category, state and region, and sum the quantity, discount, sales and profit within each group

In [None]:
# Keys that define one group, and the numeric measures summed within each group
group_keys = ['Ship Mode', 'Segment', 'Category', 'Sub-Category', 'State', 'Region']
value_cols = ['Quantity', 'Discount', 'Sales', 'Profit']
# Several columns must be selected with a list (double brackets): the bare
# tuple form groupby(...)['a', 'b'] raises on pandas >= 2.0.
# reset_index() already returns a DataFrame with the keys as ordinary columns,
# so no extra pd.DataFrame(...) wrapper is needed.
grouped = df.groupby(group_keys)[value_cols].sum().reset_index()
grouped

# 17. Sum, mean, min, max, count, median, standard deviation and variance of Profit for each state

In [None]:
# Summary statistics of Profit within each State: one row per state,
# one column per statistic listed below
profit_stats = ["sum", "mean", "min", "max", "count", "median", "std", "var"]
df.groupby("State")["Profit"].agg(profit_stats)

# 18.K means Clustering

In [None]:
# K-means is an unsupervised machine learning algorithm that forms clusters of
# data based on the similarity between data instances.
# The number of clusters (the K in K-means) has to be defined beforehand.
from sklearn.cluster import KMeans

# Feature matrix: all rows, columns at positions 9, 10, 11 and 12.
# NOTE(review): presumably Sales, Quantity, Discount and Profit - confirm against df.columns
x = df.iloc[:, [9, 10, 11, 12]].values

# For each value of K we record the WCSS (Within-Cluster Sum of Squares): the sum
# of squared distances between each point and the centroid of its cluster.
# As the number of clusters increases, the WCSS value decreases.
wcss = []
# Train one model per cluster count and store its inertia_ property (the WCSS).
# k-means++ ensures that you don't fall into the random initialization trap.
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(x)
    wcss.append(kmeans.inertia_)

# Plot the WCSS against K: the curve looks like an elbow, and the bend marks a
# reasonable number of clusters. The values were computed above but never drawn.
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# style types: whitegrid, darkgrid, dark, ticks, white
sns.set_style("darkgrid")
# FacetGrid maps a dataset onto axes; here one scatter of Sales vs Quantity,
# colored by Sub-Category.
sns.FacetGrid(df, hue="Sub-Category", height=6).map(plt.scatter, 'Sales', 'Quantity')
# Centroids are drawn in black. `kmeans` is the last model fitted in the loop
# (10 clusters); the first two feature columns are used as coordinates.
# NOTE(review): assumes feature columns 0 and 1 are Sales and Quantity - confirm
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='black', label='Centroids')
plt.legend()

In [None]:
# Grid of pairwise plots for the numeric columns of df1
sns.pairplot(data=df1)

In [None]:
# A box plot depicts groups of numerical data through their quartiles
# and is used to detect outliers in a data set.
# A box plot consists of 5 things:
# a) Minimum
# b) First Quartile or 25%
# c) Median (Second Quartile) or 50%
# d) Third Quartile or 75%
# e) Maximum
fig, axes = plt.subplots(figsize=(10, 10))
# Pass the Series as x= explicitly: a bare positional argument is interpreted
# differently across seaborn versions, and x= matches the countplot calls
# used elsewhere in this notebook.
sns.boxplot(x=df['Sales'])

In [None]:
# Box plot of Discount, used to spot outliers
fig, axes = plt.subplots(figsize=(10, 10))

# Pass the Series as x= explicitly: a bare positional argument is interpreted
# differently across seaborn versions.
sns.boxplot(x=df['Discount'])

In [None]:
# Box plot of Profit, used to spot outliers
fig, axes = plt.subplots(figsize=(10, 10))

# Pass the Series as x= explicitly: a bare positional argument is interpreted
# differently across seaborn versions.
sns.boxplot(x=df['Profit'])

In [None]:
# A quantile is the value below which a given fraction of a distribution lies.
# Compute the 25% and 75% quantiles of every numeric column in one call.
quartiles = df.quantile(q=[0.25, 0.75], axis=0, numeric_only=True, interpolation='linear')
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
# Interquartile range: the spread of the middle 50% of each column
IQR = Q3 - Q1

print(IQR)

# 19. Plotting scatterplot

In [None]:
# Scatter plot: the values of two variables are plotted along two axes, and the
# pattern of the resulting points reveals any correlation present.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=df["Sales"], y=df["Profit"])
# Label both axes in one call
ax.set(xlabel='Sales', ylabel='Profit')
plt.show()

# 20. Distribution plot

In [None]:
# Distribution of Sales: print the summary statistics, then draw the histogram
sales = df['Sales']
print(sales.describe())
plt.figure(figsize=(9, 8))
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot or
# displot are the suggested replacements.
hist_style = {'alpha': 0.4}
sns.distplot(sales, color='b', bins=100, hist_kws=hist_style);