# **Importing Libraries**

In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# **Loading Dataset**

In [73]:
# Relative path of the Superstore CSV that the rest of the notebook analyses.
file_name = '../input/retailsuperstore/SampleSuperstore.csv'

# Parse the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_name)

In [74]:
# Preview the first five rows of the DataFrame that was loaded into `df`.
df.head()

**Print nine random rows**

In [75]:
# Nine rows picked at random, to look at the data beyond its first rows.
df.sample(n=9)

**Print the last five rows**

In [76]:
# Final five rows of the frame.
df.tail(n=5)

**Check missing values**

In [77]:
# Number of missing entries in each column.
df.isna().sum()

**Total number of null values in a dataset**

In [78]:
# Grand total of missing entries: per-column counts summed once more.
print("Total number of null values = ", df.isna().sum().sum())

**Full summary of the dataframe**

In [79]:
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called on its own; wrapping it in print() adds a stray "None" line.
df.info()

**Statistical details of the dataset**

In [80]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

 **Shape of the Dataset**

In [81]:
# (number of rows, number of columns) of the frame.
df.shape

**dtypes in the Dataset**

In [82]:
# Data type pandas assigned to each column.
df.dtypes

**Column names inside the dataset**

In [83]:
# Column labels of the frame.
df.columns

**Checking the dataset for duplicate rows and dropping them**

In [84]:
# How many rows repeat an earlier row exactly (the first occurrence is not counted).
df.duplicated(keep='first').sum()

In [85]:
# drop_duplicates() returns a new frame and leaves `df` untouched, so the
# result has to be assigned back; otherwise the duplicate rows remain in
# every later computation. Re-running this cell is harmless.
df = df.drop_duplicates()

# Shape after de-duplication, as a quick confirmation of the rows removed.
df.shape

**Number of distinct values in each column**

In [86]:
# Count of distinct values per column (axis=0 walks down each column).
df.nunique(axis=0)

**Correlation of dataset**

In [87]:
# Pairwise correlation of the numeric columns only. The frame also contains
# non-numeric columns, and pandas >= 2.0 raises on them instead of silently
# skipping them, so the numeric subset is selected explicitly.
df.select_dtypes(include='number').corr()

**Find the covariance of dataset**

In [88]:
# Pairwise covariance of the numeric columns only. The frame also contains
# non-numeric columns, and pandas >= 2.0 raises on them instead of silently
# skipping them, so the numeric subset is selected explicitly.
df.select_dtypes(include='number').cov()

**Counts of each unique row in the dataset**

In [89]:
# Called on the whole frame, value_counts() counts how often each distinct
# row (combination of all column values) occurs, most frequent first.
df.value_counts()

**Dropping the `Postal Code` column**

In [90]:
# Columns excluded from the plotting copy of the data.
col = ['Postal Code']

# `df1` is `df` without those columns; `df` itself is left unchanged.
df1 = df.drop(columns=col)

# **Visualization of the Dataset**

In [91]:
# Number of rows per sub-category, coloured by the category it belongs to.
# Passing 'Category' (a label column, not a quantity) as the bar *height*
# draws one bar per row against a categorical y-axis, which conveys no
# magnitude; counting rows gives the bars a meaningful height.
plt.figure(figsize=(16,8))
sns.countplot(data=df, x='Sub-Category', hue='Category', dodge=False)
plt.xticks(rotation=90)
plt.title('Rows per Sub-Category, by Category')
plt.xlabel('Sub-Category')
plt.ylabel('Number of rows')
plt.show()

In [92]:
# Frequency table of rows per state, then the same counts as a bar chart.
state_counts = df1['State'].value_counts()
print(state_counts)

plt.figure(figsize=(15, 8))
sns.countplot(data=df1, x='State')
plt.xticks(rotation=90)  # state names are long; rotate them so they do not overlap
plt.show()

In [93]:
# Frequency table of rows per sub-category, then the same counts as a bar chart.
subcategory_counts = df['Sub-Category'].value_counts()
print(subcategory_counts)

plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='Sub-Category')
plt.xticks(rotation=90)  # rotate the labels so they do not overlap
plt.show()

# **HeatMap for Dataset**

In [94]:
# Annotated heatmap of the correlation matrix. Only numeric columns are
# correlated: pandas >= 2.0 raises when non-numeric columns are included.
fig, axes = plt.subplots(1, 1, figsize=(9, 6))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, ax=axes)
axes.set_title('Correlation between numeric columns')
plt.show()

In [95]:
# Annotated heatmap of the covariance matrix. Only numeric columns are
# used: pandas >= 2.0 raises when non-numeric columns are included.
fig, axes = plt.subplots(1, 1, figsize=(9, 6))
sns.heatmap(df.select_dtypes(include='number').cov(), annot=True, ax=axes)
axes.set_title('Covariance between numeric columns')
plt.show()

In [96]:
# Number of rows in each customer segment.
sns.countplot(data=df, x='Segment')

In [97]:
# Number of rows in each region.
sns.countplot(data=df, x='Region')

In [98]:
# Profit per sub-category; seaborn's barplot aggregates the rows of each
# sub-category into a single bar (mean by default, with an error bar).
plt.figure(figsize=(40, 25))
sns.barplot(data=df, x='Sub-Category', y='Profit')

In [99]:
# Profit as a function of discount level. x and y are passed by keyword:
# seaborn >= 0.12 accepts only `data` positionally, so the positional form
# raises a TypeError there. This also matches the x=/y= style of the other cells.
plt.figure(figsize = (10,4))
sns.lineplot(x='Discount', y='Profit', data=df, color='r', label='Discount')
plt.legend()

In [100]:
# One 50-bin histogram per numeric column of df1, laid out on a single large figure.
df1.hist(figsize=(20, 15), bins=50)
plt.show()

In [101]:
# Target overall size (width, height) of the pair-plot grid, in inches.
figsize = (15, 10)

# pairplot() has no `figsize` argument, so a bare `figsize` variable has no
# effect on it. The grid is n x n facets, each `height` inches tall and
# `height * aspect` inches wide, so the target size is converted into those
# two parameters.
numeric_count = max(len(df1.select_dtypes(include='number').columns), 1)
sns.pairplot(
    df1,
    hue='Sub-Category',
    height=figsize[1] / numeric_count,
    aspect=figsize[0] / figsize[1],
)

In [102]:
# Totals of the numeric measures for every combination of the grouping keys.
# The measure columns are selected with a *list*: the tuple-style selection
# groupby(...)['a', 'b'] was deprecated in pandas 1.0 and removed in 2.0.
group_keys = ['Ship Mode', 'Segment', 'Category', 'Sub-Category', 'State', 'Region']
value_columns = ['Quantity', 'Discount', 'Sales', 'Profit']

# reset_index() already yields a DataFrame with the keys as ordinary columns,
# so no extra pd.DataFrame(...) wrapper is needed.
grouped = df.groupby(group_keys)[value_columns].sum().reset_index()
grouped

**Sum, Mean, Min, Max, Count, Median, Standard Deviation, and Variance of Profit for each State**

In [103]:
# Per-state summary of Profit: one row per state, one column per statistic.
df.groupby("State")["Profit"].agg(
    ["sum", "mean", "min", "max", "count", "median", "std", "var"]
)

In [104]:
from sklearn.cluster import KMeans

# Feature matrix for clustering: the columns at positions 9-12 of `df`.
# NOTE(review): these positions are presumed to be Sales, Quantity, Discount
# and Profit; the centroid overlay below relies on 9 -> Sales and
# 10 -> Quantity. Confirm against df.columns.
x = df.iloc[:, [9, 10, 11, 12]].values

# Fit K-Means for k = 1..10 and record the within-cluster sum of squares
# (inertia) of each fit. random_state is fixed so re-runs give the same result.
cluster_range = range(1, 11)
wcss = []
for i in cluster_range:
    kmeans = KMeans(n_clusters = i, init = 'k-means++',
                    max_iter = 300, n_init = 10, random_state = 0).fit(x)
    wcss.append(kmeans.inertia_)

# Elbow curve: the WCSS values are only useful when plotted against k, so
# they are drawn here instead of being computed and discarded.
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 4))
elbow_ax.plot(list(cluster_range), wcss, marker='o')
elbow_ax.set_title('Elbow curve')
elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('WCSS')
plt.show()

# Sales vs Quantity, coloured by sub-category.
sns.set_style("whitegrid")
grid = sns.FacetGrid(df, hue ="Sub-Category", height = 6).map(plt.scatter, 'Sales', 'Quantity')

# After the loop `kmeans` is the model fitted last (n_clusters = 10); its
# centroids are overlaid on the facet's own axes rather than on whichever
# axes pyplot currently considers active.
centroid_ax = grid.axes.flat[0]
centroid_ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                    s = 100, c = 'yellow', label = 'Centroids')
centroid_ax.set_title(f'Sales vs Quantity with K-Means centroids (k = {kmeans.n_clusters})')
centroid_ax.legend()

In [105]:
# Scatter-plot matrix of every pair of numeric columns in df1.
sns.pairplot(data=df1)

In [106]:
# Box plot of Sales, to expose outliers. The series is passed as `x=` and
# drawn on the axes created here: a positional Series is read as `x` by
# older seaborn but as `data` by seaborn >= 0.12, which changes the plot.
fig, axes = plt.subplots(figsize = (10 , 10))
sns.boxplot(x=df['Sales'], ax=axes)
axes.set_title('Distribution of Sales')
plt.show()

In [107]:
# Box plot of Discount, to expose outliers. The series is passed as `x=` and
# drawn on the axes created here: a positional Series is read as `x` by
# older seaborn but as `data` by seaborn >= 0.12, which changes the plot.
fig, axes = plt.subplots(figsize = (10 , 10))
sns.boxplot(x=df['Discount'], ax=axes)
axes.set_title('Distribution of Discount')
plt.show()

In [108]:
# Box plot of Profit, to expose outliers. The series is passed as `x=` and
# drawn on the axes created here: a positional Series is read as `x` by
# older seaborn but as `data` by seaborn >= 0.12, which changes the plot.
fig, axes = plt.subplots(figsize = (10 , 10))
sns.boxplot(x=df['Profit'], ax=axes)
axes.set_title('Distribution of Profit')
plt.show()

In [109]:
# First and third quartile of every numeric column (column-wise, linear
# interpolation; both are the pandas defaults).
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)

In [110]:
# Inter-quartile range per numeric column: third quartile minus first quartile.
IQR = Q3.sub(Q1)
print(IQR)

In [111]:
# Bar chart of the five most frequently repeated rows (nlargest defaults to n=5).
df.value_counts().nlargest(n=5).plot.bar(figsize=(10, 5))

# **Scatter Plot for the Dataset**

In [112]:
# Profit against Sales, one point per row.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=df["Sales"], y=df["Profit"])
ax.set(xlabel='Sales', ylabel='Profit')
plt.show()

# **Distribution Plot for the Dataset**

In [113]:
# Summary statistics of Sales, followed by its distribution.
print(df['Sales'].describe())
plt.figure(figsize = (9 , 8))
# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is its documented replacement. `alpha` is passed
# directly because histplot has no `hist_kws` argument.
sns.histplot(x=df['Sales'], color = 'b', bins = 100, alpha = 0.4, kde = True, stat = 'density');