In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Load the raw inputs with pandas.read_csv: three yearly sales extracts plus
# the product and store lookup tables.
# NOTE(review): these are absolute paths on a single machine; the notebook
# will not run elsewhere unless the files are placed at the same location.

sales_2017 = pd.read_csv(
    filepath_or_buffer=r"C:\Users\nares\Desktop\SQL & Pandas\sales2017_uncleaned.csv"
)
sales_2018 = pd.read_csv(
    filepath_or_buffer=r"C:\Users\nares\Desktop\SQL & Pandas\sales2018.csv"
)
sales_2019 = pd.read_csv(
    filepath_or_buffer=r"C:\Users\nares\Desktop\SQL & Pandas\sales2019.csv"
)
product_details = pd.read_csv(
    filepath_or_buffer=r"C:\Users\nares\Desktop\SQL & Pandas\product_details.csv"
)
store_cities = pd.read_csv(
    filepath_or_buffer=r"C:\Users\nares\Desktop\SQL & Pandas\store_cities.csv"
)

In [3]:
# Stack the three yearly sales frames row-wise into one frame.
# Each input keeps its own index labels (no ignore_index is requested), so
# labels can repeat across years in the combined frame.

yearly_sales_frames = [sales_2017, sales_2018, sales_2019]
sales_2017_2019 = pd.concat(yearly_sales_frames)
sales_2017_2019.shape

(88158, 17)

In [4]:
                            # TRANSFORMING THE DATA FOR COMPATIBILITY AND CONSISTENCY
# The section banner above is a comment rather than an indented string
# literal: an indented first statement only runs because IPython dedents the
# cell, and it fails with an indentation error in a plain Python script.

# Drop the unwanted and redundant columns. The result is assigned back instead
# of mutating in place, so the cell has one clear input and one clear output.
redundant_columns = [
    'column3', 'order_date_2', 'promo_type_1', 'promo_bin_1', 'promo_type_2',
    'promo_bin_2', 'promo_discount_2', 'delivery_date_format2',
]
sales_2017_2019 = sales_2017_2019.drop(columns=redundant_columns)

sales_2017_2019.shape

(88158, 9)

In [5]:
# Remove rows that are exact duplicates across all remaining columns,
# keeping the first occurrence (the drop_duplicates default).
sales_2017_2019 = sales_2017_2019.drop_duplicates()
sales_2017_2019.shape

(88152, 9)

In [6]:
# Number of missing values per column
sales_2017_2019.isnull().sum()

order_id                    1
product_id                  3
store_id                    3
order_date                  3
sales                    8133
revenue                  8133
stock                    8133
price                    2117
delivery_date_format1       3
dtype: int64

In [7]:
# Give the remaining delivery-date column a shorter name.
column_renames = {'delivery_date_format1': 'delivery_date'}
sales_2017_2019 = sales_2017_2019.rename(columns=column_renames)

In [8]:
# Discard rows that are missing any of the identifier or date columns.
required_columns = ['order_id', 'product_id', 'store_id', 'order_date', 'delivery_date']
sales_2017_2019 = sales_2017_2019.dropna(subset=required_columns)

In [9]:
# Fill missing numeric values with a group-level mean, rounded to 2 decimals.
# Work on an independent copy: a plain `sales = sales_2017_2019` only aliases
# the same frame, so the column assignments below would also rewrite
# `sales_2017_2019`.
sales = sales_2017_2019.copy()

# value column -> key column whose per-group mean supplies the fill value
fill_group_by_column = {
    'revenue': 'product_id',
    'price': 'product_id',
    'stock': 'store_id',
}
for value_column, group_key in fill_group_by_column.items():
    group_mean = sales.groupby(group_key)[value_column].transform('mean')
    sales[value_column] = sales[value_column].fillna(group_mean).round(2)

# A group with no observed value has a NaN mean, so some nulls survive the fill.
# Only rows still missing 'stock' or 'price' are dropped; rows whose 'revenue'
# is still null are kept, and the text-valued 'sales' column is not handled here.
sales = sales.dropna(subset=['stock', 'price'])
sales.isna().sum()

order_id            0
product_id          0
store_id            0
order_date          0
sales            8104
revenue           342
stock               0
price               0
delivery_date       0
dtype: int64

In [10]:
# Trim the text-valued 'sales' column and fill its missing entries with 0.
# - .str.strip('sales') removes any leading/trailing run of the characters
#   's', 'a', 'l', 'e' (it is a character set, not the literal word "sales").
# - The former .interpolate(method='linear', direction='forward') step is
#   removed: Series.interpolate has no `direction` parameter (the documented
#   one is `limit_direction`), and the column is still object dtype here, so
#   linear interpolation did not fill anything. Newer pandas deprecates
#   interpolating object-dtype data. Every null was already being filled by
#   fillna(0), so the result is unchanged.
# NOTE(review): if real interpolation is wanted, convert the column to a
# numeric dtype first and decide on an ordering that makes neighbours meaningful.
sales['sales'] = sales['sales'].str.strip('sales').fillna(0)

In [11]:
# Cast columns to their working dtypes.
# 'sales' holds numeric text, so it goes through float before int.
sales['sales'] = sales['sales'].astype('float').astype('int')

# Whole-number columns (the fractional part of the mean-filled stock is truncated).
for int_column in ('order_id', 'stock'):
    sales[int_column] = sales[int_column].astype('int')

# Date columns are parsed into datetime64 values.
for date_column in ('delivery_date', 'order_date'):
    sales[date_column] = pd.to_datetime(sales[date_column])

In [12]:
# Summarise the cleaned frame: index range, per-column non-null counts, dtypes and memory usage.
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87993 entries, 1 to 47238
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   order_id       87993 non-null  int32         
 1   product_id     87993 non-null  object        
 2   store_id       87993 non-null  object        
 3   order_date     87993 non-null  datetime64[ns]
 4   sales          87993 non-null  int32         
 5   revenue        87651 non-null  float64       
 6   stock          87993 non-null  int32         
 7   price          87993 non-null  float64       
 8   delivery_date  87993 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int32(3), object(2)
memory usage: 5.7+ MB


In [13]:
# Preview the first 5 rows of the cleaned sales data
sales.head(5)

Unnamed: 0,order_id,product_id,store_id,order_date,sales,revenue,stock,price,delivery_date
1,1,P0258,S0008,2017-01-02,0,0.0,5,6.5,2017-01-05
4,2,P0348,S0110,2017-01-02,0,0.0,9,2.1,2017-01-04
6,3,P0219,S0026,2017-01-02,5,33.56,3,7.25,2017-01-05
10,4,P0218,S0051,2017-01-02,0,0.0,1,59.9,2017-01-04
11,5,P0660,S0056,2017-01-02,1,4.58,5,4.95,2017-01-04


In [14]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns of `sales`
sales.describe()

Unnamed: 0,order_id,sales,revenue,stock,price
count,87993.0,87993.0,87651.0,87993.0,87993.0
mean,44034.816258,0.101542,2.166738,15.974055,16.741538
std,25444.51502,1.137781,17.120361,46.612896,34.933354
min,1.0,0.0,0.0,0.0,0.01
25%,22000.0,0.0,0.0,4.0,3.5
50%,43997.0,0.0,0.0,9.0,8.5
75%,66073.0,0.0,0.0,17.0,17.99
max,88149.0,178.0,2497.96,3993.0,1549.0


In [15]:
# Enrich each sales row with product attributes, then with store location.
# Both joins are inner joins, so a sales row without a matching product_id or
# store_id in the lookup tables is dropped from the result.
sales_product_details = sales.merge(product_details, on='product_id', how='inner')
sales = sales_product_details.merge(store_cities, on='store_id', how='inner')
sales.head()

Unnamed: 0,order_id,product_id,store_id,order_date,sales,revenue,stock,price,delivery_date,product,type,category,sub_category,state,city
0,1,P0258,S0008,2017-01-02,0,0.0,5,6.5,2017-01-05,Sunfeast,Shower Gel & Body Wash,Beauty & Hygiene,Bath & Hand Wash,Oklahoma,Oklahoma City
1,1625,P0258,S0008,2017-02-08,0,0.0,10,6.5,2017-02-10,Sunfeast,Shower Gel & Body Wash,Beauty & Hygiene,Bath & Hand Wash,Oklahoma,Oklahoma City
2,16788,P0258,S0008,2017-12-06,0,0.0,3,7.1,2017-12-08,Sunfeast,Shower Gel & Body Wash,Beauty & Hygiene,Bath & Hand Wash,Oklahoma,Oklahoma City
3,7141,P0348,S0008,2017-06-06,0,0.0,14,2.15,2017-06-08,Kohinoor,Herbs & Seasoning,"Foodgrains, Oil & Masala",Masalas & Spices,Oklahoma,Oklahoma City
4,9577,P0348,S0008,2017-07-26,0,0.0,11,2.3,2017-07-29,Kohinoor,Herbs & Seasoning,"Foodgrains, Oil & Masala",Masalas & Spices,Oklahoma,Oklahoma City


In [16]:
# Export the merged dataset to a CSV file (header row included, index omitted).
# DataFrame.to_csv returns None when it is given a path, so its result must not
# be assigned back to `sales`: doing so replaces the DataFrame with None and
# makes any later use of `sales` (or a re-run of this cell) fail.
sales.to_csv('sales_Data.csv', header=True, index=False)