## **TRANSFORMING THE DATA FOR COMPATIBILITY AND CONSISTENCY**

In [135]:
import pandas as pd 
import numpy as np

### **Importing a CSV File using read_csv() Method :**

In [136]:
sales_2017 = pd.read_csv(r"C:\Users\nares\Desktop\PCleaning Datasets\sales2017_uncleaned.csv",skiprows=1)
sales_2017.head()

Unnamed: 0,order_id (unique),product_id,column3,store_id,order_date,order_date_2,sales,revenue,stock,price,promo_type_1,promo_bin_1,promo_type_2,promo_bin_2,promo_discount_2,delivery_date_format1,delivery_date_format2
0,,,,,,,,,,,,,,,,,
1,1.0,P0258,,S0008,01-02-2017,02-Jan-17,0 sales,0.0,5.0,6.5,PR14,,PR03,,,01-05-2017,05-01-2017
2,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,
4,2.0,P0348,,S0110,01-02-2017,02-Jan-17,0 sales,0.0,9.0,2.1,PR05,verylow,PR03,,,01-04-2017,04-01-2017


### **Removing Unwanted and Redundant Columns :**

In [137]:
sales_2017.drop(['column3','order_date_2','promo_type_1','promo_bin_1','promo_type_2',
                    'promo_bin_2','promo_discount_2','delivery_date_format2'],axis=1,inplace=True
               )
               
# Using the shape method to check the number of Columns :

sales_2017.shape

(18124, 9)

### **Renaming the Columns :**

In [138]:
sales_2017 = sales_2017.rename(columns={'order_id (unique)':'order_id','delivery_date_format1':'delivery_date'})
sales_2017.columns

Index(['order_id', 'product_id', 'store_id', 'order_date', 'sales', 'revenue',
       'stock', 'price', 'delivery_date'],
      dtype='object')

### **Count of Null Values in Each and Every Column :**

In [139]:
sales_2017.isna().sum()

order_id           5
product_id         7
store_id           7
order_date         7
sales              7
revenue            7
stock              7
price            631
delivery_date      7
dtype: int64

### **Removing the Null values :**

In [140]:
sales_2017.dropna(subset= ['order_id','product_id','store_id','order_date','delivery_date'],inplace=True)
sales_2017.isna().sum()

order_id           0
product_id         0
store_id           0
order_date         0
sales              0
revenue            0
stock              0
price            624
delivery_date      0
dtype: int64

### **Filling Null Values with the Mean :**

In [141]:
sales_2017['price']     = (round(sales_2017['price'].fillna(sales_2017.groupby(['product_id'])
                                 ['price'].transform('mean')),2)
                          )

# Count of Null values :
sales_2017.dropna(subset=['price'],inplace=True)
sales_2017.isna().sum()

order_id         0
product_id       0
store_id         0
order_date       0
sales            0
revenue          0
stock            0
price            0
delivery_date    0
dtype: int64

### **Dropping the Duplicates :**

In [142]:
sales_2017.drop_duplicates()
sales_2017.shape

(18098, 9)

### **Extracting the Column :**

In [143]:
sales_2017['sales'] =sales_2017['sales'].apply(lambda x : x.split()[0])
sales_2017['sales'] = sales_2017['sales'].str.strip()
sales_2017.head(5)

Unnamed: 0,order_id,product_id,store_id,order_date,sales,revenue,stock,price,delivery_date
1,1.0,P0258,S0008,01-02-2017,0,0.0,5.0,6.5,01-05-2017
4,2.0,P0348,S0110,01-02-2017,0,0.0,9.0,2.1,01-04-2017
6,3.0,P0219,S0026,01-02-2017,5,33.56,3.0,7.25,01-05-2017
7,3.0,P0219,S0026,01-02-2017,5,33.56,3.0,7.25,01-05-2017
8,3.0,P0219,S0026,01-02-2017,5,33.56,3.0,7.25,01-05-2017


### **Updating the Datatype of Columns :**

In [144]:
sales_2017['sales']            = sales_2017['sales'].astype('float').astype('int')
sales_2017['order_id']         = sales_2017['order_id'].astype('int')
sales_2017['stock']            = sales_2017['stock'].astype('int') 
sales_2017['delivery_date']    = pd.to_datetime(sales_2017['delivery_date'])
sales_2017['order_date']       = pd.to_datetime(sales_2017['order_date'])

### **Checking the  data types of Columns :**

In [145]:
sales_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18098 entries, 1 to 18123
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   order_id       18098 non-null  int32         
 1   product_id     18098 non-null  object        
 2   store_id       18098 non-null  object        
 3   order_date     18098 non-null  datetime64[ns]
 4   sales          18098 non-null  int32         
 5   revenue        18098 non-null  float64       
 6   stock          18098 non-null  int32         
 7   price          18098 non-null  float64       
 8   delivery_date  18098 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int32(3), object(2)
memory usage: 1.2+ MB


### **Exporting the Datset :**

In [146]:
sales_2017 = sales_2017.to_csv('sales_Data.csv',header=True,index=False)
sales_2017