    ==============================================================================================

## Importing the DataFrames
- ``` df = pd.read_pickle("file_name.pkl")```  --> create 1 df from a pkl file

In [None]:
import pandas as pd
import numpy as np
pd.set_option("display.min_rows", 0) 
pd.set_option("display.max_rows", 30) 
pd.__version__

In [None]:
# Load the three cleaned tables from the clean_Tables folder.
# Paths are built with pathlib so the separator is correct on every OS; the
# previous backslash literals relied on invalid escape sequences (\O, \P) and
# only resolved to the intended files on Windows.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
from pathlib import Path

tables_dir = Path("clean_Tables")
ord = pd.read_pickle(tables_dir / "Orders_c.pkl")  # name kept: later cells use `ord`
ol = pd.read_pickle(tables_dir / "OrderLines_c.pkl")
pro = pd.read_pickle(tables_dir / "Products_c.pkl")

In [None]:
ord.info()

In [None]:
ol.info()

In [None]:
pro.info()

    ==============================================================================================

## Merge DataFrames
- ``` df.shape``` , ``` df.size``` , ``` df.ndim``` 
- ``` df.sample(5)``` , ``` df.info()``` 
- Numerical : ``` df.describe()``` , ``` df.col.nlargest()``` , ``` df.col.nsmallest()``` 
- Category : ``` df.nunique()``` , ``` df.unique() ``` 

In [None]:
# Start the merged table from a copy of the orders so `ord` itself is not modified.
db = ord.copy()

In [None]:
# Inner-join the order lines onto the orders (ORD_ID -> ol_ORD_ID).
# validate="one_to_many" makes pandas raise if ORD_ID is not unique in db.
db = pd.merge(
    db,
    ol,
    how="inner",
    left_on="ORD_ID",
    right_on="ol_ORD_ID",
    validate="one_to_many",
)

In [None]:
# Inner-join the products onto the order lines (ol_Sku -> PRO_SKU).
# validate="many_to_one" makes pandas raise if PRO_SKU is not unique in pro.
db = pd.merge(
    db,
    pro,
    how="inner",
    left_on="ol_Sku",
    right_on="PRO_SKU",
    validate="many_to_one",
)

In [None]:
# Limit the displayed width of each column to 40 characters, then echo the
# setting to confirm it. The full "display." key is used (as in the other
# set_option calls of this notebook) instead of the shorthand, which pandas
# resolves by pattern matching and which can become ambiguous in later versions.
pd.set_option("display.max_colwidth", 40)
pd.get_option("display.max_colwidth")

In [None]:
db.sample(5)

    ==============================================================================================

## Clean Merged Data
- Remember to create a copy of the df using ``` df.copy()``` 

In [None]:
# Keep an untouched copy of the merged table before the cleaning steps below modify db.
db_original = db.copy()

      ===============================================

### Drop duplicated Columns, Reorder and Set Index
 - ``` df.columns```   , ``` df.index``` 
 - ``` df=df.rename(columns={"A": "a", "B": "c"})``` 
 - ``` df.columns = ["a","b","x"]``` 
     - take care, renaming the columns like that will convert the NAN to some value!!
 - ``` df=df.set_index("col")```  , ``` df=df.reset_index()``` 

In [None]:
# Remove the right-hand join keys left over from the merges (ol_ORD_ID, ol_Sku)
# and also drop ord_State.
db = db.drop(["ol_ORD_ID", "ol_Sku", "ord_State"], axis="columns")

In [None]:
db.info()

In [None]:
db.shape

In [None]:
db.index #hint: no need to change index

      ===============================================
      
### Clean NAN and empty cells
- ``` df.isna().sum()``` 
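- check whether it makes sense to replace empty cells with NaN or with another value!
- ``` df = df.replace(r'^\s*$', np.nan, regex=True)```  --> replace empty cells and cells with only whitespace with NaN (`regex=True` is required, otherwise the pattern is matched as literal text)
- check whether other cells already have the value you want to use to fill NaN  ``` df.col.str.contains("pat", na=False).sum() ```
- ``` df.col = df.col.fillna(value)```  , or fill from neighbouring rows with ``` df.col = df.col.bfill(limit=n)```  / ``` df.col = df.col.ffill(limit=n)``` 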

- Extra: 
  - ``` (df.values == '').sum()```  --> count the cells that are empty strings
  - ``` df.col.str.isspace().sum()```  --> count the cells that contain only whitespace

In [None]:
db.info()

In [None]:
db.isna().sum()

In [None]:

# Turn empty and whitespace-only cells into NaN so that isna() counts them.
# regex=True is required: without it pandas searches for cells whose text is
# literally the pattern string, so no blank cell is ever replaced.
db = db.replace(r'^\s*$', np.nan, regex=True)
# TODO:HANA Question is it safe to do that?


In [None]:
db.isna().sum()

___ For pro_Desc   ___

In [None]:
db.pro_Desc.dtype

In [None]:
# Missing descriptions get an explicit placeholder text.
db["pro_Desc"] = db["pro_Desc"].fillna("No Description")

In [None]:
db.isna().sum()

___ For pro_Type   ___

In [None]:
# Count values that already read as zero ("0", "0.0", "00.00", ...) before "0"
# is used as the filler for missing pro_Type.
# Raw string avoids the invalid "\." escape; the optional decimal part also
# catches a plain "0", which the previous pattern did not match.
db.pro_Type.str.contains(r"^0+(?:\.0+)?$", na=False).sum()
# If the count is 0, no value is zero and NaN can safely be set to "0"

In [None]:
# Missing pro_Type values become the string "0".
db["pro_Type"] = db["pro_Type"].fillna("0")

In [None]:
db.isna().sum()

___ For pro_Pr   ___

In [None]:
# Count values that already read as zero ("0", "0.0", "00.00", ...) before "0"
# is used as the filler for missing pro_Pr.
# Raw string avoids the invalid "\." escape; the optional decimal part also
# catches a plain "0", which the previous pattern did not match.
db.pro_Pr.str.contains(r"^0+(?:\.0+)?$", na=False).sum()
# If the count is 0, no value is zero and NaN can safely be set to "0"

In [None]:
# Missing pro_Pr values become the string "0" (the column is still text here).
db["pro_Pr"] = db["pro_Pr"].fillna("0")

In [None]:
db.isna().sum()

      ===============================================
### Fix DataTypes
- use ```df.nunique()``` to find if column is bool or Category
- before changing, make sure all values will be converted correctly
  - to convert to numeric : check there are no letters, 
      - if float(2dec) ```(~df.col.str.contains(r"^\d+\.\d{2}$")).sum()```
      - if integer ```(~df.col.str.contains(r"^\d+$")).sum()```
  - to convert to category : check there is a reasonable number of unique items
  - to convert to Datetime : ```(~df.col.str.contains(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$")).sum()```
- ``` df.col.astype(type,errors="raise")``` 
  - type = "int","float","bool","category","object","datetime64[ns]","timedelta64[ns]"
- for mixed data
  - ``` pd.to_numeric(df.col, downcast=x,errors="raise") ``` 
  x = "integer" or "float"
  - ``` pd.to_datetime(df.col, errors="raise") ```  (no `downcast` parameter)
  - ``` pd.to_timedelta(df.col, errors="raise") ```  (no `downcast` parameter)

___ For ord_CreatDate ___

In [None]:
db.info()

In [None]:
# Count rows whose ord_CreatDate does NOT look like "YYYY-MM-DD HH:MM:SS".
# Raw string: "\d" inside a normal literal is an invalid escape sequence.
# na=False counts missing values as non-matching instead of letting "~" fail on NaN.
(~db.ord_CreatDate.str.contains(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", na=False)).sum()
# a result of 0 means all rows have the datetime pattern and are safe to convert

In [None]:
# Parse the text column into datetimes; errors="raise" stops on any unparseable value.
db["ord_CreatDate"] = pd.to_datetime(db["ord_CreatDate"], errors="raise")

In [None]:
db.info()

___ For ol_ProcessDate ___

In [None]:
# Count rows whose ol_ProcessDate does NOT look like "YYYY-MM-DD HH:MM:SS".
# Raw string: "\d" inside a normal literal is an invalid escape sequence.
# na=False counts missing values as non-matching instead of letting "~" fail on NaN.
(~db.ol_ProcessDate.str.contains(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$", na=False)).sum()
# a result of 0 means all rows have the datetime pattern and are safe to convert

In [None]:
# Parse the text column into datetimes; errors="raise" stops on any unparseable value.
db["ol_ProcessDate"] = pd.to_datetime(db["ol_ProcessDate"], errors="raise")

___ For pro_InStock ___

In [None]:
db.pro_InStock.unique()

In [None]:
# Cast pro_InStock to bool.
# NOTE(review): if the column holds strings, every non-empty string (including
# "0" or "False") becomes True - confirm against the unique() output first.
db["pro_InStock"] = db["pro_InStock"].astype("bool", errors="raise")

In [None]:
db.info()


___ For ol_ProdUntPr ___

In [None]:
#db_Pr = db[["ORD_ID","ord_TotlPaid","ol_ProdQnty","ol_ProdUntPr","pro_Pr","PRO_SKU","pro_Name"]].copy()


In [None]:
db.sample(5)

In [None]:
db.ol_ProdUntPr.dtype

In [None]:
# Share (in %) of rows whose ol_ProdUntPr contains more than one dot.
# Above 5 % the decision is "Clean it", otherwise "Drop it".
# Raw string avoids the invalid "\." escape sequence of the plain literal.
x = round((db.ol_ProdUntPr.str.count(r"\.") > 1).sum() / db.shape[0] * 100, 2)
str_decide = "Clean it" if x > 5 else "Drop it"

In [None]:
print(f"the percentage of 2dec ol_ProdUntPr rows is {x}%, hence {str_decide}")

In [None]:
# Show up to 5 random rows whose ol_ProdUntPr contains more than one dot.
# Raw string avoids the invalid "\." escape; the sample size is capped because
# sample(5) raises ValueError when fewer than 5 rows match.
multi_dot_rows = db[db.ol_ProdUntPr.str.count(r"\.") > 1]
multi_dot_rows.sample(min(5, len(multi_dot_rows)))

In [None]:
# Remove every dot except the last one, e.g. "1.234.56" -> "1234.56", so the
# column can be cast to float afterwards.
# The lookahead keeps values with zero or one dot unchanged, gives the same
# result as before for exactly two dots, and also repairs values with three or
# more dots, which the previous "== 2" condition left untouched even though the
# checks above select "more than one dot".
db = db.assign(
    ol_ProdUntPr=lambda frame: frame.ol_ProdUntPr.str.replace(
        r"\.(?=.*\.)", "", regex=True
    )
)

In [None]:
db.info()

In [None]:
# Cast the cleaned unit-price text to float (raises if a value is not numeric).
db["ol_ProdUntPr"] = db["ol_ProdUntPr"].astype(float)

___ For pro_Pr  ___

In [None]:
db.sample(5)

In [None]:
# Share (in %) of rows whose pro_Pr contains more than one dot.
# Above 5 % the decision is "Clean it", otherwise "Drop it".
# Raw string avoids the invalid "\." escape; x is already rounded to two
# decimals, so it is printed as-is.
x = round((db.pro_Pr.str.count(r"\.") > 1).sum() / db.shape[0] * 100, 2)
str_decide = "Clean it" if x > 5 else "Drop it"
print(f"the percentage of 2dec pro_Pr rows is {x}% , Hence {str_decide}")

In [None]:
db.shape

In [None]:
# Keep only rows whose pro_Pr has at most one dot, i.e. drop the multi-dot rows.
# .copy() makes db an independent frame, so the column assignments that follow
# do not operate on a slice (which can raise SettingWithCopyWarning).
# Raw string avoids the invalid "\." escape sequence.
db = db.loc[db.pro_Pr.str.count(r"\.") < 2].copy()

In [None]:
db.shape

In [None]:
# Cast the remaining price text to float (raises if a value is not numeric).
db["pro_Pr"] = db["pro_Pr"].astype(float)

In [None]:
db.info()

    ==============================================================================================

## Re-Explore the data
draw some ``` df.col.hist()```  , ``` df.boxplot(column="col")```  per column  
take notes

In [None]:
db.columns

In [None]:
# Histogram of ord_CreatDate, grouped by the year of the creation date.
creation_year = db["ord_CreatDate"].dt.year
db.groupby(creation_year)["ord_CreatDate"].hist()

In [None]:
db.ord_CreatDate.hist();

In [None]:
db.ord_TotlPaid.hist();

In [None]:
db.ol_ProdQnty.hist(bins=3);

In [None]:
db.ol_ProdUntPr.hist();

In [None]:
db.pro_Pr.hist();

In [None]:
db.pro_Type.hist();

In [None]:
db.columns

In [None]:
#db.groupby("ord_TotlPaid").boxplot()