In [32]:
import pandas as pd
import numpy as np

In [33]:
class DataPreprocessing:
    """Cleaning and enrichment steps for two CSV inputs: the DataCo order data and its access logs.

    Every transformation method takes a DataFrame and returns a new one; the
    frame passed in by the caller is left untouched. Progress is reported with
    ``print`` so it shows up in the notebook output.
    """

    def __init__(self):
        # Initializing Datapreprocessing
        print("---Initializing Data Preprocessing---")

    @staticmethod
    def _read_csv(file_path):
        """Read a CSV as UTF-8, retrying as Latin-1 only when decoding fails.

        Any other failure (missing file, parser error, interrupt) propagates
        unchanged instead of being masked by a second read attempt.
        """
        try:
            return pd.read_csv(file_path, encoding='utf-8')
        except UnicodeDecodeError:
            return pd.read_csv(file_path, encoding='latin1')

    @staticmethod
    def _clean_column_name(name):
        """Turn spaces and hyphens into underscores and drop parentheses."""
        return name.replace(' ', '_').replace('(', '').replace(')', '').replace('-', '_')

    def dataco_cleaning(self, df):
        """Drop duplicate rows and rows missing a key identifier.

        Args:
            df: Raw order DataFrame. Must contain 'Order Id', 'Customer Id'
                and 'Product Card Id'.

        Returns:
            A new DataFrame without exact duplicate rows, without rows where
            any of the three identifier columns is missing, and with missing
            'Customer Zipcode' values (if that column exists) replaced by 0.

        Raises:
            KeyError: If one of the identifier columns is absent.
        """
        print(f"Raw Data shape: {df.shape}")

        # Remove Duplicate Data
        len_of_raw_data = len(df)
        df = df.drop_duplicates()
        print(f'Dropped {len_of_raw_data - len(df)} duplicates.')

        # Rows without these identifiers are dropped entirely.
        critical_cols = ['Order Id', 'Customer Id', 'Product Card Id']
        df = df.dropna(subset=critical_cols)

        # Missing zip codes are replaced with 0 (a placeholder, not a null).
        if 'Customer Zipcode' in df.columns:
            df = df.assign(**{'Customer Zipcode': df['Customer Zipcode'].fillna(0)})

        print(f'Shape after cleaning {df.shape}')

        return df

    def normalize_text(self, df):
        """Title-case and strip a fixed set of text columns, where present.

        Missing values are kept as missing. Casting the whole column to ``str``
        first would turn NaN into the literal text 'Nan', which would then be
        saved as if it were a real name or city.

        Args:
            df: DataFrame that may contain any of the known text columns.

        Returns:
            A new DataFrame with the present text columns normalized; columns
            not in the list are returned unchanged.
        """
        text_cols = ['Customer City', 'Customer Country', 'Customer Fname',
                     'Customer Lname', 'Product Name', 'Market']

        normalized = {}
        for col in text_cols:
            if col not in df.columns:
                continue
            original = df[col]
            as_text = original.astype(str).str.title().str.strip()
            # Keep the original missing marker wherever the source had no value.
            normalized[col] = as_text.where(original.notna(), original)

        return df.assign(**normalized)

    def enrich_profitablity(self, df):
        """Add 'Order_Profitability_Classification' derived from 'Benefit per order'.

        Values above zero are labelled 'Positive', below zero 'Negative', and
        everything else 'Neutral'. A missing profit satisfies neither
        comparison and therefore also ends up as 'Neutral'.

        Args:
            df: DataFrame with a numeric 'Benefit per order' column.

        Returns:
            A new DataFrame with the classification column appended.
        """
        profit = df['Benefit per order']
        labels = np.select(
            [profit > 0, profit < 0],
            ['Positive', 'Negative'],
            default='Neutral',
        )
        df = df.assign(Order_Profitability_Classification=labels)
        print("  Enriched 'Order_Profitability_Classification' (P/N/N).")
        return df

    def log_data_cleaning(self, df):
        """Deduplicate access-log rows, normalize product names and parse dates.

        Args:
            df: Raw log DataFrame with 'Date' and 'Product' columns.

        Returns:
            A new DataFrame in which 'Product' is title-cased and stripped,
            'Date' is a datetime column, and rows whose date is missing or
            cannot be parsed are removed.
        """
        print(f'Raw Logs Shape: {df.shape}')

        # Drop duplicates and rows missing either required field.
        df = df.drop_duplicates()
        df = df.dropna(subset=['Date', 'Product'])

        df = df.assign(
            # 'Product' has no missing values here, so the str cast is safe.
            Product=df['Product'].astype(str).str.title().str.strip(),
            # Unparseable dates become NaT and are removed just below.
            Date=pd.to_datetime(df['Date'], errors='coerce'),
        )
        df = df.dropna(subset=['Date'])

        print(f'Logs shape: {df.shape}')

        return df

    def process_dataco(self, file_path):
        """Load the DataCo CSV and run cleaning, text normalization and enrichment.

        Args:
            file_path: Location of the CSV file.

        Returns:
            The processed DataFrame, with column names rewritten so that spaces
            and hyphens are underscores and parentheses are removed.
        """
        print(f'Processing DataCo Datasets: {file_path}')

        df = self._read_csv(file_path)

        print(df.head())

        df = self.dataco_cleaning(df)
        df = self.normalize_text(df)
        df = self.enrich_profitablity(df)

        # Renaming happens last because the steps above look columns up by
        # their original names.
        df.columns = [self._clean_column_name(c) for c in df.columns]

        return df

    def process_log_data(self, file_path):
        """Load the access-log CSV and run log cleaning and text normalization.

        Args:
            file_path: Location of the CSV file.

        Returns:
            The processed log DataFrame.
        """
        print(f'Processing Logs Datasets: {file_path}')

        df = self._read_csv(file_path)

        print(df.head())

        df = self.log_data_cleaning(df)
        # Only has an effect if the logs contain one of the known text columns.
        df = self.normalize_text(df)

        return df

    def save_data(self, df, save_path):
        """Write ``df`` to ``save_path`` as CSV without the index column."""
        df.to_csv(save_path, index=False)
        print(f"--- Dataset successfully saved to: {save_path} ---")

In [34]:
if __name__ == '__main__':
    # Input files and output destinations, declared together up front.
    # NOTE(review): these are absolute, machine-specific paths.
    path_dataco = r"C:\Users\pawan\Desktop\AI_EG_EI\Datasets\DataCoSupplyChainDataset\DataCoSupplyChainDataset.csv"
    path_log = r'C:\Users\pawan\Desktop\AI_EG_EI\Datasets\DataCoSupplyChainDataset\tokenized_access_logs.csv'
    save_path_dataco = r'C:\Users\pawan\Desktop\AI_EG_EI\Processed_Datasets\Processed_DataCoSupplyChain.csv'
    save_path_logs = r'C:\Users\pawan\Desktop\AI_EG_EI\Processed_Datasets\Processed_TokenizedAccessLogs.csv'

    processor = DataPreprocessing()

    # DataCo orders: run the pipeline, then write the result to disk.
    cleaned_dataco = processor.process_dataco(path_dataco)
    if cleaned_dataco is not None:
        processor.save_data(cleaned_dataco, save_path_dataco)

    # Tokenized access logs: same flow with the log pipeline.
    cleaned_logs = processor.process_log_data(path_log)
    if cleaned_logs is not None:
        processor.save_data(cleaned_logs, save_path_logs)

---Initializing Data Preprocessing---
Processing DataCo Datasets: C:\Users\pawan\Desktop\AI_EG_EI\Datasets\DataCoSupplyChainDataset\DataCoSupplyChainDataset.csv
       Type  Days for shipping (real)  Days for shipment (scheduled)  \
0     DEBIT                         3                              4   
1  TRANSFER                         5                              4   
2      CASH                         4                              4   
3     DEBIT                         3                              4   
4   PAYMENT                         2                              4   

   Benefit per order  Sales per customer   Delivery Status  \
0          91.250000          314.640015  Advance shipping   
1        -249.089996          311.359985     Late delivery   
2        -247.779999          309.720001  Shipping on time   
3          22.860001          304.809998  Advance shipping   
4         134.210007          298.250000  Advance shipping   

   Late_delivery_risk  Category I

In [30]:
cleaned_dataco.head()

Unnamed: 0,Type,Days_for_shipping_real,Days_for_shipment_scheduled,Benefit_per_order,Sales_per_customer,Delivery_Status,Late_delivery_risk,Category_Id,Category_Name,Customer_City,...,Product_Card_Id,Product_Category_Id,Product_Description,Product_Image,Product_Name,Product_Price,Product_Status,shipping_date_DateOrders,Shipping_Mode,Order_Profitability_Classification
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,1360,73,,http://images.acmesports.sports/Smart+watch,Smart Watch,327.75,0,2/3/2018 22:56,Standard Class,Positive
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,1360,73,,http://images.acmesports.sports/Smart+watch,Smart Watch,327.75,0,1/18/2018 12:27,Standard Class,Negative
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,1360,73,,http://images.acmesports.sports/Smart+watch,Smart Watch,327.75,0,1/17/2018 12:06,Standard Class,Negative
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,1360,73,,http://images.acmesports.sports/Smart+watch,Smart Watch,327.75,0,1/16/2018 11:45,Standard Class,Positive
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,...,1360,73,,http://images.acmesports.sports/Smart+watch,Smart Watch,327.75,0,1/15/2018 11:24,Standard Class,Positive


In [31]:
cleaned_logs.head()

Unnamed: 0,Product,Category,Date,Month,Hour,Department,ip,url
0,Adidas Brazuca 2017 Official Match Ball,baseball & softball,2017-09-01 06:00:00,Sep,6,fitness,37.97.182.65,/department/fitness/category/baseball%20&%20so...
1,The North Face Women'S Recon Backpack,hunting & shooting,2017-09-01 06:00:00,Sep,6,fan shop,206.56.112.1,/department/fan%20shop/category/hunting%20&%20...
2,Adidas Kids' Rg Iii Mid Football Cleat,featured shops,2017-09-01 06:00:00,Sep,6,apparel,215.143.180.0,/department/apparel/category/featured%20shops/...
3,Under Armour Men'S Compression Ev Sl Slide,electronics,2017-09-01 06:00:00,Sep,6,footwear,206.56.112.1,/department/footwear/category/electronics/prod...
4,Pelican Sunstream 100 Kayak,water sports,2017-09-01 06:01:00,Sep,6,fan shop,136.108.56.242,/department/fan%20shop/category/water%20sports...
