## Data Wrangler Pipeline

In [1]:
import os
import pandas as pd
import numpy as np
import glob
from dotenv import load_dotenv 
from data_cleaner import InvoiceCleaner,SLACleaner, pre_processor, LocationCor
from db_manager import DB_Manager, DataFrameToSQL

# Load the pipeline settings (folder locations etc.) from the project's .env file.
load_dotenv("SLA DP envionment file.env")

raw_data_folder_path = os.getenv('raw_data')
processed_data_folder_path = os.getenv('processed_data')

# Fail fast with a readable message: an unset variable comes back as None, which
# would otherwise surface as a TypeError in os.makedirs or as a silent search of
# the literal folder "None".
_missing_settings = [
    name
    for name, value in (('raw_data', raw_data_folder_path),
                        ('processed_data', processed_data_folder_path))
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(_missing_settings)
        + ". Check that the .env file was found and defines them."
    )

# Make sure the output folder exists before anything is written to it.
os.makedirs(processed_data_folder_path, exist_ok=True)


def _list_workbooks(folder):
    """Return the .xlsx paths in `folder`, skipping Excel's '~$' lock files.

    Excel creates a hidden '~$<name>.xlsx' file next to any workbook that is
    currently open; it matches the glob but is not a readable workbook.
    """
    return [
        path
        for path in glob.glob(f"{folder}/*.xlsx")
        if not os.path.basename(path).startswith('~$')
    ]


# Pre-process every raw workbook that has not been pre-processed yet.
files_to_preprocessor = [
    file for file in _list_workbooks(raw_data_folder_path)
    if not file.endswith('pre_processed.xlsx')
]
pre_processor(files_to_preprocessor).modified_dfs()

# Pick the workbooks for the main processor. The folder is listed again on
# purpose: presumably the pre-processor has just written new
# '*pre_processed.xlsx' files into it (TODO confirm against pre_processor).
excel_files = [
    file for file in _list_workbooks(raw_data_folder_path)
    if file.endswith('pre_processed.xlsx')
]

# Accumulators for the cleaned frames, one list per sheet type.
invoice_dfs = []
sla_dfs = []

"""
Helper that classifies a sheet name as "SLA", "Invoice" or "Unknown"
""" 
def sla_invoice(sn):
    """Classify a sheet name by the keyword it contains.

    The match is case-insensitive and "sla" is checked before "invoice", so a
    name containing both is reported as "SLA".

    Parameters
    ----------
    sn : str
        Name of the worksheet.

    Returns
    -------
    str
        "SLA", "Invoice", or "Unknown" when neither keyword is present.
    """
    lowered = sn.lower()
    # Order matters: the first keyword found decides the label.
    for keyword, label in (("sla", "SLA"), ("invoice", "Invoice")):
        if keyword in lowered:
            return label
    return "Unknown"
    

def sp_modifier(sp):
    """Return the short service-provider code used in output file names.

    Names listed in the mapping are replaced by their abbreviation; any other
    value is returned unchanged.
    """
    sp_new_names = {
        'SAFARICOM': 'Saf',
        'LTK': 'LTK'
    }
    return sp_new_names.get(sp, sp)


# Parse each workbook, clean every sheet according to its type, save the result
# as a CSV, and collect the SLA / Invoice frames for the combined tables.
for excel_file in excel_files:
    workbook_name = os.path.splitext(os.path.basename(excel_file))[0]

    # The provider code comes from the first word of the workbook name, so it is
    # the same for every sheet and only needs computing once per file.
    w_b = sp_modifier(workbook_name.split(" ")[0])

    # Open the workbook once and reuse the handle for every sheet; the context
    # manager closes the file when the workbook is finished.
    with pd.ExcelFile(excel_file) as xls:
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name)

            # Determine the type of sheet for processing
            sheet_type = sla_invoice(sheet_name)

            # NOTE(review): the file name depends only on provider and sheet
            # type, so two sheets of the same type in one workbook (including
            # two "Unknown" sheets) write to the same CSV and the later one wins.
            new_file_name = f"{w_b}_{sheet_type}.csv"
            new_file_path = os.path.join(processed_data_folder_path, new_file_name)

            # Apply different processing based on sheet type
            if sheet_type == "SLA":
                # SLA-specific cleaning
                df_processed = SLACleaner(df, w_b).clean_data()

                # Tag every row with its service provider
                df_processed['Service_Provider'] = w_b

                # Per the original note this step adds lat/lon to the data
                df_processed = LocationCor(df_processed).geo_cor()

                # Keep a copy for the combined SLA table
                df_sla = df_processed.copy()
                sla_dfs.append(df_sla)
            elif sheet_type == "Invoice":
                # Invoice-specific cleaning
                df_processed = InvoiceCleaner(df, w_b).clean_data()

                # Tag every row with its service provider
                df_processed['Service_Provider'] = w_b

                # Keep a copy for the combined Invoice table
                df_invoice = df_processed.copy()
                invoice_dfs.append(df_invoice)
            else:
                # Neither SLA nor Invoice: the sheet is saved as read, unprocessed
                df_processed = df

            # Save the processed DataFrame to a new CSV file
            df_processed.to_csv(new_file_path, index=False)
            print(f"Processed and Saved: {new_file_path}")



# Combine the per-workbook frames into one SLA frame and one Invoice frame.
# pd.concat on an empty list fails with "No objects to concatenate"; check first
# so the message says which sheet type was never found.
if not sla_dfs:
    raise ValueError("No SLA sheets were processed; cannot build the combined SLA table.")
if not invoice_dfs:
    raise ValueError("No Invoice sheets were processed; cannot build the combined Invoice table.")

SLA_Combined_df = pd.concat(sla_dfs)
Invoice_Combined_df = pd.concat(invoice_dfs)

# Save both combined frames to the processed folder. os.path.join is used (as in
# the per-sheet save) instead of a hard-coded backslash, which is only a path
# separator on Windows.

# SLA
SLA_Combined_path = os.path.join(processed_data_folder_path, 'SLA_Combined.csv')
SLA_Combined_df.to_csv(SLA_Combined_path, index=False)
print(f'SLA Combined Saved to {SLA_Combined_path}')

# Invoice
Invoice_Combined_path = os.path.join(processed_data_folder_path, 'Invoice_Combined.csv')
Invoice_Combined_df.to_csv(Invoice_Combined_path, index=False)
print(f'Invoice Combined Saved to {Invoice_Combined_path}')




# Prepare the aggregated table that links Invoices and SLAs: one row per link
# ID seen in either source, with its provider, its SLA/Invoice status and the
# QRC value from each side.

invoice_link_ids = Invoice_Combined_df['modified_Link_ID']
sla_link_ids = SLA_Combined_df['Unique_Link_Identifier_SLA']

# Every distinct link ID from either source, invoice IDs first.
all_link_ids = pd.concat([invoice_link_ids, sla_link_ids]).unique()
Merged_Data_SLA_Invoice = pd.DataFrame({'Modified_Link_ID': all_link_ids})
merged_ids = Merged_Data_SLA_Invoice['Modified_Link_ID']

# Service provider: take it from the SLA data, falling back to the invoice data
# for IDs the SLA lookup does not cover.
SLA_SP = dict(zip(sla_link_ids, SLA_Combined_df['Service_Provider']))
invoice_SP = dict(zip(invoice_link_ids, Invoice_Combined_df['Service_Provider']))

provider_from_sla = merged_ids.map(SLA_SP)
Merged_Data_SLA_Invoice['Service_Provider'] = provider_from_sla
Merged_Data_SLA_Invoice['Service_Provider'] = np.where(
    provider_from_sla.isnull(),
    merged_ids.map(invoice_SP),
    provider_from_sla
)

# SLA vs Invoice status. Conditions are evaluated in order, so "both" is tested
# before the single-source cases.
in_sla = merged_ids.isin(sla_link_ids)
in_invoice = merged_ids.isin(invoice_link_ids)
Merged_Data_SLA_Invoice['In_SLA_vs_Invoiced'] = np.select(
    [in_sla & in_invoice, in_sla, in_invoice],
    ["In SLA and Invoiced", "IN SLA but Not Invoiced", "Invoiced but not in SLA"],
    default="Unknown"
)

# Invoice QRC and SLA QRC: the maximum value per link ID on each side, with the
# key column renamed so both aggregates join on 'Modified_Link_ID'.
invoice_aggr = (
    Invoice_Combined_df
    .groupby('modified_Link_ID')
    .agg({'Total_QRC': 'max'})
    .reset_index()
    .rename(columns={'modified_Link_ID': 'Modified_Link_ID'})
)
SLA_aggr = (
    SLA_Combined_df
    .groupby('Unique_Link_Identifier_SLA')
    .agg({'QRC_Incl': 'max'})
    .reset_index()
    .rename(columns={'Unique_Link_Identifier_SLA': 'Modified_Link_ID'})
)

# Left-join each aggregate onto the ID column and copy the value column across.
link_id_frame = Merged_Data_SLA_Invoice[['Modified_Link_ID']]
Merged_Data_SLA_Invoice['Invoice QRC'] = link_id_frame.merge(
    invoice_aggr, on='Modified_Link_ID', how='left'
)['Total_QRC']
Merged_Data_SLA_Invoice['SLA QRC'] = link_id_frame.merge(
    SLA_aggr, on='Modified_Link_ID', how='left'
)['QRC_Incl']

# Save the combined and aggregated frames into the MySQL database.

# Connection settings are read from environment variables.
df_to_sql = DataFrameToSQL(
    user=os.getenv('user'),
    password=os.getenv('password'),
    host=os.getenv('host'),
    database=os.getenv('db_name')
)

# Destination table names
SLA_table_name = 'sla_combined_table'
invoice_table_name = 'invoice_combined_table'
aggregate_df_name = 'aggregate_df'

# (frame, destination table, primary-key column), written in this order:
# SLA data, then invoice data, then the aggregated link table.
tables_to_save = (
    (SLA_Combined_df, SLA_table_name, 'Unique_Link_Identifier_SLA'),
    (Invoice_Combined_df, invoice_table_name, 'Unique_Link_Identifier_Invoice'),
    (Merged_Data_SLA_Invoice, aggregate_df_name, "Modified_Link_ID"),
)

for frame, table_name, pk in tables_to_save:
    df_to_sql.save_to_mysql(df=frame,
                            table_name=table_name,
                            primary_key=pk)

print('\n\t Imagine Imerun !!!\t\nWewe ni Mtu wa Power')


Processed and Saved: C:\Users\USER\OneDrive\KRA WORK\2024 WS\SLA EDA\SLA_ETL\Final Model\Processsed_Data\LTK_SLA.csv
Processed and Saved: C:\Users\USER\OneDrive\KRA WORK\2024 WS\SLA EDA\SLA_ETL\Final Model\Processsed_Data\LTK_Invoice.csv
Coordinates not found for location: Lwakhakha
Processed and Saved: C:\Users\USER\OneDrive\KRA WORK\2024 WS\SLA EDA\SLA_ETL\Final Model\Processsed_Data\Saf_SLA.csv
Processed and Saved: C:\Users\USER\OneDrive\KRA WORK\2024 WS\SLA EDA\SLA_ETL\Final Model\Processsed_Data\Saf_Invoice.csv
SLA Combined Saved to C:\Users\USER\OneDrive\KRA WORK\2024 WS\SLA EDA\SLA_ETL\Final Model\Processsed_Data\SLA_Combined.csv
Invoice Combined Saved to C:\Users\USER\OneDrive\KRA WORK\2024 WS\SLA EDA\SLA_ETL\Final Model\Processsed_Data\Invoice_Combined.csv
DataFrame saved to sla_combined_table table in MySQL database kra_sla_etl_project.
DataFrame saved to invoice_combined_table table in MySQL database kra_sla_etl_project.
DataFrame saved to aggregate_df table in MySQL databas

### Testing 

In [4]:
import os

import pandas as pd

# Folder holding the CSVs written by the pipeline cell. Prefer the same
# 'processed_data' setting the pipeline uses; fall back to the original
# hard-coded location when it is not set. The fallback is a raw string so the
# backslashes stay literal: the original literals contained "\S", which is an
# invalid escape sequence that newer Python versions warn about.
PROCESSED_DIR = os.getenv('processed_data') or (
    r'C:\Users\USER\OneDrive\KRA WORK\2024 WS\SLA EDA\SLA_ETL\Final Model\Processsed_Data'
)


def _read_processed(file_name):
    """Load one CSV from PROCESSED_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(PROCESSED_DIR, file_name))


# SLA outputs: one per provider plus the combined file
ltk_sla_df = _read_processed('LTK_SLA.csv')
saf_sla_df = _read_processed('Saf_SLA.csv')
combined_sla = _read_processed('SLA_Combined.csv')

# Invoice outputs: one per provider plus the combined file
ltk_invoice_df = _read_processed('LTK_Invoice.csv')
saf_invoice_df = _read_processed('Saf_Invoice.csv')
combined_Invoice = _read_processed('Invoice_Combined.csv')

In [5]:
# Preview the first rows of the LTK invoice CSV loaded above.
ltk_invoice_df.head()

Unnamed: 0,Invoice_Date,Link_ID,SLA_Date,Invoice_Period,Invoice_Description,Invoice_Reference,Total_QRC,Invoice_Quarter,Invoice_Fyr,modified_Link_ID,Unique_Link_Identifier_Invoice,Service_Provider
0,2023-10-13,26046,2020-10-01,01-Oct-2023 to 31-Dec-2023,26046 National MPLS KRA MBITA TO VRF (INS-57),CRN-15473,-55986.24,Q2,Fyr 2023/24,2020-10-LTK00_26046,CRN-15473_26046_2020,LTK
1,2023-10-06,C-00159-0087,,01-Oct-2023 to 31-Dec-2023,C-00159-0087 EPL Connection charge 80MBPS->SAM...,116426,417600.0,Q2,Fyr 2023/24,0000-00-LTK01_C-00159-0087,116426_C-00159-0087_0000,LTK
2,2023-10-06,1750,2020-10-01,01-Oct-2023 to 31-Dec-2023,01750 National MPLS KRA - TIMES TOWERS TO VRF,116426,939600.0,Q2,Fyr 2023/24,2020-10-LTK00_1750,116426_1750_2020,LTK
3,2023-10-06,5611,2020-10-01,01-Oct-2023 to 31-Dec-2023,05611 National MPLS Msa link vlan 460 customs,116426,71835.9,Q2,Fyr 2023/24,2020-10-LTK00_5611,116426_5611_2020,LTK
4,2023-10-06,20103,2020-10-01,01-Oct-2023 to 31-Dec-2023,20103 Leased_Circuits_Naticnal KRA WILSON AIRP...,116426,79866.0,Q2,Fyr 2023/24,2020-10-LTK00_20103,116426_20103_2020,LTK


In [4]:
# Preview the first rows of the Safaricom invoice CSV loaded above.
saf_invoice_df.head()

Unnamed: 0,Invoice_Data,Link_ID,Invoice_Period,Invoice_Description,Invoice_Reference,Total_QRC,Service_Provider
0,2023-10-01,95001135,1st Oct to 31st Dec 2023,CUSTOMS OFFICE MALINDI-WIMAX,B1-10096515502,240145.6128,SAFARICOM
1,2023-10-01,95001574,1st Oct to 31st Dec 2023,KRA LOITOKTOK BORDER WIMAX,B1-10096515502,240145.6128,SAFARICOM
2,2023-10-01,95001628,1st Oct to 31st Dec 2023,KRA KRATI MOMBASA WIMAX,B1-10096515502,240145.6128,SAFARICOM
3,2023-10-01,95004139,1st Oct to 31st Dec 2023,KRA - MANDERA,B1-10096515502,180078.7944,SAFARICOM
4,2023-10-01,95004140,1st Oct to 31st Dec 2023,KRA - Moyale,B1-10096515502,240145.6128,SAFARICOM


In [3]:
# Preview the first rows of the combined SLA CSV loaded above.
combined_sla.head()

Unnamed: 0,Link_ID,SLA_Date,Last_Mile,Capacity_in_Mbps,Location,MRC_Excl,SLM_Comments,QRC_Incl,SLA_ID,Service_Provider
0,1750,2020-10-01,Fibre,90,KRA - TIMES TOWERS TO KRA - KIXP (EADC),270000.0,"16/04/2022: Upgraded 90 Mbps @ KES 649,050 ->...",939600.0,LTK-SLA-(),LTK
1,5277,2020-10-01,Fibre,75,KRA CUSTOMS HSE TO KRA TIMES TOWER,179718.0,Not in Invoice/Renamed/Relocated,625418.64,LTK-SLA-(),LTK
2,5611,2020-10-01,Fibre,8,KRA AIRPORT CONTAINER DEPOT TO CUSTOMS HSE MSA,20642.5,Okay. No change,71835.9,LTK-SLA-(),LTK
3,20103,2020-10-01,Fibre,9,KRA Wilson Airport Customs to Times Towers - V...,22950.0,Okay. No change,79866.0,LTK-SLA-(),LTK
4,20954,2020-10-01,Fibre,20,KRA TIMES TOWERS NBO TO EGERTON UNIVERSITY NJO...,50000.0,Okay. No change,174000.0,LTK-SLA-(),LTK


In [5]:
# Print the column names, non-null counts and dtypes of the combined invoice CSV.
# NOTE(review): the saved output below lists SLA-style columns (Last_Mile,
# QRC_Incl, SLA_ID) rather than invoice columns -- it looks stale; re-run to confirm.
combined_Invoice.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Link_ID           85 non-null     object 
 1   SLA_Date          82 non-null     object 
 2   Last_Mile         85 non-null     object 
 3   Capacity(Mbps)    85 non-null     int64  
 4   Location          83 non-null     object 
 5   MRC_Excl          85 non-null     float64
 6   SLM_Comments      47 non-null     object 
 7   QRC_Incl          85 non-null     float64
 8   SLA_ID            85 non-null     object 
 9   Service_Provider  85 non-null     object 
dtypes: float64(2), int64(1), object(7)
memory usage: 6.8+ KB
