### Installation

In [1]:
!pip install azure-storage-blob # Microoft Azure
!pip install pyarrow
!pip install psycopg2 sqlalchemy

Collecting azure-storage-blob
  Downloading azure_storage_blob-12.20.0-py3-none-any.whl (392 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m392.2/392.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-core>=1.28.0 (from azure-storage-blob)
  Downloading azure_core-1.30.1-py3-none-any.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.4/193.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, azure-core, azure-storage-blob
Successfully installed azure-core-1.30.1 azure-storage-blob-12.20.0 isodate-0.6.1


### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import json
import requests
from io import StringIO
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from math import ceil
import datetime
import calendar
from sqlalchemy import create_engine
from decimal import Decimal

### Download data from staging

In [3]:
# Path to the JSON configuration file that supplies the storage connection string
config_file_path = 'config.json'

# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default locale encoding.
with open(config_file_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# The connection string is read from the config file rather than hardcoded here
CONNECTION_STRING_AZURE_STORAGE = config["connectionString"]
CONTAINER_AZURE = 'commodities-contracts'

# Service-level client built from the connection string
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING_AZURE_STORAGE)

# Client scoped to the single container named above
container_client = blob_service_client.get_container_client(CONTAINER_AZURE)

In [70]:
# Download every blob in the container and parse each one as a UTF-8 CSV file.
df_list = []

blob_list = container_client.list_blobs()
for blob in blob_list:
    print(blob.name)
    blob_client = container_client.get_blob_client(blob=blob.name)
    blob_data = blob_client.download_blob()
    blob_content = blob_data.readall().decode('utf-8')
    df = pd.read_csv(StringIO(blob_content))
    # Log the shape so files with a different column count are easy to spot
    print(df.shape)
    # read_csv already returns a fresh frame, so no defensive copy is needed
    df_list.append(df)

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the container holds no blobs.
if not df_list:
    raise ValueError("No blobs were found in the container; nothing to concatenate")

# Stack all files into one frame with a fresh 0..n-1 index; a column that is
# absent from some files is filled with NaN for those files' rows.
contracts_raw_df = pd.concat(df_list, ignore_index=True)
# info() prints its report itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
contracts_raw_df.info()

FY2014Q1.csv
(863, 13)
FY2014Q2.csv
(885, 13)
FY2014Q3.csv
(868, 13)
FY2014Q4.csv
(859, 13)
FY2015Q1.csv
(893, 13)
FY2015Q2.csv
(907, 14)
FY2015Q3.csv
(831, 14)
FY2015Q4.csv
(822, 14)
FY2016Q1.csv
(840, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7768 entries, 0 to 7767
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Bid_Number                 7767 non-null   object 
 1   Contract_Number            7768 non-null   object 
 2   Contract_Description       7749 non-null   object 
 3   Start_Date                 7768 non-null   int64  
 4   End_Date                   6365 non-null   float64
 5   Contract_Type              7768 non-null   object 
 6   Vendor_Name                7768 non-null   object 
 7   Department_Name            7768 non-null   object 
 8   Contract_Amount            7768 non-null   object 
 9   Total_Contract_Months      6046 non-null   float64
 10  Total_Tra

## Cleaning

In [71]:
# Preview the first ten rows of the combined raw data
contracts_raw_df.head(10)

Unnamed: 0,Bid_Number,Contract_Number,Contract_Description,Start_Date,End_Date,Contract_Type,Vendor_Name,Department_Name,Contract_Amount,Total_Contract_Months,Total_Transactions,quarter,fisco_year,Remaining_Contract_Months
0,V3WO195C,130313,EMC CORPORATION Bid# V3WO195C,20130416,20130416.0,SSE,EMC CORPORATION,MDO-OFFICE OF TECHNOLOGY,"$34,995.62",,34995.62,1,2014,
1,S3XW929S,130315,TELEDYNE INSTRUMENTS INC Bid# S3XW929S,20130417,20130417.0,SSE,TELEDYNE INSTRUMENTS INC,WATER,"$68,604.00",,68604,1,2014,
2,4501GCOR,D11410,LITTLE BUILDERS Bid# 4501GCOR,20130529,20130528.0,PW,LITTLE BUILDERS,FIRE,"$45,701.15",,188557.31,1,2014,
3,NJPA0629,130363,TYCO INTEGRATED SECURITY LLCBid # NJPA0629,20130221,20130720.0,SSE,TYCO INTEGRATED SECURITY LLC,SS&E,1060000,,,1,2014,
4,00007313,130396,APPLE AUTOMOTIVE GROUP INCBid # 00007313,20130410,20130731.0,SSE,APPLE AUTOMOTIVE GROUP INC,SS&E,250000,,,1,2014,
5,S3610-08,B20544,XEROX CORPBid # S3610-08,20110804,20130731.0,SSE,XEROX CORP,SS&E,2506000,36.0,"$53,394.42",1,2014,
6,S6810-02,120164,INTERNATIONAL SALT COMPANY LLCBid # S6810-02,20110801,20130731.0,SSE,INTERNATIONAL SALT COMPANY LLC,SS&E,5000000,0.0,$29.41,1,2014,
7,S1Z56600,110003,# 2 Fuel Oil,20100801,20130731.0,SSE,F C HAAB COMPANY INC,SS&E,3510512,24.0,"$51,379.56",1,2014,
8,S0YL508S,100127,Maintenance for Safeport Vessel Traffic Inform...,20090801,20130731.0,SSE,OBSERVATION TECHNOLOGIES INC,SS&E,210636,36.0,,1,2014,
9,S0XL7380,100130,Scale Maintenance,20090801,20130731.0,SSE,ADVANCE SCALE COMPANY INC,SS&E,170001,36.0,"$1,537.65",1,2014,


In [77]:
# Discard rows that have no bid number and renumber the index from 0
contracts_raw_df = contracts_raw_df.dropna(subset=['Bid_Number'], ignore_index=True)

# Drop contracts that started before 2010: the start year is the first four
# characters of the Start_Date value rendered as text.
start_year = contracts_raw_df['Start_Date'].astype(str).str[:4]
started_before_2010 = start_year < '2010'
filter_index = contracts_raw_df.index[started_before_2010]
contracts_raw_df = contracts_raw_df.drop(index=filter_index)

contracts_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7507 entries, 0 to 7766
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Bid_Number                 7507 non-null   object 
 1   Contract_Number            7507 non-null   object 
 2   Contract_Description       7488 non-null   object 
 3   Start_Date                 7507 non-null   int64  
 4   End_Date                   6104 non-null   float64
 5   Contract_Type              7507 non-null   object 
 6   Vendor_Name                7507 non-null   object 
 7   Department_Name            7507 non-null   object 
 8   Contract_Amount            7507 non-null   object 
 9   Total_Contract_Months      5801 non-null   float64
 10  Total_Transactions         5680 non-null   object 
 11  quarter                    7507 non-null   int64  
 12  fisco_year                 7507 non-null   int64  
 13  Remaining_Contract_Months  2574 non-null   float64
dt

In [112]:
# Work on a copy so that changes made to contracts_clean_df leave contracts_raw_df untouched
contracts_clean_df = contracts_raw_df.copy()

## Reformatting

In [113]:
def _money_to_float(value):
    """Convert an amount such as '$34,995.62' or 1060000 to a float.

    The value is rendered with str() first, so numeric inputs are accepted as
    well as strings, and a missing value (NaN) stays NaN.
    """
    return float(str(value).replace("$", "").replace(",", ""))


# Month counts: missing values become the sentinel -1 so that both columns can
# be stored as int64 instead of float.
for months_col in ('Total_Contract_Months', 'Remaining_Contract_Months'):
    contracts_clean_df[months_col] = (
        contracts_clean_df[months_col].fillna(-1).astype(np.int64))

# Start_Date holds YYYYMMDD integers; parse the whole column in one call.
contracts_clean_df['Start_Date'] = pd.to_datetime(
    contracts_clean_df['Start_Date'].astype(str), format='%Y%m%d')

# End_Date holds YYYYMMDD values stored as floats because some are missing.
# Only the known values are parsed; index alignment on assignment leaves the
# rows without an end date as NaT.
known_end_dates = contracts_clean_df['End_Date'].dropna()
contracts_clean_df['End_Date'] = pd.to_datetime(
    known_end_dates.astype(np.int64).astype(str), format='%Y%m%d')

# Both money columns go through the same converter, so a non-string value in
# Contract_Amount no longer raises AttributeError on `.replace`.
for money_col in ('Contract_Amount', 'Total_Transactions'):
    contracts_clean_df[money_col] = contracts_clean_df[money_col].apply(_money_to_float)

contracts_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7507 entries, 0 to 7766
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Bid_Number                 7507 non-null   object        
 1   Contract_Number            7507 non-null   object        
 2   Contract_Description       7488 non-null   object        
 3   Start_Date                 7507 non-null   datetime64[ns]
 4   End_Date                   6104 non-null   datetime64[ns]
 5   Contract_Type              7507 non-null   object        
 6   Vendor_Name                7507 non-null   object        
 7   Department_Name            7507 non-null   object        
 8   Contract_Amount            7507 non-null   float64       
 9   Total_Contract_Months      7507 non-null   int64         
 10  Total_Transactions         5680 non-null   float64       
 11  quarter                    7507 non-null   int64         
 12  fisco_year 

## Transformation

### Create Date Dimension

In [36]:
# Create Date Dimension

# Function to get week of month
def week_of_month(dt):
    """Return the 1-based week of the month that ``dt`` falls in.

    Weeks are counted in fixed 7-day blocks starting on the 1st of the month
    (days 1-7 -> 1, days 8-14 -> 2, ...), independent of the weekday.

    Parameters
    ----------
    dt : object with a ``day`` attribute (e.g. ``datetime.date``, ``pd.Timestamp``)

    Returns
    -------
    int
        A value from 1 to 5.
    """
    return (dt.day - 1) // 7 + 1

In [31]:
# Find the overall date span covered by contract start and end dates,
# ignoring missing values.
start_dates = contracts_clean_df['Start_Date'].dropna()
end_dates = contracts_clean_df['End_Date'].dropna()

min_start_date, max_start_date = start_dates.min(), start_dates.max()
min_end_date, max_end_date = end_dates.min(), end_dates.max()

my_start_date = min(min_start_date, min_end_date)
my_end_date = max(max_start_date, max_end_date)
print(my_start_date, my_end_date)

2010-01-01 00:00:00 2016-09-30 00:00:00


In [32]:
# Date dimension skeleton: one row per calendar day from my_start_date to my_end_date
calendar_days = pd.date_range(start=my_start_date, end=my_end_date, freq='D')
date_df = pd.DataFrame({'date': calendar_days})

date_df.head(15)

Unnamed: 0,date
0,2010-01-01
1,2010-01-02
2,2010-01-03
3,2010-01-04
4,2010-01-05
5,2010-01-06
6,2010-01-07
7,2010-01-08
8,2010-01-09
9,2010-01-10


In [40]:
# Derive the calendar attributes of every day in a single assign() call
dates = date_df['date']
date_df = date_df.assign(
    year=dates.dt.year,
    quarter=dates.dt.quarter,
    month=dates.dt.month,
    day=dates.dt.day,
    month_name=dates.dt.strftime('%B'),
    day_name=dates.dt.strftime('%A'),
    # Text key in the form YYYYMMDDHH
    date_id=dates.dt.strftime('%Y%m%d%H'),
    week_of_month=dates.apply(week_of_month),
    # %U: week number of the year with Sunday as the first day of the week
    week_of_year=dates.dt.strftime('%U'),
)

# Put the key first, followed by the date and its attributes
new_order = ['date_id','date','year','quarter','month','day','month_name','day_name','week_of_month','week_of_year']
date_df = date_df.loc[:, new_order]

date_df.head(15)

Unnamed: 0,date_id,date,year,quarter,month,day,month_name,day_name,week_of_month,week_of_year
0,2010010100,2010-01-01,2010,1,1,1,January,Friday,1,0
1,2010010200,2010-01-02,2010,1,1,2,January,Saturday,1,0
2,2010010300,2010-01-03,2010,1,1,3,January,Sunday,1,1
3,2010010400,2010-01-04,2010,1,1,4,January,Monday,1,1
4,2010010500,2010-01-05,2010,1,1,5,January,Tuesday,1,1
5,2010010600,2010-01-06,2010,1,1,6,January,Wednesday,1,1
6,2010010700,2010-01-07,2010,1,1,7,January,Thursday,1,1
7,2010010800,2010-01-08,2010,1,1,8,January,Friday,2,1
8,2010010900,2010-01-09,2010,1,1,9,January,Saturday,2,1
9,2010011000,2010-01-10,2010,1,1,10,January,Sunday,2,2


### Create Contract Dimension

In [64]:
# One row per (Contract_Number, Bid_Number) pair; when a pair occurs more than
# once, the last occurrence is the one that is kept.
contract_key_cols = ['Contract_Number', 'Bid_Number']
contract_df = (
    contracts_clean_df[contract_key_cols + ['Contract_Description']]
    .drop_duplicates(subset=contract_key_cols, keep='last')
)
contract_df.head()

Unnamed: 0,Contract_Number,Bid_Number,Contract_Description
1,120164,S6810-02,INTERNATIONAL SALT COMPANY LLCBid # S6810-02
9,130069,S3YQ0520,Inspection and Repair to Airport Crash Rescue ...
11,130286,00010144,DAY FORD INCBid # 00010144
12,130287,00010145,HERTRICH FLEET SERVICES INCBid # 00010145
13,130288,00010456,HERTRICH FLEET SERVICES INCBid # 00010456


### Create Contract Type Dimension

In [121]:
# Fixed ids for the known contract type codes
contract_type_mapping = {
    'PW': 1,   # Public Works contract
    'SSE': 2,  # supplies, equipment, and non-professional services
}

# One row per distinct contract type present in the cleaned data
unique_types = contracts_clean_df['Contract_Type'].unique()
contract_type_df = pd.DataFrame({'contract_type': unique_types})

# Look up the id of each type; a type that is absent from the mapping gets NaN
contract_type_df = contract_type_df.assign(
    contract_type_id=contract_type_df['contract_type'].map(contract_type_mapping))
contract_type_df

Unnamed: 0,contract_type,contract_type_id
0,SSE,2
1,PW,1


In [122]:
# Put the id column first
new_order = ['contract_type_id', 'contract_type']
contract_type_df = contract_type_df.loc[:, new_order]
contract_type_df

Unnamed: 0,contract_type_id,contract_type
0,2,SSE
1,1,PW


### Create Department Dimension

In [125]:
# Distinct department names, in order of first appearance in the cleaned data
unique_departments = contracts_clean_df['Department_Name'].unique()

# Sequential ids starting at 1, with the id column placed first
new_order = ['department_id', 'department_name']
department_df = pd.DataFrame({
    'department_id': range(1, len(unique_departments) + 1),
    'department_name': unique_departments,
})[new_order]
department_df

Unnamed: 0,department_id,department_name
0,1,MDO-OFFICE OF TECHNOLOGY
1,2,WATER
2,3,FIRE
3,4,SS&E
4,5,RECREATION
5,6,STREETS
6,7,FLEET MANAGEMENT
7,8,PRISONS
8,9,COMMERCE
9,10,PUBLIC PROPERTY


### Create Vendor Dimension

In [56]:
# Quick reference: list the column names of the cleaned frame
contracts_clean_df.columns

Index(['Bid_Number', 'Contract_Number', 'Contract_Description', 'Start_Date',
       'End_Date', 'Contract_Type', 'Vendor_Name', 'Department_Name',
       'Contract_Amount', 'Total_Contract_Months', 'Total_Transactions',
       'quarter', 'fisco_year', 'Remaining_Contract_Months'],
      dtype='object')

In [126]:
# Distinct vendor names, in order of first appearance in the cleaned data
unique_vendors = contracts_clean_df['Vendor_Name'].unique()

# Sequential ids starting at 1, with the id column placed first
new_order = ['vendor_id', 'vendor_name']
vendor_df = pd.DataFrame({
    'vendor_id': range(1, len(unique_vendors) + 1),
    'vendor_name': unique_vendors,
})[new_order]
vendor_df

Unnamed: 0,vendor_id,vendor_name
0,1,EMC CORPORATION
1,2,TELEDYNE INSTRUMENTS INC
2,3,LITTLE BUILDERS
3,4,TYCO INTEGRATED SECURITY LLC
4,5,APPLE AUTOMOTIVE GROUP INC
...,...,...
678,679,MULTI MEASUREMENTS INC
679,680,JOHNSON CONTROLS INC
680,681,WILLIAM BETZ JR INC
681,682,DEMOUNTABLE CONCEPTS INC


### Create Fact Table