### Installation

In [1]:
!pip install azure-storage-blob # Microoft Azure
!pip install pyarrow
!pip install psycopg2 sqlalchemy

Collecting azure-storage-blob
  Downloading azure_storage_blob-12.20.0-py3-none-any.whl (392 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m392.2/392.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-core>=1.28.0 (from azure-storage-blob)
  Downloading azure_core-1.30.1-py3-none-any.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.4/193.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting isodate>=0.6.1 (from azure-storage-blob)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, azure-core, azure-storage-blob
Successfully installed azure-core-1.30.1 azure-storage-blob-12.20.0 isodate-0.6.1


### Import Libraries

In [19]:
import pandas as pd
import numpy as np
import json
import requests
from io import StringIO
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from math import ceil
import datetime
import calendar
from sqlalchemy import create_engine
from decimal import Decimal

### Download data from staging

In [9]:
# Specify the path to your JSON configuration file
config_file_path = 'config.json'

# Load the JSON configuration file
with open(config_file_path, 'r') as config_file:
    config = json.load(config_file)

CONNECTION_STRING_AZURE_STORAGE = config["connectionString"]
CONTAINER_AZURE = 'commodities-contracts'

# Initialize the BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING_AZURE_STORAGE)

# Get the container client
container_client = blob_service_client.get_container_client(CONTAINER_AZURE)

In [53]:
df_list = []

# List all blobs in the specified container
blob_list = container_client.list_blobs()
for blob in blob_list:
    print(blob.name)
    blob_client = container_client.get_blob_client(blob=blob.name)
    blob_data = blob_client.download_blob()
    blob_content = blob_data.readall().decode('utf-8')
    df = pd.read_csv(StringIO(blob_content))
    # Display the head of the DataFrame
    print(df.shape)
    # sind I have only one csv, I am doing to do the following instructions
    df_list.append(df.copy())

contracts_raw_df = pd.concat(df_list, ignore_index=True)
print(contracts_raw_df.info())

FY2014Q1.csv
(863, 13)
FY2014Q2.csv
(885, 13)
FY2014Q3.csv
(868, 13)
FY2014Q4.csv
(859, 13)
FY2015Q1.csv
(893, 13)
FY2015Q2.csv
(907, 14)
FY2015Q3.csv
(831, 14)
FY2015Q4.csv
(822, 14)
FY2016Q1.csv
(840, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7768 entries, 0 to 7767
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Bid_Number                 7767 non-null   object 
 1   Contract_Number            7768 non-null   object 
 2   Contract_Description       7749 non-null   object 
 3   Start_Date                 7768 non-null   int64  
 4   End_Date                   6365 non-null   float64
 5   Contract_Type              7768 non-null   object 
 6   Vendor_Name                7768 non-null   object 
 7   Department_Name            7768 non-null   object 
 8   Contract_Amount            7768 non-null   object 
 9   Total_Contract_Months      6046 non-null   float64
 10  Total_Tra

## Cleaning

In [54]:
contracts_raw_df.head()

Unnamed: 0,Bid_Number,Contract_Number,Contract_Description,Start_Date,End_Date,Contract_Type,Vendor_Name,Department_Name,Contract_Amount,Total_Contract_Months,Total_Transactions,quarter,fisco_year,Remaining_Contract_Months
0,V3WO195C,130313,EMC CORPORATION Bid# V3WO195C,20130416,20130416.0,SSE,EMC CORPORATION,MDO-OFFICE OF TECHNOLOGY,"$34,995.62",,34995.62,1,2014,
1,S3XW929S,130315,TELEDYNE INSTRUMENTS INC Bid# S3XW929S,20130417,20130417.0,SSE,TELEDYNE INSTRUMENTS INC,WATER,"$68,604.00",,68604.0,1,2014,
2,4501GCOR,D11410,LITTLE BUILDERS Bid# 4501GCOR,20130529,20130528.0,PW,LITTLE BUILDERS,FIRE,"$45,701.15",,188557.31,1,2014,
3,NJPA0629,130363,TYCO INTEGRATED SECURITY LLCBid # NJPA0629,20130221,20130720.0,SSE,TYCO INTEGRATED SECURITY LLC,SS&E,1060000,,,1,2014,
4,00007313,130396,APPLE AUTOMOTIVE GROUP INCBid # 00007313,20130410,20130731.0,SSE,APPLE AUTOMOTIVE GROUP INC,SS&E,250000,,,1,2014,


In [55]:
contracts_clean_df = contracts_raw_df.dropna(
      subset = ['Bid_Number', 'End_Date', 'Total_Contract_Months'],
      ignore_index = True)

contracts_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6046 entries, 0 to 6045
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Bid_Number                 6046 non-null   object 
 1   Contract_Number            6046 non-null   object 
 2   Contract_Description       6027 non-null   object 
 3   Start_Date                 6046 non-null   int64  
 4   End_Date                   6046 non-null   float64
 5   Contract_Type              6046 non-null   object 
 6   Vendor_Name                6046 non-null   object 
 7   Department_Name            6046 non-null   object 
 8   Contract_Amount            6046 non-null   object 
 9   Total_Contract_Months      6046 non-null   float64
 10  Total_Transactions         4276 non-null   object 
 11  quarter                    6046 non-null   int64  
 12  fisco_year                 6046 non-null   int64  
 13  Remaining_Contract_Months  2614 non-null   float

## Reformatting

In [56]:
contracts_clean_df['Total_Contract_Months'] = contracts_clean_df['Total_Contract_Months'].astype(np.int64)
# contracts_df['Remaining_Contract_Months'] = contracts_df['Remaining_Contract_Months'].astype(np.int64)
contracts_clean_df['Start_Date'] = contracts_clean_df['Start_Date'].apply(lambda x: datetime.datetime.strptime(str(x), '%Y%m%d'))
contracts_clean_df['End_Date'] = contracts_clean_df['End_Date'].apply(lambda x: datetime.datetime.strptime(str(int(x)), '%Y%m%d'))
contracts_clean_df['Contract_Amount'] = contracts_clean_df['Contract_Amount'].apply(
    lambda x: float(x.replace("$","").replace(",","")))
contracts_clean_df['Total_Transactions'] = contracts_clean_df['Total_Transactions'].apply(
    lambda x: float(str(x).replace("$","").replace(",","")))

contracts_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6046 entries, 0 to 6045
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Bid_Number                 6046 non-null   object        
 1   Contract_Number            6046 non-null   object        
 2   Contract_Description       6027 non-null   object        
 3   Start_Date                 6046 non-null   datetime64[ns]
 4   End_Date                   6046 non-null   datetime64[ns]
 5   Contract_Type              6046 non-null   object        
 6   Vendor_Name                6046 non-null   object        
 7   Department_Name            6046 non-null   object        
 8   Contract_Amount            6046 non-null   float64       
 9   Total_Contract_Months      6046 non-null   int64         
 10  Total_Transactions         4276 non-null   float64       
 11  quarter                    6046 non-null   int64         
 12  fisco_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contracts_clean_df['Total_Contract_Months'] = contracts_clean_df['Total_Contract_Months'].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  contracts_clean_df['Start_Date'] = contracts_clean_df['Start_Date'].apply(lambda x: datetime.datetime.strptime(str(x), '%Y%m%d'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r