In [24]:
import boto3
from datetime import datetime

def s3_uploader(
    selection = 'raw', 
    bucket = "tkh-nyc-energy",    
    file_path = '../data/processed/energy_clean.csv'
):
    current_date = datetime.now().strftime("%m%d%Y")

    if selection == 'raw':
        local_file = file_path
        name_s3_file = f"energy_raw_{current_date}.csv"
        s3 = boto3.resource('s3')
        s3.meta.client.upload_file(local_file, bucket, name_s3_file)
        print(f"Successfully uploaded {local_file} to {bucket}/{name_s3_file}")
        
    elif selection == 'cleaned':
        local_file = file_path
        name_s3_file = f"energy_cleaned_{current_date}.csv"
        s3 = boto3.resource('s3')
        s3.meta.client.upload_file(local_file, bucket, name_s3_file)
        print(f"Successfully uploaded {local_file} to {bucket}/{name_s3_file}")
    else:
        print(f"Invalid selection: {selection}")
        return False 


In [25]:
s3_uploader(selection = 'raw', 
            file_path='../data/processed/energy_clean.csv',
            bucket = "tkh-nyc-energy")

Successfully uploaded ../data/processed/energy_clean.csv to tkh-nyc-energy/energy_raw_06052024.csv


In [56]:
import boto3
import pandas as pd
import io

# some initial variables
bucket_name = "tkh-nyc-energy"
object_key = 'energy_raw_06032024.csv'

# open client
client = boto3.client('s3')

### DOWNLOADING SINGLE OBJECTS FROM A BUCKET ###
response = client.get_object(
    Bucket=bucket_name,
    Key=object_key,
)

# read in data from request
data = response['Body'].read()

# transform into pandas dataframe by reading in bytes
energy_raw = pd.read_csv(io.BytesIO(data))

# print head
energy_raw.head()


Unnamed: 0,borough,account_name,serial_number,funding_origin,total_bill,kwh_consumption,kwh_bill,kw_consumption,kw_bill,year_month,start_date,end_date
0,bronx,adams,7223256,federal,15396.82,128800.0,2808.0,216.0,2808.0,2010-01-01,2009-12-24,2010-01-26
1,bronx,adams,7223256,federal,14556.34,115200.0,2912.0,224.0,2912.0,2010-02-01,2010-01-26,2010-02-25
2,bronx,adams,7223256,federal,13904.98,103200.0,2808.0,216.0,2808.0,2010-03-01,2010-02-25,2010-03-26
3,bronx,adams,7223256,federal,14764.04,105600.0,2704.0,208.0,2704.0,2010-04-01,2010-03-26,2010-04-26
4,bronx,adams,7223256,federal,13729.54,97600.0,2808.0,216.0,2808.0,2010-05-01,2010-04-26,2010-05-24


#### Task 1: Identify and Remove Irrelevant Features

In [57]:
energy_raw_selected_column = energy_raw[["borough", "account_name","serial_number","funding_origin","total_bill",
                                     'kwh_consumption','kwh_bill', 'kw_consumption', 'kw_bill',
                                     "year_month","start_date", "end_date"]]
energy_raw_selected_column.head()

Unnamed: 0,borough,account_name,serial_number,funding_origin,total_bill,kwh_consumption,kwh_bill,kw_consumption,kw_bill,year_month,start_date,end_date
0,bronx,adams,7223256,federal,15396.82,128800.0,2808.0,216.0,2808.0,2010-01-01,2009-12-24,2010-01-26
1,bronx,adams,7223256,federal,14556.34,115200.0,2912.0,224.0,2912.0,2010-02-01,2010-01-26,2010-02-25
2,bronx,adams,7223256,federal,13904.98,103200.0,2808.0,216.0,2808.0,2010-03-01,2010-02-25,2010-03-26
3,bronx,adams,7223256,federal,14764.04,105600.0,2704.0,208.0,2704.0,2010-04-01,2010-03-26,2010-04-26
4,bronx,adams,7223256,federal,13729.54,97600.0,2808.0,216.0,2808.0,2010-05-01,2010-04-26,2010-05-24


### Task 1: Remove columns

In [78]:
energy_raw_selected_column.rename(columns= 
                                {"borough":"Borough", 
                                "account_name" :"Account_Name",
                                "serial_number":"Serial_Number",
                                "funding_origin":"Funding_Origin",
                                "total_bill":"Total_Bill",
                                "year_month":"Year_Month",
                                "kwh_consumption": "KWH_Consumption", 
                                "kwh_bill": "KWH_Bill",
                                "kw_consumption":"KW_Consumption",
                                "kw_bill": "KW_Bill",
                                "start_date": "Start_Date",
                                "end_date":"End_Date"},inplace = True)

energy_raw_selected_column.head(2)

Unnamed: 0,Borough,Account_Name,Serial_Number,Funding_Origin,Total_Bill,KWH_Consumption,KWH_Bill,KW_Consumption,KW_Bill,Year_Month,Start_Date,End_Date
0,bronx,adams,7223256,federal,15396.82,128800.0,2808.0,216.0,2808.0,2010-01-01,2009-12-24,2010-01-26
1,bronx,adams,7223256,federal,14556.34,115200.0,2912.0,224.0,2912.0,2010-02-01,2010-01-26,2010-02-25


In [79]:
energy_handle_missing_data = energy_raw_selected_column .copy()
energy_handle_missing_data.isnull().sum().sum()

0

In [80]:
energy_handle_missing_data.isnull().sum()

Borough            0
Account_Name       0
Serial_Number      0
Funding_Origin     0
Total_Bill         0
KWH_Consumption    0
KWH_Bill           0
KW_Consumption     0
KW_Bill            0
Year_Month         0
Start_Date         0
End_Date           0
dtype: int64

In [85]:
energy_handle_missing_data[energy_handle_missing_data.Start_Date.isnull()]
energy_handle_missing_data.head(5)


Unnamed: 0,Borough,Account_Name,Serial_Number,Funding_Origin,Total_Bill,KWH_Consumption,KWH_Bill,KW_Consumption,KW_Bill,Year_Month,Start_Date,End_Date
0,bronx,adams,7223256,federal,15396.82,128800.0,2808.0,216.0,2808.0,2010-01-01,2009-12-24,2010-01-26
1,bronx,adams,7223256,federal,14556.34,115200.0,2912.0,224.0,2912.0,2010-02-01,2010-01-26,2010-02-25
2,bronx,adams,7223256,federal,13904.98,103200.0,2808.0,216.0,2808.0,2010-03-01,2010-02-25,2010-03-26
3,bronx,adams,7223256,federal,14764.04,105600.0,2704.0,208.0,2704.0,2010-04-01,2010-03-26,2010-04-26
4,bronx,adams,7223256,federal,13729.54,97600.0,2808.0,216.0,2808.0,2010-05-01,2010-04-26,2010-05-24


### Task 2: Handle Missing Data

In [82]:
columns_to_check= ["Start_Date","End_Date"]
energy_dropna_columns = energy_handle_missing_data.dropna(subset= columns_to_check)

In [86]:
energy_dropna_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447841 entries, 0 to 447840
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Borough          447841 non-null  object 
 1   Account_Name     447841 non-null  object 
 2   Serial_Number    447841 non-null  object 
 3   Funding_Origin   447841 non-null  object 
 4   Total_Bill       447841 non-null  float64
 5   KWH_Consumption  447841 non-null  float64
 6   KWH_Bill         447841 non-null  float64
 7   KW_Consumption   447841 non-null  float64
 8   KW_Bill          447841 non-null  float64
 9   Year_Month       447841 non-null  object 
 10  Start_Date       447841 non-null  object 
 11  End_Date         447841 non-null  object 
dtypes: float64(5), object(7)
memory usage: 41.0+ MB


### Task 3: fix the data type

In [87]:
energy_dropna_columns['Start_Date'] = pd.to_datetime(energy_dropna_columns['Start_Date'])
energy_dropna_columns['End_Date'] = pd.to_datetime(energy_dropna_columns['End_Date'])


energy_dropna_columns['Year_Month'] = energy_dropna_columns['Year_Month'].astype(str) + '-01'
energy_dropna_columns['Year_Month'] = pd.to_datetime(energy_dropna_columns['Year_Month'])
#energy_selected_column['revenue_month']  = energy_selected_column['revenue_month'].dt.strftime('%Y-%m-%d')


energy_dropna_columns['Total_Bill'] = energy_dropna_columns['Total_Bill'].astype(float)
energy_dropna_columns['KWH_Consumption'] = energy_dropna_columns ['KWH_Consumption'].astype(float)
energy_dropna_columns['KW_Consumption'] = energy_dropna_columns ['KW_Consumption'].astype(float)
energy_dropna_columns['KWH_Consumption'] = energy_dropna_columns ['KWH_Consumption'].astype(float)
energy_dropna_columns['KWH_Bill'] = energy_dropna_columns ['KW_Bill'].astype(float)
energy_dropna_columns.dtypes


Borough                    object
Account_Name               object
Serial_Number              object
Funding_Origin             object
Total_Bill                float64
KWH_Consumption           float64
KWH_Bill                  float64
KW_Consumption            float64
KW_Bill                   float64
Year_Month         datetime64[ns]
Start_Date         datetime64[ns]
End_Date           datetime64[ns]
dtype: object

In [88]:
energy_dropna_columns.head()

Unnamed: 0,Borough,Account_Name,Serial_Number,Funding_Origin,Total_Bill,KWH_Consumption,KWH_Bill,KW_Consumption,KW_Bill,Year_Month,Start_Date,End_Date
0,bronx,adams,7223256,federal,15396.82,128800.0,2808.0,216.0,2808.0,2010-01-01 01:00:00,2009-12-24,2010-01-26
1,bronx,adams,7223256,federal,14556.34,115200.0,2912.0,224.0,2912.0,2010-02-01 01:00:00,2010-01-26,2010-02-25
2,bronx,adams,7223256,federal,13904.98,103200.0,2808.0,216.0,2808.0,2010-03-01 01:00:00,2010-02-25,2010-03-26
3,bronx,adams,7223256,federal,14764.04,105600.0,2704.0,208.0,2704.0,2010-04-01 01:00:00,2010-03-26,2010-04-26
4,bronx,adams,7223256,federal,13729.54,97600.0,2808.0,216.0,2808.0,2010-05-01 01:00:00,2010-04-26,2010-05-24


# Task 4: Clean Textual Data
- If applicable, clean textual data by removing extra spaces, correcting typos
- standardizing text (e.g., converting to lowercase).

In [90]:
energy_clean_textual_data = energy_dropna_columns.copy()
energy_clean_textual_data.head(2)

Unnamed: 0,Borough,Account_Name,Serial_Number,Funding_Origin,Total_Bill,KWH_Consumption,KWH_Bill,KW_Consumption,KW_Bill,Year_Month,Start_Date,End_Date
0,bronx,adams,7223256,federal,15396.82,128800.0,2808.0,216.0,2808.0,2010-01-01 01:00:00,2009-12-24,2010-01-26
1,bronx,adams,7223256,federal,14556.34,115200.0,2912.0,224.0,2912.0,2010-02-01 01:00:00,2010-01-26,2010-02-25


In [95]:
# Convert to uppercase
energy_clean_textual_data['Borough'] = energy_clean_textual_data['Borough'].str.upper()
energy_clean_textual_data['Account_Name'] = energy_clean_textual_data['Account_Name'].str.upper()
energy_clean_textual_data['FUunding_Origin'] = energy_clean_textual_data['Funding_Origin'].str.upper()

In [96]:
# Remove extra spaces
energy_clean_textual_data['Borough'] = energy_clean_textual_data['Borough'].str.strip()  
energy_clean_textual_data['Account_Name'] = energy_clean_textual_data['Account_Name'].str.strip()  
energy_clean_textual_data['Funding_Origin'] = energy_clean_textual_data['Funding_Origin'].str.strip() 

# Replace multiple spaces with a single space
energy_clean_textual_data['Borough'] = energy_clean_textual_data['Borough'].str.replace(r'\s+', ' ', regex=True)
energy_clean_textual_data['Account_Name'] = energy_clean_textual_data['Account_Name'].str.replace(r'\s+', ' ', regex=True)

# Correct typos
# spell = Speller(lang='en')
# energy_clean_textual_data['borough'] = energy_clean_textual_data['borough'].apply(lambda x: spell(x))

# Remove punctuation
energy_clean_textual_data['Borough'] = energy_clean_textual_data['Borough'].str.replace(r'[^\w\s]', '', regex=True)
energy_clean_textual_data['Account_Name'] = energy_clean_textual_data['Account_Name'].str.replace(r'[^\w\s]', '', regex=True)

In [32]:
# E - Extract
# T - Transform 
# L - Load