In [3]:
import boto3
import pandas as pd
import io

# some initial variables
bucket_name = "tkh-nyc-energy"
object_key = 'energy_raw_06032024.csv'

# open client
client = boto3.client('s3')

### DOWNLOADING SINGLE OBJECTS FROM A BUCKET ###
response = client.get_object(
    Bucket=bucket_name,
    Key=object_key,
)

# read in data from request
data = response['Body'].read()

# transform into pandas dataframe by reading in bytes
energy_raw = pd.read_csv(io.BytesIO(data))

# print head
energy_raw.head()


Unnamed: 0,borough,account_name,serial_number,funding_origin,total_bill,kwh_consumption,kwh_bill,kw_consumption,kw_bill,year_month,start_date,end_date
0,bronx,adams,7223256,federal,15396.82,128800.0,2808.0,216.0,2808.0,2010-01-01,2009-12-24,2010-01-26
1,bronx,adams,7223256,federal,14556.34,115200.0,2912.0,224.0,2912.0,2010-02-01,2010-01-26,2010-02-25
2,bronx,adams,7223256,federal,13904.98,103200.0,2808.0,216.0,2808.0,2010-03-01,2010-02-25,2010-03-26
3,bronx,adams,7223256,federal,14764.04,105600.0,2704.0,208.0,2704.0,2010-04-01,2010-03-26,2010-04-26
4,bronx,adams,7223256,federal,13729.54,97600.0,2808.0,216.0,2808.0,2010-05-01,2010-04-26,2010-05-24


In [4]:
def selected_columns(df):
    df_selected_column = df[["borough", "account_name","serial_number","funding_origin","total_bill",
                                     'kwh_consumption','kwh_bill', 'kw_consumption', 'kw_bill',
                                     "year_month","start_date", "end_date"]]
    return df_selected_column

energy_raw_selected_column = selected_columns(energy_raw)

In [5]:
def rename_columns(df):
    rename_dict = {
        "borough": "Borough", 
        "account_name": "Account_Name",
        "serial_number": "Serial_Number",
        "funding_origin": "Funding_Origin",
        "total_bill": "Total_Bill",
        "year_month": "Year_Month",
        "kwh_consumption": "KWH_Consumption", 
        "kwh_bill": "KWH_Bill",
        "kw_consumption": "KW_Consumption",
        "kw_bill": "KW_Bill",
        "start_date": "Start_Date",
        "end_date": "End_Date"
    }
    
    df_rename = df.rename(columns=rename_dict)
    return df_rename

energy_rename_column = rename_columns(energy_raw_selected_column)
energy_rename_column.head()

Unnamed: 0,Borough,Account_Name,Serial_Number,Funding_Origin,Total_Bill,KWH_Consumption,KWH_Bill,KW_Consumption,KW_Bill,Year_Month,Start_Date,End_Date
0,bronx,adams,7223256,federal,15396.82,128800.0,2808.0,216.0,2808.0,2010-01-01,2009-12-24,2010-01-26
1,bronx,adams,7223256,federal,14556.34,115200.0,2912.0,224.0,2912.0,2010-02-01,2010-01-26,2010-02-25
2,bronx,adams,7223256,federal,13904.98,103200.0,2808.0,216.0,2808.0,2010-03-01,2010-02-25,2010-03-26
3,bronx,adams,7223256,federal,14764.04,105600.0,2704.0,208.0,2704.0,2010-04-01,2010-03-26,2010-04-26
4,bronx,adams,7223256,federal,13729.54,97600.0,2808.0,216.0,2808.0,2010-05-01,2010-04-26,2010-05-24


### Task 2: Handle Missing Data

In [6]:
def drop_columns(df):
    columns = ["Start_Date","End_Date"]
    df_drop = df.dropna(subset=columns)
    return df_drop

energy_drop_columns = drop_columns(energy_rename_column)
energy_drop_columns.head()

Unnamed: 0,Borough,Account_Name,Serial_Number,Funding_Origin,Total_Bill,KWH_Consumption,KWH_Bill,KW_Consumption,KW_Bill,Year_Month,Start_Date,End_Date
0,bronx,adams,7223256,federal,15396.82,128800.0,2808.0,216.0,2808.0,2010-01-01,2009-12-24,2010-01-26
1,bronx,adams,7223256,federal,14556.34,115200.0,2912.0,224.0,2912.0,2010-02-01,2010-01-26,2010-02-25
2,bronx,adams,7223256,federal,13904.98,103200.0,2808.0,216.0,2808.0,2010-03-01,2010-02-25,2010-03-26
3,bronx,adams,7223256,federal,14764.04,105600.0,2704.0,208.0,2704.0,2010-04-01,2010-03-26,2010-04-26
4,bronx,adams,7223256,federal,13729.54,97600.0,2808.0,216.0,2808.0,2010-05-01,2010-04-26,2010-05-24


In [7]:
def fix_data_types_columns(df):
  df['Start_Date'] = pd.to_datetime(df['Start_Date'])
  df['End_Date'] = pd.to_datetime(df['End_Date'])

  df['Year_Month'] = df['Year_Month'].astype(str) + '-01'
  df['Year_Month'] = pd.to_datetime(df['Year_Month'])

  float_columns = ["Total_Bill", "KWH_Consumption", "KW_Consumption", "KW_Bill", "KWH_Bill"]
  df[float_columns] = df[float_columns].astype(float)

  return df

energy_fix_data_types_column = fix_data_types_columns(energy_drop_columns)
print(energy_fix_data_types_column.dtypes)




Borough                    object
Account_Name               object
Serial_Number              object
Funding_Origin             object
Total_Bill                float64
KWH_Consumption           float64
KWH_Bill                  float64
KW_Consumption            float64
KW_Bill                   float64
Year_Month         datetime64[ns]
Start_Date         datetime64[ns]
End_Date           datetime64[ns]
dtype: object


In [8]:
def clean_textual_column(df):
    columns = ["Borough", "Account_Name","Funding_Origin"]
    for column in columns:
        df[column] = df[column].str.strip()
        df[column] = df[column].str.replace(r'\s+', ' ', regex=True)
        df[column] = df[column].str.replace(r'[^\w\s]', ' ', regex=True)

    return df


energy_clean_textual_column = clean_textual_column(energy_fix_data_types_column)
print(energy_clean_textual_column.head())

  Borough Account_Name Serial_Number Funding_Origin  Total_Bill  \
0   bronx        adams       7223256        federal    15396.82   
1   bronx        adams       7223256        federal    14556.34   
2   bronx        adams       7223256        federal    13904.98   
3   bronx        adams       7223256        federal    14764.04   
4   bronx        adams       7223256        federal    13729.54   

   KWH_Consumption  KWH_Bill  KW_Consumption  KW_Bill          Year_Month  \
0         128800.0    2808.0           216.0   2808.0 2010-01-01 01:00:00   
1         115200.0    2912.0           224.0   2912.0 2010-02-01 01:00:00   
2         103200.0    2808.0           216.0   2808.0 2010-03-01 01:00:00   
3         105600.0    2704.0           208.0   2704.0 2010-04-01 01:00:00   
4          97600.0    2808.0           216.0   2808.0 2010-05-01 01:00:00   

  Start_Date   End_Date  
0 2009-12-24 2010-01-26  
1 2010-01-26 2010-02-25  
2 2010-02-25 2010-03-26  
3 2010-03-26 2010-04-26  
4 20