In [1]:
import logging

import boto3
import numpy as np
import pandas as pd


In [2]:
def download_file_from_s3(bucket, object_name, local_file_name):
    """
    Fetch a single S3 object and write it to a local path.

    :param bucket: Name of the S3 bucket holding the object
    :param object_name: Key of the object inside the bucket
    :param local_file_name: Destination path on the local file system
    """
    # An explicit Session is created for locally configured credentials.
    # On a host with an IAM role, boto3.client('s3') could be used instead;
    # boto3 then retrieves credentials from the instance metadata.
    boto3.Session().client('s3').download_file(bucket, object_name, local_file_name)

# Source object in the bucket and the local path it is saved to
bucket_name, s3_file_name, local_file = (
    'myukdata',
    'Original/CommonDivision/CommonDivisionMembers/CommonDivisionMembers.csv',
    'CommonDivisionMembers.csv',
)

download_file_from_s3(
    bucket=bucket_name,
    object_name=s3_file_name,
    local_file_name=local_file,
)



In [3]:
# Lift the row limit pandas applies when rendering a frame
pd.set_option('display.max_rows', None)

# Load the downloaded CSV and print its row and column counts
df = pd.read_csv('CommonDivisionMembers.csv')
print(f"{len(df)} and {len(df.columns)}")

# Per-column summary (dtype, non-null count), then a preview of the first rows
df.info()
df.head()

22606 and 24
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22606 entries, 0 to 22605
Data columns (total 24 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   MemberId                                  22606 non-null  int64  
 1   MemberVotedAye                            22606 non-null  bool   
 2   MemberWasTeller                           22606 non-null  bool   
 3   PublishedDivision.DivisionId              22606 non-null  int64  
 4   PublishedDivision.Date                    22606 non-null  object 
 5   PublishedDivision.PublicationUpdated      22606 non-null  object 
 6   PublishedDivision.Number                  22606 non-null  int64  
 7   PublishedDivision.IsDeferred              22606 non-null  bool   
 8   PublishedDivision.EVELType                104 non-null    object 
 9   PublishedDivision.EVELCountry             104 non-null    object 
 10  PublishedDivision.Tit

Unnamed: 0,MemberId,MemberVotedAye,MemberWasTeller,PublishedDivision.DivisionId,PublishedDivision.Date,PublishedDivision.PublicationUpdated,PublishedDivision.Number,PublishedDivision.IsDeferred,PublishedDivision.EVELType,PublishedDivision.EVELCountry,...,PublishedDivision.DoubleMajorityNoCount,PublishedDivision.AyeTellers,PublishedDivision.NoTellers,PublishedDivision.Ayes,PublishedDivision.Noes,PublishedDivision.FriendlyDescription,PublishedDivision.FriendlyTitle,PublishedDivision.NoVoteRecorded,PublishedDivision.RemoteVotingStart,PublishedDivision.RemoteVotingEnd
0,2,True,False,732,2019-10-29T14:03:00,2019-10-29T20:45:45,14,False,,,...,,"[{'MemberId': 4433, 'Name': 'Thangam Debbonair...","[{'MemberId': 4032, 'Name': 'Stuart Andrew', '...",[],[],,,[],,
1,2,False,False,730,2019-10-24T17:30:00,2019-10-25T10:36:13,12,False,,,...,,"[{'MemberId': 4447, 'Name': 'Maggie Throup', '...","[{'MemberId': 4433, 'Name': 'Thangam Debbonair...",[],[],,,[],,
2,2,True,False,728,2019-10-24T16:59:00,2019-10-24T18:08:59,10,False,,,...,,"[{'MemberId': 4433, 'Name': 'Thangam Debbonair...","[{'MemberId': 3992, 'Name': 'James Morris', 'P...",[],[],,,[],,
3,2,True,False,727,2019-10-23T18:59:00,2019-10-23T19:21:56,9,False,,,...,,"[{'MemberId': 4456, 'Name': 'Jeff Smith', 'Par...","[{'MemberId': 4407, 'Name': 'Nigel Huddleston'...",[],[],,,[],,
4,2,False,False,726,2019-10-23T00:00:00,2019-10-23T15:32:07,8,True,,,...,,,,[],[],,,[],,


In [4]:
# Columns removed from the raw frame before further processing
columns_to_drop = [
    'PublishedDivision.EVELType',
    'PublishedDivision.EVELCountry',
    'PublishedDivision.DoubleMajorityAyeCount',
    'PublishedDivision.DoubleMajorityNoCount',
    'PublishedDivision.AyeTellers',
    'PublishedDivision.NoTellers',
    'PublishedDivision.Ayes',
    'PublishedDivision.Noes',
    'PublishedDivision.FriendlyDescription',
    'PublishedDivision.FriendlyTitle',
    'PublishedDivision.NoVoteRecorded',
    'PublishedDivision.RemoteVotingStart',
    'PublishedDivision.RemoteVotingEnd',
]

# Rebind the result instead of mutating with inplace=True: drop() returns a
# new frame, which keeps the step explicit and chainable.
df = df.drop(columns=columns_to_drop)

In [5]:
# Re-inspect the schema after the column drop
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22606 entries, 0 to 22605
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   MemberId                              22606 non-null  int64 
 1   MemberVotedAye                        22606 non-null  bool  
 2   MemberWasTeller                       22606 non-null  bool  
 3   PublishedDivision.DivisionId          22606 non-null  int64 
 4   PublishedDivision.Date                22606 non-null  object
 5   PublishedDivision.PublicationUpdated  22606 non-null  object
 6   PublishedDivision.Number              22606 non-null  int64 
 7   PublishedDivision.IsDeferred          22606 non-null  bool  
 8   PublishedDivision.Title               22606 non-null  object
 9   PublishedDivision.AyeCount            22606 non-null  int64 
 10  PublishedDivision.NoCount             22606 non-null  int64 
dtypes: bool(3), int64(5), object

In [6]:
# Map the raw column names to short camelCase names.
# NOTE(review): 'divisiontitle' is all lowercase, unlike the other targets;
# kept unchanged because the uploaded CSV schema may already be relied upon.
column_names = {
    'MemberId': 'memberId',
    'MemberVotedAye': 'memberVotedAye',
    'MemberWasTeller': 'memberWasTeller',
    'PublishedDivision.DivisionId': 'divisionId',
    'PublishedDivision.Date': 'divisionDate',
    'PublishedDivision.PublicationUpdated': 'updatedDate',
    'PublishedDivision.Number': 'nthDivision',
    'PublishedDivision.IsDeferred': 'divisionDeferred',
    'PublishedDivision.Title': 'divisiontitle',
    'PublishedDivision.AyeCount': 'ayeCount',
    'PublishedDivision.NoCount': 'noCount',
}

# rename() and assign() each return a new frame, so no inplace mutation is
# needed. assign() overwrites the two existing date columns in position,
# converting them with pd.to_datetime.
df = df.rename(columns=column_names).assign(
    divisionDate=lambda frame: pd.to_datetime(frame['divisionDate']),
    updatedDate=lambda frame: pd.to_datetime(frame['updatedDate']),
)

In [7]:
# Check the renamed columns and the dtypes of the two converted date columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22606 entries, 0 to 22605
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   memberId          22606 non-null  int64         
 1   memberVotedAye    22606 non-null  bool          
 2   memberWasTeller   22606 non-null  bool          
 3   divisionId        22606 non-null  int64         
 4   divisionDate      22606 non-null  datetime64[ns]
 5   updatedDate       22606 non-null  datetime64[ns]
 6   nthDivision       22606 non-null  int64         
 7   divisionDeferred  22606 non-null  bool          
 8   divisiontitle     22606 non-null  object        
 9   ayeCount          22606 non-null  int64         
 10  noCount           22606 non-null  int64         
dtypes: bool(3), datetime64[ns](2), int64(5), object(1)
memory usage: 1.4+ MB


In [8]:
# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,memberId,memberVotedAye,memberWasTeller,divisionId,divisionDate,updatedDate,nthDivision,divisionDeferred,divisiontitle,ayeCount,noCount
0,2,True,False,732,2019-10-29 14:03:00,2019-10-29 20:45:45,14,False,Early Parliamentary General Election Bill (Bus...,312,295
1,2,False,False,730,2019-10-24 17:30:00,2019-10-25 10:36:13,12,False,The Queen's Speech: motion for an humble Address,310,294
2,2,True,False,728,2019-10-24 16:59:00,2019-10-24 18:08:59,10,False,The Queen's Speech: the economy - Jeremy Corby...,293,311
3,2,True,False,727,2019-10-23 18:59:00,2019-10-23 19:21:56,9,False,The Queen's Speech: the NHS - Jeremy Corbyn's ...,282,310
4,2,False,False,726,2019-10-23 00:00:00,2019-10-23 15:32:07,8,True,Exiting the European Union (Financial Services...,315,284


In [9]:
#Save the file to S3
import boto3
import io

def upload_df_to_s3(df, bucket, object_name):
    """
    Upload a DataFrame to an S3 bucket as CSV.

    The frame is serialised without its index and sent with a single
    ``put_object`` call.

    :param df: DataFrame to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name
    :return: True if the DataFrame was uploaded, else False (the client
        error is logged)
    """
    # With no target given, to_csv returns the CSV text directly, so no
    # intermediate StringIO buffer is required.
    csv_text = df.to_csv(index=False)

    s3_client = boto3.client('s3')
    # ClientError is reached through the client object so the handler does not
    # depend on a bare ClientError name that this notebook never imports.
    try:
        s3_client.put_object(Bucket=bucket, Key=object_name, Body=csv_text)
    except s3_client.exceptions.ClientError as e:
        logging.error(e)
        return False
    return True

# Destination bucket and key for the cleaned table
bucket_name, object_name = 'myukdata', 'Preprocessing/CommonDivision/CommonDivision.csv'
upload_df_to_s3(df=df, bucket=bucket_name, object_name=object_name)


True