In [1]:
import boto3
import pandas as pd
import numpy as np

In [2]:
def download_file_from_s3(bucket, object_name, local_file_name):
    """
    Copy a single object out of S3 onto the local disk.

    :param bucket: Name of the S3 bucket that holds the object
    :param object_name: Key of the object inside the bucket
    :param local_file_name: Path the downloaded bytes are written to
    """
    # A default Session is used for locally configured credentials. Per the
    # original author's note, when running under an IAM role a plain
    # boto3.client('s3') works as well.
    client = boto3.Session().client('s3')
    client.download_file(bucket, object_name, local_file_name)

# Fetch the raw "other posts" CSV from the bucket into the working directory.
bucket_name = 'myukdata'
s3_file_name = 'Original/Member/Member_OtherPosts.csv'
local_file = 'Member_OtherPosts.csv'
download_file_from_s3(
    bucket=bucket_name,
    object_name=s3_file_name,
    local_file_name=local_file,
)

In [3]:
# Lift the row cap on rendered frames/series (this is a global pandas option).
pd.options.display.max_rows = None

# Read the downloaded CSV, print its schema, then show the first rows.
df = pd.read_csv('Member_OtherPosts.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   memberID            613 non-null    int64  
 1   house               562 non-null    float64
 2   name                613 non-null    object 
 3   id                  613 non-null    int64  
 4   startDate           613 non-null    object 
 5   endDate             542 non-null    object 
 6   additionalInfo      0 non-null      float64
 7   additionalInfoLink  0 non-null      float64
dtypes: float64(3), int64(2), object(3)
memory usage: 38.4+ KB


Unnamed: 0,memberID,house,name,id,startDate,endDate,additionalInfo,additionalInfoLink
0,7,1.0,"Deputy Chair, Conservative Party",766,2004-09-01T00:00:00,2005-11-01T00:00:00,,
1,8,1.0,Leader of the Conservative Party,787,2016-07-11T00:00:00,2019-06-07T00:00:00,,
2,8,1.0,"Party Chair, Conservative Party",805,2002-07-23T00:00:00,2003-12-06T00:00:00,,
3,11,1.0,"Vice-Chair, Labour Party",824,2007-06-28T00:00:00,2010-05-06T00:00:00,,
4,13,1.0,"Deputy Chair, Conservative Party",766,1995-07-01T00:00:00,1998-07-01T00:00:00,,


In [4]:
# Drop the columns this preprocessing does not keep and parse both date
# columns into pandas datetimes. The transformation is built as one chained
# expression and only then bound to `df`, so a failure while parsing dates
# cannot leave `df` with the columns dropped but the dates unconverted.
df = (
    df
    .drop(columns=['house', 'id', 'additionalInfo', 'additionalInfoLink'])
    .assign(
        startDate=lambda frame: pd.to_datetime(frame['startDate']),
        endDate=lambda frame: pd.to_datetime(frame['endDate']),
    )
)

In [5]:
# Re-inspect the schema and first rows after the column drop and the
# datetime conversion.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   memberID   613 non-null    int64         
 1   name       613 non-null    object        
 2   startDate  613 non-null    datetime64[ns]
 3   endDate    542 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 19.3+ KB


Unnamed: 0,memberID,name,startDate,endDate
0,7,"Deputy Chair, Conservative Party",2004-09-01,2005-11-01
1,8,Leader of the Conservative Party,2016-07-11,2019-06-07
2,8,"Party Chair, Conservative Party",2002-07-23,2003-12-06
3,11,"Vice-Chair, Labour Party",2007-06-28,2010-05-06
4,13,"Deputy Chair, Conservative Party",1995-07-01,1998-07-01


In [6]:
# Rename the member id and post name columns. The result is rebound to `df`
# instead of using inplace=True, which has no performance benefit and
# prevents method chaining.
df = df.rename(columns={'memberID': 'memberId',
                        'name': 'postName'})

In [7]:
# Check the column names after the rename.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   memberId   613 non-null    int64         
 1   postName   613 non-null    object        
 2   startDate  613 non-null    datetime64[ns]
 3   endDate    542 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 19.3+ KB


In [8]:
# Display the post names.
# NOTE(review): display.max_rows was set to None when the CSV was loaded, so
# this renders every row of the series; consider .head() or .value_counts().
df['postName']

0                       Deputy Chair, Conservative Party
1                       Leader of the Conservative Party
2                        Party Chair, Conservative Party
3                               Vice-Chair, Labour Party
4                       Deputy Chair, Conservative Party
5                        Speaker of the House of Commons
6      Chair, Parliamentary Office of Science and Tec...
7                         Vice-Chair, Conservative Party
8                        Party Chair, Conservative Party
9                       Deputy Chair, Conservative Party
10                        Vice-Chair, Conservative Party
11         Deputy Speaker and Chairman of Ways and Means
12     Deputy Speaker (First Deputy Chairman of Ways ...
13         Vice-Chair, Conservative Party 1922 Committee
14                      Deputy Chair, Conservative Party
15     Chair, Parliamentary Office of Science and Tec...
16         Deputy Speaker and Chairman of Ways and Means
17         Deputy Speaker and C

In [9]:
#Save the file to S3
import boto3
import io

def upload_df_to_s3(df, bucket, object_name):
    """
    Upload a DataFrame to an S3 bucket as CSV.

    The frame is serialised to CSV text without its index and sent with a
    single ``put_object`` call.

    :param df: DataFrame to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name (key)
    :return: True if the DataFrame was uploaded, else False
    """
    # The notebook never imports these names at cell level, so without these
    # imports the except clause below would raise NameError on any S3 failure
    # instead of logging the error and returning False.
    import logging
    from botocore.exceptions import ClientError

    # With no target given, to_csv returns the CSV content as a string, so no
    # intermediate buffer (or seek) is needed.
    csv_text = df.to_csv(index=False)

    # Upload the CSV text to S3
    s3_client = boto3.client('s3')
    try:
        s3_client.put_object(Bucket=bucket, Key=object_name, Body=csv_text)
    except ClientError as e:
        logging.error(e)
        return False
    return True

# Write the cleaned frame to the preprocessing area of the bucket; the
# call's boolean result is the cell's displayed value.
bucket_name = 'myukdata'
object_name = 'Preprocessing/Member/MemberOtherPosts.csv'
upload_df_to_s3(df, bucket=bucket_name, object_name=object_name)


True