In [1]:
import logging

import boto3
import numpy as np
import pandas as pd

In [2]:
def download_file_from_s3(bucket, object_name, local_file_name):
    """
    Fetch one object from S3 and write it to the local file system.

    :param bucket: Name of the S3 bucket holding the object
    :param object_name: S3 object name (key) to fetch
    :param local_file_name: Local file name the downloaded content is saved under
    """
    # An explicit Session is used for locally configured credentials.
    # Under an IAM role, calling boto3.client('s3') directly is the alternative
    # (boto3 then retrieves credentials from the instance metadata).
    s3 = boto3.Session().client('s3')
    s3.download_file(bucket, object_name, local_file_name)

# Location of the raw export in S3 and the local path it is copied to
bucket_name = 'myukdata'
s3_file_name = 'Original/Member/Member_Experience.csv'
local_file = 'Member_Experience.csv'

download_file_from_s3(
    bucket=bucket_name,
    object_name=s3_file_name,
    local_file_name=local_file,
)

In [3]:
# Load the downloaded CSV and print its schema summary
df = pd.read_csv(filepath_or_buffer='Member_Experience.csv')
df.info()

# Lift pandas' row-display cap (a session-wide option), then preview the first rows
pd.set_option('display.max_rows', None)
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6282 entries, 0 to 6281
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   memberID      6282 non-null   int64  
 1   id            6282 non-null   int64  
 2   type          6282 non-null   object 
 3   typeId        6282 non-null   int64  
 4   title         6282 non-null   object 
 5   organisation  6175 non-null   object 
 6   startMonth    68 non-null     float64
 7   startYear     6227 non-null   float64
 8   endMonth      36 non-null     float64
 9   endYear       4253 non-null   float64
dtypes: float64(4), int64(3), object(3)
memory usage: 490.9+ KB


Unnamed: 0,memberID,id,type,typeId,title,organisation,startMonth,startYear,endMonth,endYear
0,43,6469,Political,3,Chair,West Anglia Task Force,,2015.0,,
1,43,5106,Public life,2,Chairman,Community Development Foundation,,1986.0,,1997.0
2,43,5107,Public life,2,Governor,Felsted School,,1985.0,,2012.0
3,43,5098,Political,3,Deputy Chairman,Conservative Group for Europe,,1982.0,,1985.0
4,43,5105,Public life,2,Chairman,Commonwealth Youth Exchange Council,,1978.0,,1981.0


In [4]:
# Show the first rows of the loaded frame again (head() defaults to 5 rows)
df.head()

Unnamed: 0,memberID,id,type,typeId,title,organisation,startMonth,startYear,endMonth,endYear
0,43,6469,Political,3,Chair,West Anglia Task Force,,2015.0,,
1,43,5106,Public life,2,Chairman,Community Development Foundation,,1986.0,,1997.0
2,43,5107,Public life,2,Governor,Felsted School,,1985.0,,2012.0
3,43,5098,Political,3,Deputy Chairman,Conservative Group for Europe,,1982.0,,1985.0
4,43,5105,Public life,2,Chairman,Commonwealth Youth Exchange Council,,1978.0,,1981.0


In [5]:
# Remove the typeId, id, startMonth and endMonth columns and rename
# memberID -> memberId.
# The result is rebound to `df` rather than mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this cell
# is a no-op instead of raising KeyError.
df = (
    df.drop(columns=['typeId', 'id', 'startMonth', 'endMonth'], errors='ignore')
      .rename(columns={'memberID': 'memberId'})
)

In [6]:
# Print the column summary after the drop/rename in the previous cell
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6282 entries, 0 to 6281
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   memberId      6282 non-null   int64  
 1   type          6282 non-null   object 
 2   title         6282 non-null   object 
 3   organisation  6175 non-null   object 
 4   startYear     6227 non-null   float64
 5   endYear       4253 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 294.6+ KB


In [7]:
#Save the file to S3
import boto3
import io

def upload_df_to_s3(df, bucket, object_name):
    """
    Upload a DataFrame to an S3 bucket as a CSV object.

    The frame is serialised in memory, without its index, and sent with a
    single ``put_object`` call.

    :param df: DataFrame to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name (key)
    :return: True if the DataFrame was uploaded, else False (the S3 client
        error is logged)
    """
    # With no target path, to_csv returns the CSV text directly; encode it so
    # put_object receives bytes.
    csv_bytes = df.to_csv(index=False).encode('utf-8')

    s3_client = boto3.client('s3')
    try:
        s3_client.put_object(Bucket=bucket, Key=object_name, Body=csv_bytes)
    # ClientError is taken from the client's own `exceptions` namespace: the
    # bare name `ClientError` was never imported, so a failed upload used to
    # raise NameError instead of returning False.
    except s3_client.exceptions.ClientError as e:
        logging.error(e)
        return False
    return True

# Destination of the preprocessed member-experience CSV
bucket_name = 'myukdata'
object_name = 'Preprocessing/Member/MemberExperience.csv'

upload_df_to_s3(df=df, bucket=bucket_name, object_name=object_name)


True