In [34]:
import numpy as np
import cv2
from glob import glob
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import pandas as pd
from PIL import Image
import boto3
import boto3
import pandas as pd
from PIL import Image
import io

In [35]:


def read_aws_credentials(filename):
    credentials = {}
    with open(filename, 'r') as file:
        for line in file:
            key, value = line.strip().split('=')
            credentials[key] = value
    return credentials

creds_file = 'creds'
credentials = read_aws_credentials(creds_file)
aws_access_key_id = credentials.get('aws_access_key_id')
aws_secret_access_key = credentials.get('aws_secret_access_key')

s3_client = boto3.client(
    's3',
    region_name='eu-central-1',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)




In [36]:
def find_pictures(bucket_name, prefix):
    response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    photos = []
    while True:
        for obj in response.get('Contents', []):
            if obj['Key'].endswith('.jpg'):
                photos.append(obj['Key'])
        if response['IsTruncated']:
            response = s3_client.list_objects_v2(
                Bucket=bucket_name, 
                Prefix=prefix, 
                ContinuationToken=response['NextContinuationToken']
            )
        else:
            break
    return photos



In [37]:
def read_image_data_from_s3(bucket_name, male_prefix, female_prefix):
    male_keys = find_pictures(bucket_name, male_prefix)
    female_keys = find_pictures(bucket_name, female_prefix)

    def create_df(photo_keys, gender):
        data = []
        for key in photo_keys:
            obj = s3_client.get_object(Bucket=bucket_name, Key=key)
            with Image.open(io.BytesIO(obj['Body'].read())) as img:
                width, height = img.size
                size = obj['ContentLength']
                image_info = {
                    'Filename': key.split('/')[-1],
                    'Path': key,
                    'Size': size,
                    'Width': width,
                    'Height': height,
                    'Gender': gender
                }
                data.append(image_info)
        return pd.DataFrame(data)

    male_df = create_df(male_keys, 'Male')
    female_df = create_df(female_keys, 'Female')

    return male_df, female_df



In [41]:
bucket_name = 'mypicturesmalefemale'
male_df, female_df = read_image_data_from_s3(bucket_name, 'male/', 'female/')

In [None]:
male_df.head(2)

Unnamed: 0,Filename,Path,Size,Width,Height,Gender
0,male_006969.jpg,male/male_006969.jpg,15573,500,500,Male
1,male_006970.jpg,male/male_006970.jpg,4000,138,138,Male


In [None]:
female_df.head(2)

Unnamed: 0,Filename,Path,Size,Width,Height,Gender
0,female_006982.jpg,female/female_006982.jpg,6908,217,218,Female
1,female_006984.jpg,female/female_006984.jpg,4056,131,131,Female


In [None]:
import matplotlib
matplotlib.use('Agg')

num_male = len(male_df)
num_female = len(female_df)
total = num_male + num_female
male_percentage = (num_male / total) * 100
female_percentage = (num_female / total) * 100
labels = 'Male', 'Female'
sizes = [male_percentage, female_percentage]
colors = ['blue', 'magenta']  # 'magenta' is widely supported

explode = (0.1, 0)
plt.figure(figsize=(8,6))
plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
plt.title('Percentage of Male and Female Images')
plt.show()

In [None]:
import matplotlib.pyplot as plt

def plot_boxplots(male_df, female_df):
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.boxplot([male_df['Width'], female_df['Width']], labels=['Male', 'Female'])
    plt.title('Distribution of Image Widths')
    plt.ylabel('Pixels')

    # Plot for Heights
    plt.subplot(1, 2, 2)  # 1 row, 2 columns, 2nd subplot
    plt.boxplot([male_df['Height'], female_df['Height']], labels=['Male', 'Female'])
    plt.title('Distribution of Image Heights')
    plt.ylabel('Pixels')

    plt.tight_layout()
    plt.show()
plot_boxplots(male_df, female_df)

  plt.boxplot([male_df['Width'], female_df['Width']], labels=['Male', 'Female'])
  plt.boxplot([male_df['Height'], female_df['Height']], labels=['Male', 'Female'])


In [None]:
def plot_size_distribution(male_df, female_df):
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.hist(male_df['Size'], bins=20, color='blue', alpha=0.7, label='Male Heights')
    plt.title('Distribution of Male Image Heights')
    plt.xlabel('Height (pixels)')
    plt.ylabel('Frequency')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.hist(female_df['Size'], bins=20, color='pink', alpha=0.7, label='Female Heights')
    plt.title('Distribution of Female Image Heights')
    plt.xlabel('Size')
    plt.ylabel('Frequency')
    plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
def plot_width_distribution(male_df, female_df):
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.hist(male_df['Width'], bins=20, color='blue', alpha=0.7, label='Male Heights')
    plt.title('Distribution of Male Image Heights')
    plt.xlabel('Height (pixels)')
    plt.ylabel('Frequency')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.hist(female_df['Width'], bins=20, color='pink', alpha=0.7, label='Female Heights')
    plt.title('Distribution of Female Image Heights')
    plt.xlabel('width (pixels)')
    plt.ylabel('Frequency')
    plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:

plot_width_distribution(male_df, female_df)

In [None]:
def plot_height_distribution(male_df, female_df):
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.hist(male_df['Height'], bins=20, color='blue', alpha=0.7, label='Male Heights')
    plt.title('Distribution of Male Image Heights')
    plt.xlabel('Height (pixels)')
    plt.ylabel('Frequency')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.hist(female_df['Height'], bins=20, color='pink', alpha=0.7, label='Female Heights')
    plt.title('Distribution of Female Image Heights')
    plt.xlabel('Height (pixels)')
    plt.ylabel('Frequency')
    plt.legend()
    plt.tight_layout()
    plt.show()
    
plot_height_distribution(male_df, female_df)

In [None]:
def filter_by_dimension(df):
    filtered_df = df[(df['Width'] < 500) & (df['Height'] < 500)]
    return filtered_df

filtered_male_df = filter_by_dimension(male_df)
filtered_female_df = filter_by_dimension(female_df)

In [None]:

print("Filtered by Dimension - Male Images:", len(filtered_male_df))
print("Filtered by Dimension - Female Images:", len(filtered_female_df))

Filtered by Dimension - Male Images: 8
Filtered by Dimension - Female Images: 11


In [None]:
filtered_male_df

Unnamed: 0,Filename,Path,Size,Width,Height,Gender
1,male_006970.jpg,male/male_006970.jpg,4000,138,138,Male
2,male_006971.jpg,male/male_006971.jpg,3157,114,114,Male
3,male_006973.jpg,male/male_006973.jpg,8698,263,262,Male
5,male_006975.jpg,male/male_006975.jpg,14995,295,295,Male
6,male_006976.jpg,male/male_006976.jpg,22140,398,397,Male
7,male_006978.jpg,male/male_006978.jpg,8580,198,197,Male
8,male_006979.jpg,male/male_006979.jpg,13018,282,282,Male
9,male_006980.jpg,male/male_006980.jpg,25109,445,444,Male


In [None]:
def resize_data_s3(bucket_name, key, s3_client, upload_resized=False):
    try:
        obj = s3_client.get_object(Bucket=bucket_name, Key=key)
        img_data = obj['Body'].read()

        if not img_data:
            print(f"No data retrieved for key {key}")
            return None
        img = Image.open(io.BytesIO(img_data))
        img_format = img.format  # Keep the original image format

        img_np = np.array(img)

        if len(img_np.shape) == 2:
            gray_img = img_np
        elif len(img_np.shape) == 3:
            gray_img = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
        else:
            print(f"Unexpected image shape {img_np.shape} for key {key}")
            return None

        size = gray_img.shape[0]
        if size >= 100:
            resized_img = cv2.resize(gray_img, (100, 100), interpolation=cv2.INTER_AREA)
        else:
            resized_img = cv2.resize(gray_img, (100, 100), interpolation=cv2.INTER_CUBIC)
        flat_img = resized_img.flatten()
        if upload_resized:
            resized_pil_img = Image.fromarray(resized_img)
            buffer = io.BytesIO()
            resized_pil_img.save(buffer, format=img_format)
            buffer.seek(0)
            new_key = 'resized/' + key.split('/')[-1] 
            s3_client.put_object(Bucket=bucket_name, Key=new_key, Body=buffer.getvalue(), ContentType='image/jpeg')

        return flat_img

    except Exception as e:
        print(f"Error processing key {key}: {e}")
        return None


In [None]:
# Process male images
filtered_male_df['data'] = filtered_male_df['Path'].apply(lambda key: resize_data_s3(bucket_name, key, s3_client, upload_resized=True))

# Process female images
filtered_female_df['data'] = filtered_female_df['Path'].apply(lambda key: resize_data_s3(bucket_name, key, s3_client, upload_resized=True))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_male_df['data'] = filtered_male_df['Path'].apply(lambda key: resize_data_s3(bucket_name, key, s3_client, upload_resized=True))


In [None]:
filtered_male_df['data'].head()

1    [192, 189, 185, 184, 186, 188, 188, 186, 186, ...
2    [111, 112, 118, 118, 119, 122, 113, 111, 116, ...
3    [204, 204, 204, 203, 203, 203, 203, 203, 203, ...
5    [234, 237, 232, 240, 234, 236, 234, 233, 232, ...
6    [13, 13, 14, 15, 19, 21, 20, 21, 21, 23, 22, 2...
Name: data, dtype: object

In [None]:
df = pd.concat([filtered_female_df, filtered_male_df], axis=0)
data_df = df.sample(frac=1).reset_index(drop=True)

In [None]:
data =data_df["data"].apply(pd.Series)

In [None]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,192,189,185,184,186,188,188,186,186,187,...,96,99,90,87,84,91,102,107,105,102
1,151,220,233,233,233,236,235,235,236,236,...,50,46,91,68,50,56,58,53,50,58
2,234,237,232,240,234,236,234,233,232,234,...,30,34,73,152,57,40,59,64,64,64
3,196,196,196,196,197,197,197,198,198,198,...,163,156,146,125,105,116,110,105,103,103
4,117,122,216,228,226,227,209,128,115,115,...,106,104,103,102,103,101,102,102,102,102


In [None]:
data.max().max()

np.uint8(255)

In [None]:
data = data/data.max().max()
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,0.752941,0.741176,0.72549,0.721569,0.729412,0.737255,0.737255,0.729412,0.729412,0.733333,...,0.376471,0.388235,0.352941,0.341176,0.329412,0.356863,0.4,0.419608,0.411765,0.4
1,0.592157,0.862745,0.913725,0.913725,0.913725,0.92549,0.921569,0.921569,0.92549,0.92549,...,0.196078,0.180392,0.356863,0.266667,0.196078,0.219608,0.227451,0.207843,0.196078,0.227451
2,0.917647,0.929412,0.909804,0.941176,0.917647,0.92549,0.917647,0.913725,0.909804,0.917647,...,0.117647,0.133333,0.286275,0.596078,0.223529,0.156863,0.231373,0.25098,0.25098,0.25098
3,0.768627,0.768627,0.768627,0.768627,0.772549,0.772549,0.772549,0.776471,0.776471,0.776471,...,0.639216,0.611765,0.572549,0.490196,0.411765,0.454902,0.431373,0.411765,0.403922,0.403922
4,0.458824,0.478431,0.847059,0.894118,0.886275,0.890196,0.819608,0.501961,0.45098,0.45098,...,0.415686,0.407843,0.403922,0.4,0.403922,0.396078,0.4,0.4,0.4,0.4


In [None]:
data["Gender"] = data_df["Gender"]
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,Gender
0,0.752941,0.741176,0.72549,0.721569,0.729412,0.737255,0.737255,0.729412,0.729412,0.733333,...,0.388235,0.352941,0.341176,0.329412,0.356863,0.4,0.419608,0.411765,0.4,Male
1,0.592157,0.862745,0.913725,0.913725,0.913725,0.92549,0.921569,0.921569,0.92549,0.92549,...,0.180392,0.356863,0.266667,0.196078,0.219608,0.227451,0.207843,0.196078,0.227451,Female
2,0.917647,0.929412,0.909804,0.941176,0.917647,0.92549,0.917647,0.913725,0.909804,0.917647,...,0.133333,0.286275,0.596078,0.223529,0.156863,0.231373,0.25098,0.25098,0.25098,Male
3,0.768627,0.768627,0.768627,0.768627,0.772549,0.772549,0.772549,0.776471,0.776471,0.776471,...,0.611765,0.572549,0.490196,0.411765,0.454902,0.431373,0.411765,0.403922,0.403922,Female
4,0.458824,0.478431,0.847059,0.894118,0.886275,0.890196,0.819608,0.501961,0.45098,0.45098,...,0.407843,0.403922,0.4,0.403922,0.396078,0.4,0.4,0.4,0.4,Female


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Columns: 10001 entries, 0 to Gender
dtypes: float64(10000), object(1)
memory usage: 1.4+ MB


###  Writing Dataset in Snowflake 

In [49]:
import pandas as pd
import json
import os
import snowflake.snowpark.functions as F
from snowflake.snowpark import Session
from dotenv import load_dotenv
from snowflake.connector.pandas_tools import write_pandas
import snowflake.connector

load_dotenv(".env")

True

In [50]:
conn = snowflake.connector.connect(
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=os.getenv("SNOWFLAKE_DATABASE"),
    schema=os.getenv("SNOWFLAKE_SCHEMA"),
    role=os.getenv("SNOWFLAKE_ROLE")
)

In [51]:
data.columns = [col if col == 'Gender' else f'column{i}' for i, col in enumerate(data.columns)]
data.head()

Unnamed: 0,column0,column1,column2,column3,column4,column5,column6,column7,column8,column9,...,column9991,column9992,column9993,column9994,column9995,column9996,column9997,column9998,column9999,Gender
0,0.752941,0.741176,0.72549,0.721569,0.729412,0.737255,0.737255,0.729412,0.729412,0.733333,...,0.388235,0.352941,0.341176,0.329412,0.356863,0.4,0.419608,0.411765,0.4,Male
1,0.592157,0.862745,0.913725,0.913725,0.913725,0.92549,0.921569,0.921569,0.92549,0.92549,...,0.180392,0.356863,0.266667,0.196078,0.219608,0.227451,0.207843,0.196078,0.227451,Female
2,0.917647,0.929412,0.909804,0.941176,0.917647,0.92549,0.917647,0.913725,0.909804,0.917647,...,0.133333,0.286275,0.596078,0.223529,0.156863,0.231373,0.25098,0.25098,0.25098,Male
3,0.768627,0.768627,0.768627,0.768627,0.772549,0.772549,0.772549,0.776471,0.776471,0.776471,...,0.611765,0.572549,0.490196,0.411765,0.454902,0.431373,0.411765,0.403922,0.403922,Female
4,0.458824,0.478431,0.847059,0.894118,0.886275,0.890196,0.819608,0.501961,0.45098,0.45098,...,0.407843,0.403922,0.4,0.403922,0.396078,0.4,0.4,0.4,0.4,Female


In [52]:
try:
    
    success, nchunks, nrows, _ = write_pandas(conn, data, 'DATAPICTURE',
                                              auto_create_table=True,
                                              quote_identifiers=True)
    print(f"Data written successfully: {nrows} rows in {nchunks} chunks.")
except Exception as e:
    print(f"An error occurred: {str(e)}")
finally:

    conn.close()

Data written successfully: 19 rows in 1 chunks.


### Writing Dataset in AWS

In [None]:
s3_resource = boto3.resource('s3', region_name='eu-central-1', aws_access_key_id=aws_access_key_id,  aws_secret_access_key=aws_secret_access_key)
bucket = 'mypicturesmalefemale'  
csv_buffer = StringIO()
data.to_csv(csv_buffer)
s3_resource.Object(bucket, 'data.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '7MQ16FW44QNWZMPB',
  'HostId': 'L6quobUdoFCGq6bNAtOs7SzKcS0y318OIs985XgsVWuCbw3jLi/qh21nkP+DZJvbwaZm6IOLYBc=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'L6quobUdoFCGq6bNAtOs7SzKcS0y318OIs985XgsVWuCbw3jLi/qh21nkP+DZJvbwaZm6IOLYBc=',
   'x-amz-request-id': '7MQ16FW44QNWZMPB',
   'date': 'Fri, 13 Sep 2024 19:24:40 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"3514ffd7ddd5d5b613c922038cbd1efa"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"3514ffd7ddd5d5b613c922038cbd1efa"',
 'ServerSideEncryption': 'AES256'}