Imports & boto3 Client

In [3]:
import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


Initialize S3 client (uses your AWS CLI credentials):

In [2]:
s3 = boto3.client("s3")


2. Download Raw Dataset from S3

In [6]:
bucket_name = "aws-practice-visualpath"
key = "housing/house_new_price.csv"
local_raw_file = "raw_housing.csv"

s3.download_file(bucket_name, key, local_raw_file)
print("Downloaded raw file from S3.")


Downloaded raw file from S3.


3. Load Data into Pandas

In [7]:
df = pd.read_csv(local_raw_file)
print("Raw Shape:", df.shape)
df.head()


Raw Shape: (545, 3)


Unnamed: 0.1,Unnamed: 0,price,area
0,0,13300000,7420
1,1,12250000,8960
2,2,12250000,9960
3,3,12215000,7500
4,4,11410000,7420


4. Remove Duplicates

In [8]:
df = df.drop_duplicates().reset_index(drop=True)
print("After removing duplicates:", df.shape)


After removing duplicates: (545, 3)


5. Outlier Removal (IQR Method)

In [10]:
def remove_outliers(df, cols):
    for col in cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df

numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns
df = remove_outliers(df, numeric_cols)

print("Shape after outlier removal:", df.shape)


Shape after outlier removal: (517, 3)


6. Encoding Categorical Columns

In [11]:
label_encoder = LabelEncoder()

for col in df.select_dtypes(include=["object"]).columns:
    df[col] = label_encoder.fit_transform(df[col])


7. Feature Scaling

In [12]:
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])


8. Save Cleaned File Locally

In [13]:
cleaned_file = "cleaned_housing.csv"
df.to_csv(cleaned_file, index=False)
print("Saved cleaned dataset locally.")


Saved cleaned dataset locally.


9. Upload Cleaned Dataset Back to S3

In [14]:
cleaned_key = "processed/cleaned_housing.csv"

s3.upload_file(cleaned_file, bucket_name, cleaned_key)
print("Uploaded cleaned dataset back to S3.")


Uploaded cleaned dataset back to S3.


10. Start Athena Query

In [21]:
athena = boto3.client("athena")

query = """
SELECT * FROM house_price.housing_cleaned LIMIT 10;
"""

output_location = "s3://aws-practice-visualpath/"



In [22]:
response = athena.start_query_execution(
    QueryString=query,
    QueryExecutionContext={'Database': 'house_price'},
    ResultConfiguration={'OutputLocation': output_location}
)

query_id = response["QueryExecutionId"]
print("Query ID:", query_id)


Query ID: 608d59d5-ef4d-42fc-b566-3848ee50f3f9


11. Poll Until Query Completes

In [23]:
import time

while True:
    status = athena.get_query_execution(QueryExecutionId=query_id)
    state = status["QueryExecution"]["Status"]["State"]

    if state in ["SUCCEEDED", "FAILED", "CANCELLED"]:
        break

    print("Query running...")
    time.sleep(2)

print("Final state:", state)


Final state: SUCCEEDED


12. Get Query Results

In [24]:
if state == "SUCCEEDED":
    results = athena.get_query_results(QueryExecutionId=query_id)
    for row in results["ResultSet"]["Rows"]:
        print(row)


{'Data': [{'VarCharValue': 'price'}, {'VarCharValue': 'area'}, {'VarCharValue': 'bedrooms'}, {'VarCharValue': 'bathrooms'}, {'VarCharValue': 'stories'}, {'VarCharValue': 'mainroad'}, {'VarCharValue': 'guestroom'}, {'VarCharValue': 'basement'}, {'VarCharValue': 'hotwaterheating'}, {'VarCharValue': 'airconditioning'}, {'VarCharValue': 'parking'}, {'VarCharValue': 'prefarea'}, {'VarCharValue': 'furnishingstatus'}]}
{'Data': [{'VarCharValue': ''}, {'VarCharValue': 'price'}, {'VarCharValue': 'area'}, {'VarCharValue': 'bedrooms'}, {'VarCharValue': 'bathrooms'}, {'VarCharValue': 'stories'}, {'VarCharValue': 'mainroad'}, {'VarCharValue': 'guestroom'}, {'VarCharValue': 'basement'}, {'VarCharValue': 'hotwaterheating'}, {'VarCharValue': 'airconditioning'}, {'VarCharValue': 'parking'}, {'VarCharValue': 'prefarea'}]}
{'Data': [{'VarCharValue': '15'}, {'VarCharValue': '4.05869499602107'}, {'VarCharValue': '0.7994747830761767'}, {'VarCharValue': '2.017626274203175'}, {'VarCharValue': '0.0'}, {'VarCha