# CT1-MLOps Course Group Assignment


In [1]:
import pandas as pd
import numpy as np

In [5]:
# Read the real-estate valuation workbook into a DataFrame.
excel_path = r'C:\Users\neeti\Documents\ISB_Class of Summer_2025\02 Term 2\CT 1\Real estate valuation data set.xlsx'
data = pd.read_excel(excel_path)

In [7]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [8]:
# Dimensions of the loaded frame, shown as (rows, columns).
n_rows, n_cols = data.shape
(n_rows, n_cols)

(414, 8)

## 2. Dataset Schema and Storage

In [22]:
import pyarrow as pa
import pyarrow.parquet as pq

# Column layout for the Parquet file, one entry per column:
# (column name, Arrow data type, whether nulls are allowed).
column_specs = [
    ("No", pa.int32(), False),
    ("X1 transaction date", pa.float64(), True),
    ("X2 house age", pa.float32(), True),
    ("X3 distance to the nearest MRT station", pa.float64(), True),
    ("X4 number of convenience stores", pa.int32(), False),
    ("X5 latitude", pa.float64(), False),
    ("X6 longitude", pa.float64(), False),
    ("Y house price of unit area", pa.float64(), False),
]
schema = pa.schema(
    [pa.field(name, dtype, nullable=nullable) for name, dtype, nullable in column_specs]
)

# Destination of the Parquet copy of the dataset.
output_file = r"C:\Users\neeti\Documents\ISB_Class of Summer_2025\02 Term 2\CT 1\real_estate_full_dataset.parquet"

# Build an Arrow table from the DataFrame using the declared schema, then
# write it out in Parquet format.
table = pa.Table.from_pandas(data, schema=schema)
pq.write_table(table, output_file)

print(f"Full dataset saved in Parquet format at '{output_file}'.")


Full dataset saved in Parquet format at 'C:\Users\neeti\Documents\ISB_Class of Summer_2025\02 Term 2\CT 1\real_estate_full_dataset.parquet'.


In [23]:
import pandas as pd

# Load the Parquet file from the exact location it was written to.
# A bare relative filename is resolved against the kernel's working directory,
# which in this session is a different folder from the one the file was saved
# in, so it could silently pick up a stale copy or fail on a fresh run.
parquet_file = r"C:\Users\neeti\Documents\ISB_Class of Summer_2025\02 Term 2\CT 1\real_estate_full_dataset.parquet"
df = pd.read_parquet(parquet_file)

# Display the first few rows
df.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


## 3. Profiling the Dataset

In [24]:
pip install ydata-profiling

Note: you may need to restart the kernel to use updated packages.


In [25]:
from ydata_profiling import ProfileReport

In [26]:
# Load the Parquet dataset from the absolute location it was saved to.
# A bare relative filename depends on the kernel's working directory, which is
# not the folder the Parquet file was written into, so the profile could end
# up describing a stale copy of the data.
parquet_file = r"C:\Users\neeti\Documents\ISB_Class of Summer_2025\02 Term 2\CT 1\real_estate_full_dataset.parquet"
df = pd.read_parquet(parquet_file)

# Generate the profile report
profile = ProfileReport(df, title="Real Estate Dataset Profile (Parquet)", explorative=True)

# Save the report to an HTML file
# NOTE(review): `output_file` also names the Parquet destination in an earlier
# cell; after this cell runs it refers to the HTML report instead.
output_file = r"C:\Users\neeti\Documents\ISB_Class of Summer_2025\02 Term 2\CT 1\real_estate_parquet_profile.html"
profile.to_file(output_file)

print(f"Profile report generated and saved at '{output_file}'.")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8
 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
 9 9 9 9 9 9 9]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  discretized_df.loc[:, column] = self._discretize_column(
 0 5 4 0 0 4 5 3 7 8 2 0 0 3 0 2 4 4 0 7 6 0 4 1 5 2 3 0 7 5 4 5 4 8 2 7 0
 9 0 2 2 1 0

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profile report generated and saved at 'C:\Users\neeti\Documents\ISB_Class of Summer_2025\02 Term 2\CT 1\real_estate_parquet_profile.html'.


In [27]:
import os

# Show the directory that relative paths in this notebook resolve against.
current_dir = os.getcwd()
print(current_dir)

C:\Users\neeti\Documents\ISB_Class of Summer_2025\02 Term 2\MLSL2
