In [1]:
import gc
import os

import pandas as pd
import psutil
import pyarrow as pa

In [3]:
# Capture and print the kernel's current working directory.
current_directory = os.getcwd()
print("Current working directory:", current_directory)

Current working directory: C:\Users\KonuTech\zoomcamp-capstone-01\notebooks


In [4]:
# Shell escape: long listing of the data folder, including hidden entries, with human-readable sizes.
# NOTE(review): the path is an absolute, user-specific Windows path — it will not resolve on another machine.
!ls -lah "C:\Users\KonuTech\zoomcamp-capstone-01\data"

total 42G
drwxr-xr-x 1 KonuTech 197121    0 Oct 28 21:37 .
drwxr-xr-x 1 KonuTech 197121    0 Oct 28 21:21 ..
drwxr-xr-x 1 KonuTech 197121    0 Oct 28 21:45 parquet_partitions
-rw-r--r-- 1 KonuTech 197121  60M May 20  2022 sample_submission.csv
-rw-r--r-- 1 KonuTech 197121  32G May 20  2022 test_data.csv
-rw-r--r-- 1 KonuTech 197121  16G May 20  2022 train_data.csv
-rw-r--r-- 1 KonuTech 197121 6.5G Oct 28 21:42 train_data_combined.parquet
-rw-r--r-- 1 KonuTech 197121  30M May 20  2022 train_labels.csv


In [5]:
# Helper used to report how much memory the process is holding
def get_memory_usage():
    """Return the resident set size (RSS) reported by psutil, in megabytes.

    `psutil.Process()` is called without a pid, which psutil resolves to the
    current process.
    """
    bytes_per_megabyte = 1024 * 1024
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / bytes_per_megabyte

In [6]:
# Absolute location of the project's data folder, assembled component by component
data_dir = os.path.join('C:\\', *('Users', 'KonuTech', 'zoomcamp-capstone-01', 'data'))
data_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data'

In [7]:
# Step 1: Name the CSV source and the number of rows to read per chunk
csv_file, chunk_size = 'train_data.csv', 10_000  # tune chunk_size as needed

In [8]:
# Folder inside data_dir that receives one Parquet file per CSV chunk;
# exist_ok=True means an already-present folder is not an error.
parquet_dir = os.path.join(data_dir, 'parquet_partitions')
os.makedirs(name=parquet_dir, exist_ok=True)

In [10]:
# Step 2: Start with no recorded partitions; the chunking loop appends each
# Parquet partition file path to this list.
parquet_file_paths = list()

In [None]:
# Stream the CSV in chunks of `chunk_size` rows and write each chunk to its own
# Parquet partition, reporting process memory around every chunk.

# Take the baseline before the reader is created: inside the loop body the
# chunk has already been parsed, so sampling there would miss the cost of
# reading it.
before_memory = get_memory_usage()

for i, chunk in enumerate(pd.read_csv(os.path.join(data_dir, csv_file), chunksize=chunk_size)):
    # Save the chunk as a Parquet partition
    parquet_partition_file = os.path.join(parquet_dir, f'chunk_{i}.parquet')
    chunk.to_parquet(parquet_partition_file)

    # Record the partition path only once. Re-running this cell without
    # resetting the list would otherwise register the same file twice, and
    # later steps iterate over this list to read and delete partitions.
    if parquet_partition_file not in parquet_file_paths:
        parquet_file_paths.append(parquet_partition_file)

    # Display memory usage before reading and after saving the chunk
    after_memory = get_memory_usage()
    print(f"Memory usage before chunk: {before_memory:.2f} MB")
    print(f"Memory usage after chunk: {after_memory:.2f} MB")

    # The next chunk is read when the loop advances, so the current reading
    # is its "before" baseline.
    before_memory = after_memory

In [None]:
# Step 3: Load every recorded partition, then stack them into one DataFrame
# with a freshly numbered index (ignore_index=True).
parquet_partitions = list(map(pd.read_parquet, parquet_file_paths))
df = pd.concat(parquet_partitions, ignore_index=True)

In [13]:
# Step 4: File name (relative to data_dir) under which the combined DataFrame is saved
combined_parquet_file = "train_data_combined.parquet"

In [None]:
# Persist the combined DataFrame as a single Parquet file inside data_dir.
df.to_parquet(
    os.path.join(data_dir, combined_parquet_file)
)

In [11]:
# Step 5: Remove individual Parquet partitions
# Deletes every file recorded in parquet_file_paths.
for partition_file in parquet_file_paths:
    try:
        os.remove(partition_file)
    except FileNotFoundError:
        # Already deleted (e.g. this cell was run before); keep going so the
        # remaining partitions are still cleaned up instead of aborting here.
        continue

In [14]:
# Step 6: EDA on the combined Parquet file
# Read the combined Parquet file back from data_dir into a DataFrame.
parquet_df = pd.read_parquet(
    os.path.join(data_dir, combined_parquet_file)
)

In [15]:
# Column dtypes followed by summary statistics, each on its own line.
print(parquet_df.dtypes, parquet_df.describe(), sep="\n")

customer_ID     object
S_2             object
P_2            float64
D_39           float64
B_1            float64
                ...   
D_141          float64
D_142          float64
D_143          float64
D_144          float64
D_145          float64
Length: 190, dtype: object
                P_2          D_39           B_1           B_2           R_1  \
count  5.485466e+06  5.531451e+06  5.531451e+06  5.529435e+06  5.531451e+06   
mean   6.563340e-01  1.531172e-01  1.240100e-01  6.214887e-01  7.880270e-02   
std    2.446494e-01  2.700709e-01  2.119869e-01  4.014877e-01  2.263971e-01   
min   -4.589548e-01  5.026190e-09 -7.588799e+00  9.192280e-09  1.534223e-09   
25%    4.803307e-01  4.528464e-03  8.863645e-03  1.053313e-01  2.895934e-03   
50%    6.942950e-01  9.056902e-03  3.132968e-02  8.143328e-01  5.782230e-03   
75%    8.648159e-01  2.366407e-01  1.259019e-01  1.002403e+00  8.660590e-03   
max    1.010000e+00  5.389619e+00  1.324060e+00  1.010000e+00  3.256284e+00   

        

In [16]:
# Trigger garbage collection to clear unreferenced objects.
# As the last expression in the cell, the integer returned by gc.collect()
# is what the notebook displays as the cell output.
gc.collect()

195

In [18]:
# Print the frame summary; verbose=True lists every column with its dtype
# rather than a truncated overview.
parquet_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Data columns (total 190 columns):
 #    Column       Dtype  
---   ------       -----  
 0    customer_ID  object 
 1    S_2          object 
 2    P_2          float64
 3    D_39         float64
 4    B_1          float64
 5    B_2          float64
 6    R_1          float64
 7    S_3          float64
 8    D_41         float64
 9    B_3          float64
 10   D_42         float64
 11   D_43         float64
 12   D_44         float64
 13   B_4          float64
 14   D_45         float64
 15   B_5          float64
 16   R_2          float64
 17   D_46         float64
 18   D_47         float64
 19   D_48         float64
 20   D_49         float64
 21   B_6          float64
 22   B_7          float64
 23   B_8          float64
 24   D_50         float64
 25   D_51         float64
 26   B_9          float64
 27   R_3          float64
 28   D_52         float64
 29   P_3          float64
 30   B_10         flo