Data source (Kaggle competition "amex-default-prediction"): https://www.kaggle.com/competitions/amex-default-prediction/data

In [1]:
import gc
import os
import psutil
import pyarrow as pa
import pandas as pd

In [2]:
# Record and print the kernel's working directory
current_directory = os.getcwd()
print("Current working directory:", current_directory)

Current working directory: C:\Users\KonuTech\zoomcamp-capstone-01\notebooks


In [3]:
# Shell escape: list the data directory (long format, hidden entries included, human-readable sizes)
!ls -lah "C:\Users\KonuTech\zoomcamp-capstone-01\data"

total 43G
drwxr-xr-x 1 KonuTech 197121    0 Oct 30 01:25 .
drwxr-xr-x 1 KonuTech 197121    0 Oct 29 19:44 ..
drwxr-xr-x 1 KonuTech 197121    0 Oct 29 22:46 parquet_partitions
-rw-r--r-- 1 KonuTech 197121  60M May 20  2022 sample_submission.csv
-rw-r--r-- 1 KonuTech 197121  32G May 20  2022 test_data.csv
-rw-r--r-- 1 KonuTech 197121  16G May 20  2022 train_data.csv
-rw-r--r-- 1 KonuTech 197121 6.7G Oct 30 01:25 train_data.csv.zip
-rw-r--r-- 1 KonuTech 197121 582M Oct 29 22:46 train_data.parquet
-rw-r--r-- 1 KonuTech 197121  30M May 20  2022 train_labels.csv
-rw-r--r-- 1 KonuTech 197121  27M Oct 29 22:42 train_labels.parquet


In [4]:
def get_memory_usage():
    """Return the resident set size (RSS) of the current process in megabytes."""
    bytes_per_megabyte = 1024 * 1024
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / bytes_per_megabyte

In [5]:
# Define the data directory relative to the notebook's working directory
# (<project>/notebooks -> <project>/data) instead of hard-coding a user-specific
# absolute path, so the notebook also runs from other checkouts.
data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'data'))
data_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data'

In [6]:
# Folder (inside the data directory) that holds the per-chunk Parquet files
partitions_subdir = 'parquet_partitions'
parquet_dir = os.path.join(data_dir, partitions_subdir)
parquet_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data\\parquet_partitions'

In [7]:
# Parquet outputs that are deleted here if a previous run left them behind
files_to_remove = ['train_data.parquet', 'train_labels.parquet']
for file_name in files_to_remove:
    file_path = os.path.join(data_dir, file_name)
    if not os.path.exists(file_path):
        continue  # nothing to clean up for this name
    os.remove(file_path)

In [8]:
# Make sure the partition folder exists before listing it: on a fresh run it is
# absent and os.listdir() would raise FileNotFoundError.
os.makedirs(parquet_dir, exist_ok=True)

# List every entry in the partition folder
file_list = os.listdir(parquet_dir)

# Delete the files found there (sub-directories are left untouched)
for file_name in file_list:
    file_path = os.path.join(parquet_dir, file_name)
    if os.path.isfile(file_path):
        os.remove(file_path)

### train_labels.csv

In [9]:
# Read train_labels.csv from the data directory into a DataFrame
csv_file = 'train_labels.csv'
train_labels_csv_file = os.path.join(data_dir, csv_file)
train_labels = pd.read_csv(filepath_or_buffer=train_labels_csv_file)

In [10]:
# Convert the DataFrame to Parquet format.
# The target path is built with os.path.join (as in the other cells) rather than
# with a hard-coded '\\' separator inside an f-string.
parquet_file = 'train_labels.parquet'
train_labels.to_parquet(os.path.join(data_dir, parquet_file), index=False)

In [11]:
# Reload the labels from the Parquet file, replacing the CSV-backed DataFrame
train_labels_parquet_file = os.path.join(data_dir, parquet_file)
train_labels = pd.read_parquet(path=train_labels_parquet_file)

In [12]:
# Print the column dtypes, non-null counts and memory footprint of the labels
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB


In [13]:
# Summary statistics for the numeric column(s) of the labels
train_labels.describe()

Unnamed: 0,target
count,458913.0
mean,0.258934
std,0.43805
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


### train_data.csv

In [14]:
# Step 1: Name the training CSV; the chunked read itself happens further below
csv_file = 'train_data.csv'

In [15]:
# Step 2: Initialize an empty list to store the Parquet partition file paths
# (the same list is re-created next to the CSV iterator further below)
parquet_file_paths = []

In [16]:
chunk_size = 100_000  # rows per CSV chunk; adjust as needed
i = 0  # number of the chunk being processed, starting at zero
cumulative_rows = 0  # running total of rows across all chunks

# Create the partition output folder when it is not there yet
partition_dir_missing = not os.path.exists(parquet_dir)
if partition_dir_missing:
    os.makedirs(parquet_dir)


In [17]:
# Create a TextFileReader, which is iterable with chunks of `chunk_size` rows
# (100,000 as configured above).
csv_iterator = pd.read_csv(os.path.join(data_dir, csv_file), iterator=True, chunksize=chunk_size)

parquet_file_paths = []  # Reset the list that stores the Parquet partition file paths

In [18]:
# Stream the CSV chunk by chunk and write every chunk out as its own Parquet partition
for chunk in csv_iterator:
    # The chunk is already loaded at this point, so this reading is the baseline
    # taken just before the Parquet write
    before_memory = get_memory_usage()

    # Report the size of this chunk and the running row total
    num_rows = chunk.shape[0]
    cumulative_rows = cumulative_rows + num_rows
    print(f"Processing chunk {i}, rows: {num_rows}, cumulative rows: {cumulative_rows}")

    # Write the partition and remember where it went
    parquet_partition_file = os.path.join(parquet_dir, f'chunk_{i}.parquet')
    chunk.to_parquet(parquet_partition_file, index=False)
    parquet_file_paths.append(parquet_partition_file)

    # Second reading, taken once the partition has been saved
    after_memory = get_memory_usage()
    print(f"Memory usage before chunk: {before_memory:.2f} MB")
    print(f"Memory usage after chunk: {after_memory:.2f} MB")

    # Move on to the next chunk number
    i = i + 1

Processing chunk 0, rows: 100000, cumulative rows: 100000
Memory usage before chunk: 431.88 MB
Memory usage after chunk: 454.71 MB
Processing chunk 1, rows: 100000, cumulative rows: 200000
Memory usage before chunk: 456.16 MB
Memory usage after chunk: 472.36 MB
Processing chunk 2, rows: 100000, cumulative rows: 300000
Memory usage before chunk: 473.16 MB
Memory usage after chunk: 490.67 MB
Processing chunk 3, rows: 100000, cumulative rows: 400000
Memory usage before chunk: 488.58 MB
Memory usage after chunk: 507.42 MB
Processing chunk 4, rows: 100000, cumulative rows: 500000
Memory usage before chunk: 507.23 MB
Memory usage after chunk: 493.61 MB
Processing chunk 5, rows: 100000, cumulative rows: 600000
Memory usage before chunk: 492.80 MB
Memory usage after chunk: 509.95 MB
Processing chunk 6, rows: 100000, cumulative rows: 700000
Memory usage before chunk: 507.25 MB
Memory usage after chunk: 525.20 MB
Processing chunk 7, rows: 100000, cumulative rows: 800000
Memory usage before chunk

In [19]:
# Concatenate the Parquet partitions into a single DataFrame
parquet_partitions = [pd.read_parquet(partition) for partition in parquet_file_paths]
df = pd.concat(parquet_partitions, ignore_index=True)

# Unbind the per-partition frames right away: leaving the list bound would keep a
# second copy of the whole dataset in memory next to `df` for the rest of the notebook.
del parquet_partitions

In [None]:
# Drop rows that are exact duplicates across all columns.
# The customer_ID-only variant below is disabled; it would keep just the first row
# found for each customer_ID.
# df = df.drop_duplicates(["customer_ID"])
df = df.drop_duplicates()

In [None]:
# Step 4: Name the single Parquet file that the combined DataFrame is saved to
combined_parquet_file = 'train_data.parquet'

In [None]:
# Save the combined frame without its index, matching the other to_parquet calls in
# this notebook; after drop_duplicates() the index only holds leftover row positions.
df.to_parquet(os.path.join(data_dir, combined_parquet_file), index=False)

In [None]:
# Step 5: Remove individual Parquet partitions.
# Paths that are already gone are skipped, so re-running this cell does not raise
# FileNotFoundError (same guard style as the clean-up cells at the top).
for partition_file in parquet_file_paths:
    if os.path.exists(partition_file):
        os.remove(partition_file)

In [None]:
# Step 6: EDA on the combined Parquet file -- start by loading it back from disk
combined_parquet_path = os.path.join(data_dir, combined_parquet_file)
parquet_df = pd.read_parquet(combined_parquet_path)

In [None]:
# Print the full per-column summary (verbose=True requests the complete listing)
parquet_df.info(verbose=True)

In [None]:
# Count how many rows each customer_ID value has
parquet_df["customer_ID"].value_counts()

In [None]:
# Summary statistics for the numeric columns of the combined data
parquet_df.describe()

In [None]:
# Trigger garbage collection to clear unreferenced objects.
# Objects that are still bound to a name in the kernel are not released by this call.
gc.collect()