In [1]:
import gc
import os
import psutil
import pyarrow as pa
import time

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import random

In [3]:
current_directory = os.getcwd()
print("Current working directory:", current_directory)

Current working directory: C:\Users\KonuTech\zoomcamp-capstone-01\notebooks


In [4]:
!ls -lah "C:\Users\KonuTech\zoomcamp-capstone-01\data"

total 42G
drwxr-xr-x 1 KonuTech 197121    0 Oct 28 21:37 .
drwxr-xr-x 1 KonuTech 197121    0 Oct 28 22:45 ..
drwxr-xr-x 1 KonuTech 197121    0 Oct 28 21:45 parquet_partitions
-rw-r--r-- 1 KonuTech 197121  60M May 20  2022 sample_submission.csv
-rw-r--r-- 1 KonuTech 197121  32G May 20  2022 test_data.csv
-rw-r--r-- 1 KonuTech 197121  16G May 20  2022 train_data.csv
-rw-r--r-- 1 KonuTech 197121 6.5G Oct 28 21:42 train_data_combined.parquet
-rw-r--r-- 1 KonuTech 197121  30M May 20  2022 train_labels.csv


In [5]:
# Define the directory path
data_dir = os.path.join('C:\\', 'Users', 'KonuTech', 'zoomcamp-capstone-01', 'data')
data_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data'

In [6]:
# Step 4: Convert and save the combined DataFrame as a single Parquet file
combined_parquet_file = 'train_data_combined.parquet'

In [7]:
# Step 6: EDA on the combined Parquet file
parquet_df = pd.read_parquet(os.path.join(data_dir, combined_parquet_file))  # Read the combined Parquet file

In [8]:
# print(parquet_df.dtypes)
# print(parquet_df.describe())

In [9]:
# Trigger garbage collection to clear unreferenced objects
gc.collect()

36

In [10]:
parquet_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Data columns (total 190 columns):
 #    Column       Dtype  
---   ------       -----  
 0    customer_ID  object 
 1    S_2          object 
 2    P_2          float64
 3    D_39         float64
 4    B_1          float64
 5    B_2          float64
 6    R_1          float64
 7    S_3          float64
 8    D_41         float64
 9    B_3          float64
 10   D_42         float64
 11   D_43         float64
 12   D_44         float64
 13   B_4          float64
 14   D_45         float64
 15   B_5          float64
 16   R_2          float64
 17   D_46         float64
 18   D_47         float64
 19   D_48         float64
 20   D_49         float64
 21   B_6          float64
 22   B_7          float64
 23   B_8          float64
 24   D_50         float64
 25   D_51         float64
 26   B_9          float64
 27   R_3          float64
 28   D_52         float64
 29   P_3          float64
 30   B_10         flo

In [11]:
def save_histograms(dataframe, save_path):

    start_time = time.time()
    
    numeric_columns = dataframe.select_dtypes(include=['number'])
    
    # Create the directory if it doesn't exist
    os.makedirs(save_path, exist_ok=True)
    
    for column in numeric_columns.columns:
        plt.figure(figsize=(8, 6))
        
        sns.histplot(dataframe[column], kde=True, color='blue', bins=20)
        plt.title(f'Histogram with KDE of {column}')
        plt.xlabel(column)
        plt.ylabel('Density')
        
        # Calculate the number of events (N) and format it with thousands separators
        num_events = len(dataframe[column])
        formatted_num_events = '{:,}'.format(num_events)
        
        # Add annotation for N in the top-middle part of the histogram
        plt.text(0.5, 0.95, f'N={formatted_num_events}', fontsize=12, ha='center', transform=plt.gca().transAxes, bbox=dict(facecolor='white', alpha=0.7))
        
        # Save the histogram as a .jpg file with the column name
        file_name = f'{column}.jpg'
        file_path = os.path.join(save_path, file_name)
        plt.savefig(file_path)
        
        print(f'Saved: {file_name}')
        
        plt.close()  # Close the plot to avoid displaying in the notebook

        # Print time elapsed for each whole minute
        elapsed_time = time.time() - start_time
        if int(elapsed_time) % 60 == 0 or num_events == len(dataframe[column]) - 1:
            print(f'Time elapsed: {int(elapsed_time / 60)} minutes')

In [12]:
# Define the directory path
save_path = os.path.join('C:\\', 'Users', 'KonuTech', 'zoomcamp-capstone-01', 'eda', "histograms")
save_path

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\eda\\histograms'

In [13]:
save_histograms(parquet_df, save_path)

Saved: P_2.jpg
Saved: D_39.jpg
Saved: B_1.jpg
Saved: B_2.jpg
Saved: R_1.jpg
Saved: S_3.jpg
Saved: D_41.jpg
Saved: B_3.jpg
Saved: D_42.jpg
Saved: D_43.jpg
Saved: D_44.jpg
Saved: B_4.jpg
Saved: D_45.jpg
Saved: B_5.jpg
Saved: R_2.jpg
Saved: D_46.jpg
Saved: D_47.jpg
Saved: D_48.jpg
Saved: D_49.jpg
Saved: B_6.jpg
Saved: B_7.jpg
Saved: B_8.jpg
Saved: D_50.jpg
Saved: D_51.jpg
Saved: B_9.jpg
Saved: R_3.jpg
Saved: D_52.jpg
Saved: P_3.jpg
Saved: B_10.jpg
Saved: D_53.jpg
Saved: S_5.jpg
Saved: B_11.jpg
Saved: S_6.jpg
Saved: D_54.jpg
Saved: R_4.jpg
Saved: S_7.jpg
Saved: B_12.jpg
Saved: S_8.jpg
Saved: D_55.jpg
Saved: D_56.jpg
Saved: B_13.jpg
Saved: R_5.jpg
Saved: D_58.jpg
Saved: S_9.jpg
Saved: B_14.jpg
Saved: D_59.jpg
Saved: D_60.jpg
Saved: D_61.jpg
Saved: B_15.jpg
Saved: S_11.jpg
Saved: D_62.jpg
Saved: D_65.jpg
Saved: B_16.jpg
Saved: B_17.jpg
Saved: B_18.jpg
Saved: B_19.jpg
Saved: D_66.jpg
Saved: B_20.jpg
Saved: D_68.jpg
Saved: S_12.jpg
Saved: R_6.jpg
Time elapsed: 13 minutes
Saved: S_13.jpg
Saved: