In [1]:
import pandas as pd
import warnings
import sys
sys.path.append('../src')
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)
import matplotlib.pyplot as plt
print("Package Imported")

Package Imported


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

def read_multiple_csv_files(file_paths: list, chunk_size: int = 100000) -> pd.DataFrame:
    """
    Read several CSV files chunk by chunk and stack them into one DataFrame.

    Each file is parsed ``chunk_size`` rows at a time. Every chunk is kept
    until the final concatenation, so the complete result still has to fit
    in memory; chunking only bounds the size of each individual parse step.

    Parameters:
    - file_paths (list): Paths of the CSV files, read in the given order.
    - chunk_size (int): The number of rows to read at a time from each CSV file.

    Returns:
    - pd.DataFrame: The rows of all files concatenated in reading order with a
      fresh 0..n-1 index. An empty DataFrame is returned when no chunk was
      read at all (for example when ``file_paths`` is empty).
    """
    dfs = []

    for file_path in file_paths:
        # The context manager closes the underlying file handle even when
        # parsing one of the chunks raises.
        with pd.read_csv(file_path, chunksize=chunk_size) as chunk_iter:
            dfs.extend(chunk_iter)
        print(f"{file_path} finished")

    # pd.concat raises ValueError on an empty list, so short-circuit here.
    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs, ignore_index=True)


In [4]:
# One cleaned part file per quarter (2015Q1, 2015Q3, 2016Q1, 2016Q3),
# all located under /content/drive/MyDrive/cleaned.
file_paths = [
    '/content/drive/MyDrive/cleaned/2015Q1.csv/part-00000-3cc23a57-3732-4fa2-a6e4-a43cece1f65a-c000.csv',
    '/content/drive/MyDrive/cleaned/2015Q3.csv/part-00000-9ecb8a62-7c5d-4705-b23f-65541eb8d458-c000.csv',
    '/content/drive/MyDrive/cleaned/2016Q1.csv/part-00000-0115a5ba-1db7-421c-866c-7f1446998bf2-c000.csv',
    '/content/drive/MyDrive/cleaned/2016Q3.csv/part-00000-08a86052-5429-46ab-8fb9-69b0411fd0ab-c000.csv',
]

# Read the four files, in the listed order, into one combined DataFrame.
df = read_multiple_csv_files(file_paths)

/content/drive/MyDrive/cleaned/2015Q1.csv/part-00000-3cc23a57-3732-4fa2-a6e4-a43cece1f65a-c000.csv finished
/content/drive/MyDrive/cleaned/2015Q3.csv/part-00000-9ecb8a62-7c5d-4705-b23f-65541eb8d458-c000.csv finished
/content/drive/MyDrive/cleaned/2016Q1.csv/part-00000-0115a5ba-1db7-421c-866c-7f1446998bf2-c000.csv finished
/content/drive/MyDrive/cleaned/2016Q3.csv/part-00000-08a86052-5429-46ab-8fb9-69b0411fd0ab-c000.csv finished


In [5]:
def plot_y_label_vs_date(df: pd.DataFrame, y_label_column: str, date_column: str, max_points: int = 1000):
    """
    Scatter-plot the rows whose label equals 1 against their date.

    The caller's DataFrame is not modified: the date values are parsed on a
    private copy of the rows that are actually drawn.

    Parameters:
    - df: pandas DataFrame containing the data.
    - y_label_column: Name of the label column (e.g., 'y_label').
    - date_column: Name of the date column (e.g., 'ACT_PERIOD').
    - max_points: Maximum number of points to plot (default 1000). When more
      rows have label 1, a random subset of this size is drawn with a fixed
      seed, so repeated calls plot the same points.

    Returns:
    - None. The figure is shown via plt.show().
    """
    # Keep only the positive rows and the two columns the plot needs.
    positives = df.loc[df[y_label_column] == 1, [date_column, y_label_column]]

    # Sample a subset if the number of rows is too large
    if len(positives) > max_points:
        positives = positives.sample(n=max_points, random_state=42)

    # Parse dates only for the rows that will be drawn; values that cannot be
    # parsed become NaT. assign() returns a new frame, so df keeps its
    # original column.
    positives = positives.assign(
        **{date_column: pd.to_datetime(positives[date_column], errors='coerce')}
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(
        positives[date_column],
        positives[y_label_column],
        color='skyblue',
        marker='o',
        label=f'{y_label_column} = 1',
    )

    # Label the plot with the columns that were actually passed in.
    ax.set_xlabel(date_column)
    ax.set_ylabel(y_label_column)
    ax.set_title(f'Scatter Plot of {y_label_column} = 1 vs. {date_column}')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    ax.legend()
    plt.show()

# Example usage:
# Assuming your DataFrame is `df` and contains 'y_label' and 'ACT_PERIOD' columns
# plot_y_label_vs_date(df, 'y_label', 'ACT_PERIOD')


In [6]:
# df['ACT_PERIOD'] = pd.to_datetime(df['ACT_PERIOD'], errors='coerce')

# # Find the max (most recent) and min (earliest) ACT_PERIOD
# max_act_period = df['ACT_PERIOD'].max()
# min_act_period = df['ACT_PERIOD'].min()

# # Print the results
# print("Max ACT_PERIOD:", max_act_period)
# print("Min ACT_PERIOD:", min_act_period)

In [7]:
# Drop every row that has a missing value in any column. The result is bound
# back to `df` instead of using inplace=True, which pandas discourages; the
# later cells keep working on the same name.
df = df.dropna()

# Count how many of the remaining rows fall under each y_label value.
y_label_distribution = df['y_label'].value_counts()

# Print the result
print(y_label_distribution)

y_label
0    121217201
1       686095
Name: count, dtype: int64


In [9]:
# Parse the reporting period; values that cannot be parsed become NaT and
# therefore fall into neither window below.
df['ACT_PERIOD'] = pd.to_datetime(df['ACT_PERIOD'], errors='coerce')

# Both windows are half-open: start date included, end date excluded.
train_start_date, train_end_date = pd.Timestamp('2015-01-01'), pd.Timestamp('2020-01-01')
test_start_date, test_end_date = pd.Timestamp('2022-01-01'), pd.Timestamp('2024-01-01')

train_data = df[df['ACT_PERIOD'].between(train_start_date, train_end_date, inclusive='left')]
test_data = df[df['ACT_PERIOD'].between(test_start_date, test_end_date, inclusive='left')]


In [15]:
# Write each split to its own CSV file, leaving out the index column.
for split_frame, csv_path in (
    (train_data, '/content/train_data.csv'),
    (test_data, '/content/test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [13]:
# Count how many rows of the training split fall under each y_label value.
y_label_distribution = train_data['y_label'].value_counts()
print(y_label_distribution)

y_label
0    81563982
1      166487
Name: count, dtype: int64


In [14]:
# Count how many rows of the test split fall under each y_label value.
y_label_distribution = test_data['y_label'].value_counts()
print(y_label_distribution)

y_label
0    13981462
1       92839
Name: count, dtype: int64
