In [5]:
import pandas as pd
import os

# Load the original CSV file
file_path = '/content/3.6M-Japan-lifebear.com-Largest-Notebook-App-UsersDB-csv-2019.csv'

try:
    # Step 1: Load the data into a DataFrame
    df = pd.read_csv(file_path, delimiter=';')  # Adjust delimiter if necessary
    print("Original data loaded successfully.")
    print("First few rows before renaming and formatting:")
    print(df.head())

    # Step 2: Rename the specified columns
    df = df.rename(columns={
        'id': 'id_num',
        'login_id': 'user_name',
        'mail_address': 'email_address',
        'birthday_on': 'DOB',
        'created_at': 'start_date'
    })

    # Step 3: Format 'start_date' to only have time (HH-MM-SS)
    if 'start_date' in df.columns:
        df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce').dt.strftime('%H-%M-%S')
        df['start_date'] = df['start_date'].fillna('NaN')  # Fill unconvertible values with 'NaN'
        print("Problematic 'start_date' values have been filled with 'NaN'.")

    # Step 4: Save the updated DataFrame to a new CSV file
    output_file_path = '/content/step_1.csv'
    if os.path.exists(output_file_path):
        print(f"Warning: {output_file_path} already exists and will be overwritten.")

    df.to_csv(output_file_path, index=False)
    print(f"Data successfully saved to {output_file_path}.")

except FileNotFoundError:
    print(f"File not found at the specified path: {file_path}")
except Exception as e:
    print(f"An error occurred: {e}")



  df = pd.read_csv(file_path, delimiter=';')  # Adjust delimiter if necessary


Original data loaded successfully.
First few rows before renaming and formatting:
   id    login_id              mail_address                          password  \
0   1    sugimoto   sugimoto@lifebear.co.jp  f0bac04aa1b45cf443d722d6f71c0250   
1   2         kou  nakanishi@lifebear.co.jp  48207c322ee5bb156ffec9f08c960aaa   
2   3      yusuke     yuozawa1208@gmail.com  048261a8024ce51d379eb53cc51aaf33   
3   4  entyan1106        endo1106@gmail.com  cd77a9dac26260a104facda5665eb3ab   
4   5      kuriki          kuriki@wavy4.com  a026597c294cc48cd20ae361f10cbab1   

            created_at          salt birthday_on  gender  
0  2012-01-13 22:54:05  yGwBKynnsctI  1984-11-09     0.0  
1  2012-01-14 12:48:31  aha6EuRYCDvU  1986-11-13     0.0  
2  2012-01-17 15:33:22  PVS59dPWk9BH  1984-12-08     0.0  
3  2012-01-17 15:37:02  vLZI6TVCJowN  1987-11-06     0.0  
4  2012-01-17 18:52:32  swFznWWk79fg  1986-10-21     0.0  
Problematic 'start_date' values have been filled with 'NaN'.
Data successfull

In [6]:
import math

# Step 1: Load the cleaned data from 'step_1.csv'
cleaned_file_path = '/content/step_1.csv'

try:
    df = pd.read_csv(cleaned_file_path)
    print("Cleaned data loaded successfully.")
    print("First few rows of the cleaned data:")
    print(df.head())

    # Step 2: Calculate the number of rows per chunk
    total_rows = len(df)
    num_chunks = 5
    chunk_size = math.ceil(total_rows / num_chunks)

    # Step 3: Split the DataFrame into 5 chunks
    for i in range(num_chunks):
        start_index = i * chunk_size
        end_index = (i + 1) * chunk_size if (i + 1) * chunk_size < total_rows else total_rows

        # Get the chunk
        chunk = df.iloc[start_index:end_index]

        # Step 4: Save the chunk to a new CSV file
        chunk_file_path = f'/content/lifebear_dataset_chunk_{i + 1}.csv'
        chunk.to_csv(chunk_file_path, index=False)
        print(f"Chunk {i + 1} saved to '{chunk_file_path}'")

    print("All chunks processed and saved successfully.")

except FileNotFoundError:
    print(f"File not found at the specified path: {cleaned_file_path}")
except Exception as e:
    print(f"An error occurred: {e}")


  df = pd.read_csv(cleaned_file_path)


Cleaned data loaded successfully.
First few rows of the cleaned data:
   id_num   user_name             email_address  \
0       1    sugimoto   sugimoto@lifebear.co.jp   
1       2         kou  nakanishi@lifebear.co.jp   
2       3      yusuke     yuozawa1208@gmail.com   
3       4  entyan1106        endo1106@gmail.com   
4       5      kuriki          kuriki@wavy4.com   

                           password start_date          salt         DOB  \
0  f0bac04aa1b45cf443d722d6f71c0250   22-54-05  yGwBKynnsctI  1984-11-09   
1  48207c322ee5bb156ffec9f08c960aaa   12-48-31  aha6EuRYCDvU  1986-11-13   
2  048261a8024ce51d379eb53cc51aaf33   15-33-22  PVS59dPWk9BH  1984-12-08   
3  cd77a9dac26260a104facda5665eb3ab   15-37-02  vLZI6TVCJowN  1987-11-06   
4  a026597c294cc48cd20ae361f10cbab1   18-52-32  swFznWWk79fg  1986-10-21   

   gender  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  
Chunk 1 saved to '/content/lifebear_dataset_chunk_1.csv'
Chunk 2 saved to '/content/lifebear_d

In [7]:
# Step 1: Load the cleaned data from 'step_1.csv'
cleaned_file_path = '/content/step_1.csv'

try:
    # Load the cleaned data
    df = pd.read_csv(cleaned_file_path)
    print("Cleaned data loaded successfully.")
    print("First few rows of the cleaned data:")
    print(df.head())

    # Step 2: Calculate the number of rows per chunk
    total_rows = len(df)
    num_chunks = 5
    chunk_size = math.ceil(total_rows / num_chunks)

    # Initialize a DataFrame to collect discrepancies
    garbage_df = pd.DataFrame()

    # Step 3: Split the DataFrame into 5 chunks
    for i in range(num_chunks):
        start_index = i * chunk_size
        end_index = (i + 1) * chunk_size if (i + 1) * chunk_size < total_rows else total_rows

        # Get the chunk
        chunk = df.iloc[start_index:end_index]

        # Step 4: Identify rows with missing values or discrepancies in the chunk
        # We will assume that any row with missing values is considered a discrepancy
        discrepancies = chunk[chunk.isnull().any(axis=1)]

        # If discrepancies are found, add them to the garbage DataFrame and remove from the current chunk
        if not discrepancies.empty:
            print(f"Discrepancies found in chunk {i + 1}")
            garbage_df = pd.concat([garbage_df, discrepancies])
            chunk = chunk.drop(discrepancies.index)

        # Step 5: Save the cleaned chunk to a new CSV file
        chunk_file_path = f'/content/lifebear_dataset_chunk_{i + 1}.csv'
        chunk.to_csv(chunk_file_path, index=False)
        print(f"Chunk {i + 1} saved to '{chunk_file_path}'")

    # Step 6: Save the discrepancies to 'garbage.csv' if any were found
    garbage_file_path = '/content/garbage.csv'
    if not garbage_df.empty:
        garbage_df.to_csv(garbage_file_path, index=False)
        print(f"Discrepancies saved to '{garbage_file_path}'")
    else:
        print("No discrepancies found in any of the chunks.")

    print("All chunks processed and saved successfully.")

except FileNotFoundError:
    print(f"File not found at the specified path: {cleaned_file_path}")
except Exception as e:
    print(f"An error occurred: {e}")


  df = pd.read_csv(cleaned_file_path)


Cleaned data loaded successfully.
First few rows of the cleaned data:
   id_num   user_name             email_address  \
0       1    sugimoto   sugimoto@lifebear.co.jp   
1       2         kou  nakanishi@lifebear.co.jp   
2       3      yusuke     yuozawa1208@gmail.com   
3       4  entyan1106        endo1106@gmail.com   
4       5      kuriki          kuriki@wavy4.com   

                           password start_date          salt         DOB  \
0  f0bac04aa1b45cf443d722d6f71c0250   22-54-05  yGwBKynnsctI  1984-11-09   
1  48207c322ee5bb156ffec9f08c960aaa   12-48-31  aha6EuRYCDvU  1986-11-13   
2  048261a8024ce51d379eb53cc51aaf33   15-33-22  PVS59dPWk9BH  1984-12-08   
3  cd77a9dac26260a104facda5665eb3ab   15-37-02  vLZI6TVCJowN  1987-11-06   
4  a026597c294cc48cd20ae361f10cbab1   18-52-32  swFznWWk79fg  1986-10-21   

   gender  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  
Discrepancies found in chunk 1
Chunk 1 saved to '/content/lifebear_dataset_chunk_1.csv'
Discre

In [8]:
# List of chunk file paths
chunk_files = [
    '/content/lifebear_dataset_chunk_1.csv',
    '/content/lifebear_dataset_chunk_2.csv',
    '/content/lifebear_dataset_chunk_3.csv',
    '/content/lifebear_dataset_chunk_4.csv',
    '/content/lifebear_dataset_chunk_5.csv'
]

try:
    # Step 1: Initialize an empty list to collect DataFrames
    dataframes = []

    # Step 2: Load each chunk and append it to the list
    for chunk_file in chunk_files:
        if os.path.exists(chunk_file):
            df = pd.read_csv(chunk_file)
            dataframes.append(df)
            print(f"{chunk_file} loaded successfully.")
        else:
            print(f"Warning: {chunk_file} not found and will be skipped.")

    # Step 3: Filter out empty or all-NA DataFrames
    filtered_dataframes = [
        df for df in dataframes if not df.empty and not df.isnull().all().all()
    ]

    # Step 4: Concatenate all filtered DataFrames into a single DataFrame
    if filtered_dataframes:
        merged_df = pd.concat(filtered_dataframes, ignore_index=True)
        print("All chunks merged successfully.")

        # Step 5: Save the merged DataFrame to 'clean_lifebear.csv'
        output_file_path = '/content/clean_lifebear.csv'
        merged_df.to_csv(output_file_path, index=False)
        print(f"Data successfully saved to {output_file_path}.")
    else:
        print("No valid chunk files were loaded, so the merging process could not be completed.")

except Exception as e:
    print(f"An error occurred: {e}")


/content/lifebear_dataset_chunk_1.csv loaded successfully.
/content/lifebear_dataset_chunk_2.csv loaded successfully.
/content/lifebear_dataset_chunk_3.csv loaded successfully.
/content/lifebear_dataset_chunk_4.csv loaded successfully.
/content/lifebear_dataset_chunk_5.csv loaded successfully.
All chunks merged successfully.
Data successfully saved to /content/clean_lifebear.csv.
