In [1]:
import time
from IPython.utils import io
import pandas as pd
import numpy as np

# Load the people dataset from disk into a DataFrame
df = pd.read_csv('people.csv')

# Preview the top five rows to sanity-check the load
df.head(5)

Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title
0,1,4defE49671cF860,Sydney,Shannon,Male,tvang@example.net,574-440-1423x9799,2020-07-09,Technical brewer
1,2,F89B87bCf8f210b,Regina,Lin,Male,helen14@example.net,001-273-664-2268x90121,1909-06-20,"Teacher, adult education"
2,3,Cad6052BDd5DEaf,Pamela,Blake,Female,brent05@example.org,927-880-5785x85266,1964-08-19,Armed forces operational officer
3,4,e83E46f80f629CD,Dave,Hoffman,Female,munozcraig@example.org,001-147-429-8340x608,2009-02-19,Ship broker
4,5,60AAc4DcaBcE3b6,Ian,Campos,Female,brownevelyn@example.net,166-126-4390,1997-10-02,Media planner


In [2]:
# Normalise the column labels: spaces become underscores, and
# 'Sex' / 'Date of birth' get the shorter names 'Gender' / 'DOB'.
column_renames = {
    'User Id': 'User_Id',
    'First Name': 'First_Name',
    'Last Name': 'Last_Name',
    'Sex': 'Gender',
    'Date of birth': 'DOB',
    'Job Title': 'Job_Title',
}
df = df.rename(columns=column_renames)


In [3]:
# Summarise the frame after the rename: row count, column names, dtypes and memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 9 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   Index       int64 
 1   User_Id     object
 2   First_Name  object
 3   Last_Name   object
 4   Gender      object
 5   Email       object
 6   Phone       object
 7   DOB         object
 8   Job_Title   object
dtypes: int64(1), object(8)
memory usage: 137.3+ MB


In [4]:
# Start the timer for this cell. Because it starts before the `def` below, the interval
# measured up to `end_time` also covers defining increasing_people_csv, not only calling it.
start_time = time.time()

def increasing_people_csv(df, target_size):
    """Return a new DataFrame of exactly ``target_size`` rows built by repeating ``df``.

    The rows of ``df`` are tiled in their original order as many whole times as
    fit into ``target_size``; a leading slice of ``df`` is then appended to cover
    any remainder. When ``target_size`` is smaller than ``len(df)`` the result is
    simply the first ``target_size`` rows. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows to repeat.
    target_size : int
        Exact number of rows wanted in the result; must be non-negative.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``target_size`` rows, the same columns as ``df`` and a
        fresh 0..target_size-1 index.

    Raises
    ------
    ValueError
        If ``target_size`` is negative, or if ``df`` is empty while
        ``target_size`` is positive (there are no rows to repeat).
    """
    if target_size < 0:
        raise ValueError(f"target_size must be non-negative, got {target_size}")

    # Nothing to tile: return an empty frame that keeps the column layout.
    if target_size == 0:
        return df.iloc[:0].reset_index(drop=True)

    original_size = len(df)
    if original_size == 0:
        raise ValueError("cannot grow an empty DataFrame to a positive target_size")

    full_copies, remainder = divmod(target_size, original_size)

    # pd.concat builds a new frame from its inputs, so the same object can be
    # listed repeatedly; materialising df.copy() per repetition only adds
    # temporary memory pressure.
    parts = [df] * full_copies
    if remainder:
        # Partial pass over df to land on the exact row count. This is also the
        # only part when target_size < len(df), i.e. the truncation case.
        parts.append(df.iloc[:remainder])

    return pd.concat(parts, ignore_index=True)

# Number of rows the enlarged dataset should contain
target_size = 14_000_000

# Grow the dataset to the exact target size
grown_df = increasing_people_csv(df, target_size)

# Stop the timer here, so the copy and info() call below are not part of the measured interval
end_time = time.time()

# Rebind `df` to a separate copy of the enlarged frame; `grown_df` remains available under its own name
df = grown_df.copy()


# Confirm the new row count and memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000000 entries, 0 to 13999999
Data columns (total 9 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   Index       int64 
 1   User_Id     object
 2   First_Name  object
 3   Last_Name   object
 4   Gender      object
 5   Email       object
 6   Phone       object
 7   DOB         object
 8   Job_Title   object
dtypes: int64(1), object(8)
memory usage: 961.3+ MB


In [5]:
# Seconds elapsed between the most recent start_time / end_time stamps
execution_time = end_time - start_time

# Split the elapsed seconds into whole minutes and leftover whole seconds
minutes, seconds = (int(part) for part in divmod(execution_time, 60))

print(f"Total execution time: {minutes} minutes {seconds} seconds")

Total execution time: 0 minutes 3 seconds


In [6]:
# Time the CSV export of the enlarged DataFrame
start_time = time.time()
grown_df.to_csv('people_increased.csv', index=False)  # index=False: do not write the row index as a column
end_time = time.time()

# Elapsed seconds, split into whole minutes and leftover whole seconds
execution_time = end_time - start_time
minutes, seconds = (int(part) for part in divmod(execution_time, 60))

print(f"Total execution time: {minutes} minutes {seconds} seconds")

Total execution time: 0 minutes 33 seconds
