In [25]:
import pandas as pd
import numpy as np
from itertools import cycle



def generate_fake_dataframe(size, cols, col_names=None, intervals=None, seed=None):
    """
    Build a pandas DataFrame of `size` rows filled with random fake data.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    cols : str
        One character per column, chained in a single string (e.g. "cififficcd"):
            c: column with categorical (string) values.
            i: column of integers.
            f: column of floats.
            d: column of dates.
    col_names : sequence of str, optional
        One name per column. When omitted, names are generated as
        "column_<position>_<cat|int|float|date>".
    intervals : list or dict, optional
        Value range of each column.

        As a list it must hold one entry per column, in the same order as `cols`.
        An entry may be:
            * None                 -> use the default interval of that column type;
            * (start, end)         -> for "i", "f" and "d" columns. The integer upper
                                      bound is exclusive; dates are sampled from the
                                      daily range start..end;
            * (family, n)          -> for "c" columns: sample n distinct values from a
                                      built-in family ("names", "cities", "products",
                                      "colors") and fill the column with them. `family`
                                      may also be an itertools.cycle of family names, in
                                      which case the next name is taken for each column;
            * [value, value, ...]  -> for "c" columns: the explicit list of categories.

        e.g.

        generate_fake_dataframe(
            size = 1000,
            cols = "cccfd",
            col_names=["name", "store", "prod_cat", "price","purchase_date"],
            intervals = [("names",15), ("cities",5), ("products",10), (20,150),("2019-01-01","2022-12-31")]
        )

        As a dict it overrides the default interval of whole column types, e.g.:

        intervals = {"d" : ("1996-01-01","1996-12-31"),
                     "c" : ("colors" , 7)}
    seed : int, optional
        Seed for the NumPy random generator; pass a value to get reproducible data.

    Returns
    -------
    pandas.DataFrame or None
        The generated frame. When an argument is invalid, an error message is
        printed and None is returned instead.
    """

    # Built-in sample values for categorical columns
    categories_dict = {'products': ['top', 'jeans', 'trousers', 'shirt', 'accessories', 'dress', 'skirt', 'heels', 'sneakers', 'shoes'],
                       'names'  : ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'],
                       'cities' : ['Stockholm', 'Denver', 'Moscow', 'Marseille', 'Palermo', 'Tokyo', 'Lisbon', 'Oslo', 'Nairobi', 'Río de Janeiro', 'Berlin', 'Bogotá', 'Manila', 'Madrid', 'Milwaukee'],
                       'colors' : ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white']
                      }

    # Default interval of each column type, used when none is specified
    default_intervals = {
        "c" : ("names", 10),
        "i" : (0,10),
        "f" : (0,100),
        "d" : ("2019-01-01","2022-12-31")
    }

    rng = np.random.default_rng(seed)

    #######################################################

    # Handling cols: reject unknown type codes up front, otherwise the matching
    # column would be silently left out of the result.
    unknown_codes = [col for col in cols if col not in default_intervals]
    if unknown_codes:
        return print(f"Error\nThe cols parameter has invalid column types: {unknown_codes}. Valid types are {list(default_intervals)}")

    # Handling col_names
    if col_names is None:
        # Automate col_names
        suffix = {"c" : "cat", "i" : "int", "f" : "float", "d" : "date"}
        col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)]
    else:
        # Accept any sequence (list, tuple, ...) and make sure no column is
        # dropped by zip() because of a length mismatch.
        col_names = list(col_names)
        if len(col_names) != len(cols):
            return print(f"Error\nThe fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements")

    # Handling intervals
    if isinstance(intervals, list):
        if len(intervals) != len(cols):
            return print(f"Error\nThe fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements")
    else:
        if isinstance(intervals, dict):
            # Only the known column types may be overridden
            if set(intervals) - set(default_intervals):
                return print("Error\nThe intervals parameter has invalid keys")

            # Update the default intervals dict
            default_intervals.update(intervals)

        intervals = [default_intervals[col] for col in cols]

    #######################################################

    # Build the columns first and create the DataFrame once at the end.
    # The random generator is called in the same order as the columns so that
    # a given seed always yields the same data.
    data = {}

    for col, col_name, interval in zip(cols, col_names, intervals):

        if interval is None:
            interval = default_intervals[col]

        # Check the type before calling len(): a scalar has no length.
        is_pair = isinstance(interval, tuple) and len(interval) == 2
        if not (is_pair or isinstance(interval, list)):
            return print(f"Error\nThe interval [{interval}] is neither a tuple of two elements nor a list of strings.")

        if col == "c":

            if isinstance(interval, list):
                categories = np.array(interval)

            else:
                cat_family, length = interval

                if isinstance(cat_family, cycle):
                    cat_family = next(cat_family)

                if cat_family not in categories_dict:
                    return print(f"Error\nThere are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}")

                pool = categories_dict[cat_family]

                # Sampling without replacement cannot return more values than the family holds.
                if not isinstance(length, (int, np.integer)) or not 1 <= length <= len(pool):
                    return print(f"Error\nCategory '{cat_family}' has {len(pool)} samples, so the number of categories must be an integer between 1 and {len(pool)} (got {length}).")

                categories = rng.choice(pool, length, replace = False, shuffle = True)

            data[col_name] = rng.choice(categories, size, shuffle = True)

        else:
            # "i", "f" and "d" columns all need a (start, end) pair
            if len(interval) != 2:
                return print(f"Error\nThe interval [{interval}] must have exactly two elements (start, end) for a column of type '{col}'.")

            start, end = interval

            if col == "i":
                data[col_name] = rng.integers(start, end, size)

            elif col == "f":
                data[col_name] = rng.uniform(start, end, size)

            else:
                data[col_name] = rng.choice(pd.date_range(start, end), size)

    return pd.DataFrame(data)
    
    #######################################################
    

In [27]:
# Demo: 1,000 fake retail purchases. One interval is given per column, in the
# same order as the type codes in `cols`. No seed is passed, so every run
# produces different rows.
generate_fake_dataframe(
    size=1000,
    cols="cccfd",
    col_names=["name", "store", "prod_cat", "price", "purchase_date"],
    intervals=[
        ("names", 15),                 # name: 15 values from the built-in names
        ("cities", 5),                 # store: 5 values from the built-in cities
        ("products", 10),              # prod_cat: 10 values from the built-in products
        (20, 150),                     # price: floats between 20 and 150
        ("2019-01-01", "2022-12-31"),  # purchase_date: dates in this range
    ],
)


Unnamed: 0,name,store,prod_cat,price,purchase_date
0,James,Tokyo,accessories,75.423947,2022-12-22
1,Robert,Manila,heels,36.715588,2022-12-27
2,William,Moscow,heels,38.899964,2020-02-26
3,Mary,Tokyo,accessories,113.226770,2021-02-18
4,Jennifer,Tokyo,skirt,101.679103,2021-05-29
...,...,...,...,...,...
995,John,Manila,shirt,75.423529,2019-03-16
996,Juan Luis,Bogotá,accessories,126.328938,2022-06-09
997,Jennifer,Tokyo,shoes,77.023464,2022-09-15
998,Ahmed,Madrid,skirt,76.944381,2022-05-19


In [19]:
# Same unseeded call run a second time: the columns and intervals are unchanged,
# but the sampled rows differ from one run to the next.
generate_fake_dataframe(
    size=1000,
    cols="cccfd",
    col_names=["name", "store", "prod_cat", "price", "purchase_date"],
    intervals=[
        ("names", 15),                 # name
        ("cities", 5),                 # store
        ("products", 10),              # prod_cat
        (20, 150),                     # price
        ("2019-01-01", "2022-12-31"),  # purchase_date
    ],
)


Unnamed: 0,name,store,prod_cat,price,purchase_date
0,Susan,Nairobi,skirt,129.596796,2022-12-29
1,Patricia,Madrid,shirt,106.391575,2022-04-28
2,John,Nairobi,trousers,82.510929,2019-03-28
3,Juan Luis,Madrid,shirt,81.249676,2019-11-23
4,Linda,Oslo,heels,71.658234,2022-12-21
...,...,...,...,...,...
995,Juan Luis,Nairobi,shoes,46.891553,2019-04-12
996,Patricia,Moscow,jeans,119.302852,2021-08-10
997,Robert,Nairobi,accessories,132.403603,2020-08-30
998,Richard,Madrid,heels,149.034794,2021-08-26
