In [112]:
import random
import string

import numpy as np
import pandas as pd
from scipy.stats import alpha, beta, gamma, truncnorm, uniform


In [113]:
def simulate_numerical_column(p_data, column_name, mean, sd, decimal_places, noise, distribution,low = None, upp = None, N=100):
    n = p_data.shape[0]
    if distribution == "Normal":
        rvs_object = truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)
    elif distribution == "Uniform":
        rvs_object = uniform(loc=mean, scale=sd)
    elif distribution == "Beta":
        rvs_object = beta(a=low, b=upp, loc=mean, scale=sd) 
    elif distribution == "Gamma":
        rvs_object = gamma(a=low, loc=mean, scale=sd)
    elif distribution == "Alpha":
        rvs_object = alpha(a=low, loc=mean, scale=sd)
    else: 
        print("Warning: Distribution is not supported, Normal distribution is the default")
        rvs_object = truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd) 
        
    values = np.zeros((n))
    col_data = []
    for i in range(N):
        x = np.array(rvs_object.rvs(n))
        values += x
        col_data.append(x)
        
    simulated_column_data = values/N 
    col_data.append(simulated_column_data)
    
    if noise > low:
        print("Warning: Noise is too large.")
    if noise == 0:
        noise_array = np.zeros((n))
    else:
        noise_array = np.random.uniform(-noise, noise, (n))
        
    simulated_noised_column = simulated_column_data + noise_array
    sn_rounded = np.round(simulated_noised_column, decimal_places)
    p_data[column_name] = sn_rounded.tolist()
    return p_data 

def simulate_categorical_column(p_data, column_name, class_given):
    n = p_data.shape[0]
    simulated_column = []
    for i in range(n):
        simulated_column.append(class_given)
    p_data[column_name] = simulated_column
    return p_data




In [114]:
data = pd.read_csv("users.csv")


In [115]:
df = simulate_numerical_column(data, "favourites_count", mean = 500, sd = 1, decimal_places = 2, noise = 0.1, distribution = "Power Normal",low = 100, upp = 1000)
pd.DataFrame(df).rename(columns={'favourites_count': 'Average Time Spend(mins)'}, inplace=True)

df = simulate_numerical_column(data, "statuses_count", mean = 15000, sd = 1, decimal_places = 2, noise = 0.1, distribution = "Power Normal",low = 10, upp = 50000)
pd.DataFrame(df).rename(columns={'statuses_count': 'Average Money Spend($)'}, inplace=True)

df = simulate_numerical_column(data, "followers_count", mean = 15000, sd = 1, decimal_places = 2, noise = 0.1, distribution = "Power Normal",low = 10, upp = 50000)
pd.DataFrame(df).rename(columns={'followers_count': 'Available balance($)'}, inplace=True)

df = df.drop(['screen_name', 'friends_count', 'listed_count', 'url','default_profile', 'profile_banner_url', 'default_profile_image', 'profile_image_url', 'profile_use_background_image', 'profile_background_image_url_https', 'profile_image_url_https', 'profile_sidebar_border_color', 'profile_background_tile', 'profile_sidebar_fill_color', 'profile_background_image_url','profile_background_color', 'profile_link_color', 'description', 'updated','dataset', 'geo_enabled', 'profile_text_color'], axis=1)




In [116]:
def random_dates(start, end, size):
    divide_by = 24 * 60 * 60 * 10**9
    start_u = start.value // divide_by
    end_u = end.value // divide_by
    return pd.to_datetime(np.random.randint(start_u, end_u, size), unit="D")

def random_genders(size, p=None):
    if not p:
        # default probabilities
        p = (0.49, 0.49, 0.01, 0.01)
    gender = ("M", "F", "O", "")
    return np.random.choice(gender, size=size, p=p)



In [117]:
df['Birthdate'] = random_dates(start=pd.to_datetime('1960-01-01'), end=pd.to_datetime('2012-01-01'), size=size)
df['Gender'] = random_genders(size)
df

Unnamed: 0,id,name,Average Money Spend($),Available balance($),Average Time Spend(mins),created_at,lang,time_zone,location,utc_offset,protected,verified,Birthdate,Gender
0,3610511,Davide Dellacasa,14999.95,15000.02,499.89,Fri Apr 06 10:58:22 +0000 2007,it,Rome,Roma,3600.0,,,1995-01-07,M
1,5656162,Simone Economo,15000.06,14999.81,500.04,Mon Apr 30 15:08:42 +0000 2007,en,Rome,"Rome, Italy",3600.0,,,1992-08-01,F
2,5682702,tacone,14999.88,14999.98,500.04,Tue May 01 11:53:40 +0000 2007,en,Rome,Internets,3600.0,,,2011-01-28,M
3,6067292,alesaura,14999.82,15000.05,499.92,Tue May 15 16:55:16 +0000 2007,en,Rome,,3600.0,,,2007-04-04,M
4,6015122,Angelo,15000.01,15000.00,499.96,Sun May 13 19:52:00 +0000 2007,it,Rome,"iPhone: 44.069630,12.569966",3600.0,,,1988-01-13,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1476,1127280169,Frilaif Official,15000.06,14999.96,500.19,Mon Jan 28 07:13:47 +0000 2013,it,,World Wide Mind,,,,1974-02-19,F
1477,1156344000,♔♥HeartsQueen♥♔,15000.13,15000.25,500.05,Thu Feb 07 07:27:08 +0000 2013,it,,,,,,2011-06-16,M
1478,1169114810,angelagervasi,14999.92,15000.17,500.04,Mon Feb 11 14:50:19 +0000 2013,it,,,,,,1970-10-06,M
1479,1212975186,Midnight,15000.04,15000.17,500.10,Sat Feb 23 17:46:55 +0000 2013,it,,,,,,1987-04-06,M
