Basado en: 

In [19]:
# Mount Google Drive at /content/drive.
# This cell relies on the google.colab package, so it is meant to be run inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Leer datos

In [21]:
# Load the raw weekly-sales dataset (one row per store and date) and preview it.
# The file name is resolved relative to the current working directory.
df = pd.read_csv('Walmart_Sales_Data.csv')
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [22]:
# Discard every column that is not used downstream; only the store id and
# its weekly sales figure are kept.
unused_columns = ['Date', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,Store,Weekly_Sales
0,1,1643690.9
1,1,1641957.44
2,1,1611968.17
3,1,1409727.59
4,1,1554806.68


In [23]:
# Reshape from long to wide format: one column per store, one row per
# observation number. `idx` numbers each store's rows in their current order
# (0, 1, 2, ...) so that the stores line up row by row after the pivot.
row_in_store = df.groupby('Store').cumcount()
df = (
    df.assign(idx=row_in_store)
      .pivot(index='idx', columns='Store', values='Weekly_Sales')
)
df.head()

Store,1,2,3,4,5,6,7,8,9,10,...,36,37,38,39,40,41,42,43,44,45
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1643690.9,2136989.46,461622.22,2135143.87,317173.1,1652635.1,496725.44,1004137.09,549505.55,2193048.75,...,467546.74,536006.73,358496.14,1230596.8,1001943.8,1086533.18,543384.01,647029.28,281090.95,890689.51
1,1641957.44,2137809.5,420728.96,2188307.39,311825.7,1606283.86,524104.92,994801.4,552677.48,2176028.52,...,469563.7,529852.7,342214.9,1266229.07,955338.29,1075656.34,575709.96,682918.99,286857.13,656988.64
2,1611968.17,2124451.54,421642.19,2049860.26,303447.57,1567138.07,506760.54,963960.37,511327.9,2113432.58,...,470281.03,510382.5,327237.92,1230591.97,916289.2,1052034.74,508794.87,658997.55,267956.3,841264.04
3,1409727.59,1865097.27,407204.86,1925728.84,270281.63,1432953.21,496083.24,847592.11,473773.27,2006774.96,...,447519.44,513615.82,334222.73,1168582.02,863917.41,991941.73,491510.58,618702.79,273079.07,741891.65
4,1554806.68,1991013.13,415202.04,1971057.44,288855.71,1601348.82,491419.55,881503.95,507297.88,1987090.09,...,480203.43,519255.68,372239.89,1266254.21,990152.28,1063557.49,554972.42,658600.05,284617.27,777951.22


# Funciones para generar datos sintéticos

In [24]:
def frequency_table(data, bins=5):
    """Build an absolute-frequency table (histogram) for a numeric sample.

    The counts are computed with ``np.histogram`` rather than by drawing a
    matplotlib histogram, so calling this function neither creates nor closes
    any pyplot figure.

    Parameters
    ----------
    data : array-like of numbers
        Sample to tabulate. NaN values are ignored.
    bins : int or sequence, default 5
        Forwarded to ``np.histogram``: the number of equal-width bins, or
        explicit bin edges.

    Returns
    -------
    dict
        Maps ``(lower_edge, upper_edge)`` tuples to the integer number of
        observations in that bin, in ascending bin order. Every bin is
        half-open ``[lower, upper)`` except the last one, which also includes
        its upper edge.
    """
    values = np.asarray(data, dtype=float)
    # Drop NaNs up front: np.histogram cannot derive a bin range from them.
    values = values[~np.isnan(values)]

    counts, edges = np.histogram(values, bins=bins)

    return {
        (edges[k], edges[k + 1]): int(counts[k])
        for k in range(len(counts))
    }

In [25]:
def cdf(random_variable):
    """Empirical cumulative distribution function of a sample.

    Returns a pair ``(values, probabilities)``: the sorted distinct values
    found in ``random_variable`` and, for each of them, the fraction of
    observations that are less than or equal to it.
    """
    distinct_values, occurrences = np.unique(random_variable, return_counts=True)
    running_total = np.cumsum(occurrences)
    sample_size = running_total[-1]
    probabilities = running_total / sample_size
    return distinct_values, probabilities

In [26]:
def generate_multivariate_data(X, bins=10, N=1000, random_state=None):
    """Generate N synthetic rows that imitate the rows of a numeric DataFrame.

    Steps (numbering follows the original implementation):

    i)      Every observation is replaced by the value of its column's
            empirical CDF (computed with ``cdf``).
    ii)     A frequency table with ``bins`` bins is built for every column
            (with ``frequency_table``) and its relative frequencies are
            accumulated.
    iii-iv) N row positions of ``X`` are drawn uniformly with replacement.
    v-vi)   For each drawn row and each column, the first bin whose
            cumulative relative frequency reaches the observation's CDF value
            is selected (the last bin if none does), and a value is drawn
            from a normal distribution centred on that bin. Because all
            columns of a synthetic row come from the same drawn source row,
            the columns are not sampled independently of each other.

    Rows are addressed by position, so ``X`` may carry any index. The result
    is assembled in a single step instead of being grown one row at a time.

    Parameters
    ----------
    X : pandas.DataFrame
        Source data; every column must be numeric.
    bins : int or sequence, default 10
        Forwarded to ``frequency_table`` for every column.
    N : int, default 1000
        Number of synthetic rows to generate.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. When None, NumPy's
        global random state is used (as controlled by ``np.random.seed``).

    Returns
    -------
    pandas.DataFrame
        N rows with the same columns as ``X`` and a fresh 0..N-1 index.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    columns = X.columns
    n_obs = len(X)

    # i) - ii) per-column empirical CDF values and cumulative frequency tables
    cdf_at_obs = []   # per column: CDF value of each original observation
    bin_edges = []    # per column: array of shape (n_bins, 2) -> (lower, upper)
    cum_freqs = []    # per column: cumulative relative frequency of each bin

    for col in columns:
        column = X[col]
        observed = column.to_numpy()

        x_sorted, F = cdf(observed)
        # Every observation is present in x_sorted, so searchsorted returns
        # exactly the position of its distinct value.
        cdf_at_obs.append(F[np.searchsorted(x_sorted, observed)])

        table = frequency_table(column, bins=bins)
        bin_edges.append(np.array(list(table.keys()), dtype=float).reshape(-1, 2))
        cum_freqs.append(np.cumsum([count / n_obs for count in table.values()]))

    # iii) - iv) N row positions between 0 and n-1
    sampled_rows = rng.randint(low=0, high=n_obs, size=N)

    # v) - vi) locate the bin of every sampled observation
    centres = np.empty((N, len(columns)), dtype=float)
    spreads = np.empty((N, len(columns)), dtype=float)

    for pos in range(len(columns)):
        h = cdf_at_obs[pos][sampled_rows]
        # First bin whose cumulative frequency is >= h ...
        bin_idx = np.searchsorted(cum_freqs[pos], h, side='left')
        # ... falling back to the last bin when rounding leaves the final
        # cumulative frequency slightly below h.
        bin_idx = np.minimum(bin_idx, len(cum_freqs[pos]) - 1)

        lower = bin_edges[pos][bin_idx, 0]
        upper = bin_edges[pos][bin_idx, 1]
        centres[:, pos] = (lower + upper) / 2
        # Standard deviation of width/7 puts the bin edges 3.5 sigma away
        # from the centre, so nearly all draws stay inside the bin.
        spreads[:, pos] = (upper - lower) / 7

    # One broadcast draw, filled row by row (all columns of row 0, then row 1, ...).
    synthetic = rng.normal(centres, spreads)

    return pd.DataFrame(synthetic, columns=columns)

# Generación de datos sintéticos

In [27]:
# Build the combined real + synthetic dataset, tagging every row with its origin.

# generate_multivariate_data draws from NumPy's global random state, so seed
# it here to make the synthetic sample reproducible across runs.
SEED = 42
np.random.seed(SEED)

# Work on a copy: the 'Type' label added below must not leak into `df`,
# otherwise re-running this cell would pass a text column to the generator.
df_real = df.copy()
df_synthetic = generate_multivariate_data(X=df_real, bins=25, N=150000)

df_real['Type'] = 'real'
df_synthetic['Type'] = 'synthetic'

df_total = pd.concat([df_real, df_synthetic], ignore_index=True)

In [29]:
# Summarize the combined real + synthetic frame: number of rows, and the
# non-null count and dtype of every column.
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150143 entries, 0 to 150142
Data columns (total 46 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   1       150143 non-null  float64
 1   2       150143 non-null  float64
 2   3       150143 non-null  float64
 3   4       150143 non-null  float64
 4   5       150143 non-null  float64
 5   6       150143 non-null  float64
 6   7       150143 non-null  float64
 7   8       150143 non-null  float64
 8   9       150143 non-null  float64
 9   10      150143 non-null  float64
 10  11      150143 non-null  float64
 11  12      150143 non-null  float64
 12  13      150143 non-null  float64
 13  14      150143 non-null  float64
 14  15      150143 non-null  float64
 15  16      150143 non-null  float64
 16  17      150143 non-null  float64
 17  18      150143 non-null  float64
 18  19      150143 non-null  float64
 19  20      150143 non-null  float64
 20  21      150143 non-null  float64
 21  22      15

In [30]:
# Write `df` to a CSV file and trigger a browser download of it (Colab only).
# The 'utf-8-sig' encoding prepends a byte-order mark to the file.
# NOTE(review): this exports `df`, not the combined `df_total` - confirm that
# this is the frame that is meant to be saved.
from google.colab import files

export_name = 'data_clean.csv'
df.to_csv(export_name, encoding='utf-8-sig')
files.download(export_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>