In [1]:
import random
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Model-based limit order book simulation: parameters and initial state.
# The simulation loop further down records one stream row per event.

# Stream representation (one row per event):
#   bid price / ask price / side (1 sell, 0 buy) / action (1 new order, 0 cancellation)
#   / order type (1 market, 0 limit) / relative distance (in ticks) / volume (always 1 here)

# Placeholder first row; simulated events are stacked underneath it.
StreamList = [1] * 7


# ---- Model parameters ----

# Number of price levels and their 1-based labels
N = 20
priceLevels = list(range(1, N + 1))


# Sell side limit order arrival rates: power law ks / distance**alphas,
# with the five closest distances overwritten by explicit values.
ks = 1.92
alphas = 0.52
lambdaSell = np.array([ks / (level ** alphas) for level in priceLevels])
lambdaSell[:5] = [1.85, 1.51, 1.09, 0.88, 0.77]


# Buy side limit order arrival rates: same construction as the sell side.
kb = 1.92
alphab = 0.52
lambdaBuy = np.array([kb / (level ** alphab) for level in priceLevels])
lambdaBuy[:5] = [1.85, 1.51, 1.09, 0.88, 0.77]


# Market order arrival rates (sell and buy side)
gammaSell = 0.94
gammaBuy = 0.94


# Per-order cancellation rates: 0.47 everywhere, except for the explicit
# values at the five closest distances.
thetaSell = np.full(N, 0.47)
thetaSell[:5] = [0.71, 0.81, 0.68, 0.56, 0.47]

thetaBuy = np.full(N, 0.47)
thetaBuy[:5] = [0.71, 0.81, 0.68, 0.56, 0.47]


# ---- Initial state ----

# s[i] < 0 means -s[i] buy orders rest at level i, s[i] > 0 means s[i] sell orders.
# Indices 0 and N+1 are artificial limits (-1 and +1) outside the price range.
s = np.zeros(N + 2)
s[0], s[N + 1] = -1, +1

# Artificial starting book: buy side on the lower half of the levels,
# sell side on the upper half, each shaped like a tent.
for i in priceLevels:
    if i <= N / 4:
        depth = -i
    elif i <= N / 2:
        depth = -(N / 2 - i + 1)
    elif i <= 3 * N / 4:
        depth = i - (N / 2)
    else:
        depth = N - i + 1
    s[i] = depth


#sampleList = []



# Running orders up to a specified time

T = 100    # Final time
t = 0      # Current time

# Rows of the event stream are gathered in a plain list and stacked onto
# StreamList once after the loop: calling np.vstack for every event copies the
# whole array each time, which is quadratic in the number of events.
eventRows = []

while t <= T:
    # Identifying the bid and ask price
    b = -1
    a = N+1

    for i in priceLevels:
        if s[i] < 0:
            b = i          # Bid price: highest level holding buy orders
        if s[N+1-i] > 0:
            a = N+1-i      # Ask price: lowest level holding sell orders

    # Stop when one side of the book is empty. With no bid (b == -1) the sell
    # distances below would run up to N+1 and index past the end of the rate
    # arrays (IndexError); with no ask (a == N+1) a market buy would overwrite
    # the artificial limit s[N+1].
    if b == -1 or a == N+1:
        print("Stopping at t =", t, "because one side of the book is empty")
        break

    buyDistances = range(1, a)         # distance l from the ask -> level a-l
    sellDistances = range(1, N-b+1)    # distance l from the bid -> level b+l

    # Every event that can happen next, in the order used for the selection:
    # (rate, side, new/cancel flag, market/limit flag, distance, level, change of s[level])
    candidateEvents = []

    # New limit buy orders (one more buy order at level a-l)
    for l in buyDistances:
        candidateEvents.append((lambdaBuy[l-1], 0, 1, 0, l, a-l, -1))

    # New limit sell orders (one more sell order at level b+l)
    for l in sellDistances:
        candidateEvents.append((lambdaSell[l-1], 1, 1, 0, l, b+l, +1))

    # Market buy removes one sell order at the ask,
    # market sell removes one buy order at the bid
    candidateEvents.append((gammaBuy, 0, 1, 1, 0, a, -1))
    candidateEvents.append((gammaSell, 1, 1, 1, 0, b, +1))

    # Buy cancellations: rate proportional to the number of buy orders at the level
    for l in buyDistances:
        candidateEvents.append((thetaBuy[l-1] * (-s[a-l]), 0, 0, 0, l, a-l, +1))

    # Sell cancellations: rate proportional to the number of sell orders at the level
    for l in sellDistances:
        candidateEvents.append((thetaSell[l-1] * s[b+l], 1, 0, 0, l, b+l, -1))

    # Computing the rate of the next executed order
    orderExeRate = 0
    for event in candidateEvents:
        orderExeRate = orderExeRate + event[0]

    # Creating the next executed order time (exponential waiting time by
    # inversion of a uniform draw) and updating current time
    U = np.random.uniform(low = 0.0, high = 1.0, size = None)
    V = - math.log(1-U)

    t = t + V / orderExeRate

    # Deciding which event will be realized: the first event whose cumulative
    # rate exceeds a uniform draw on [0, orderExeRate)
    W = np.random.uniform(low = 0.0, high = 1.0, size = None) * orderExeRate

    orderExeRate2 = 0
    for rate, side, isNew, isMarket, distance, level, change in candidateEvents:
        orderExeRate2 = orderExeRate2 + rate
        if orderExeRate2 > W:
            # Updating the state and recording the event together with the
            # bid and ask price observed before the order arrived
            s[level] = s[level] + change
            currentStream = [b, a, side, isNew, isMarket, distance, 1]   # Volume is always 1
            eventRows.append(currentStream)
            break

# Stacking all recorded events under the placeholder first row
StreamList = np.vstack([StreamList] + eventRows)


# print(StreamList)  




## Save the recorded event stream (one row per event) to an xlsx file

filepath = 'my_excel_file.xlsx'

## the array is wrapped in a dataframe so pandas can write it; the index column is left out
df = pd.DataFrame(StreamList)
df.to_excel(filepath, index=False)



In [3]:
# Model-based limit order book simulation: parameters for the replicated runs.
# Same parameter values as in the single-run cell above, repeated here.

# Stream representation (one row per event):
#   bid price / ask price / side (1 sell, 0 buy) / action (1 new order, 0 cancellation)
#   / order type (1 market, 0 limit) / relative distance (in ticks) / volume (always 1 here)

# Placeholder first row; simulated events are stacked underneath it.
StreamList = [1] * 7


# ---- Model parameters ----

# Number of price levels and their 1-based labels
N = 20
priceLevels = list(range(1, N + 1))


# Sell side limit order arrival rates: power law ks / distance**alphas,
# with the five closest distances overwritten by explicit values.
ks = 1.92
alphas = 0.52
lambdaSell = np.array([ks / (level ** alphas) for level in priceLevels])
lambdaSell[:5] = [1.85, 1.51, 1.09, 0.88, 0.77]


# Buy side limit order arrival rates: same construction as the sell side.
kb = 1.92
alphab = 0.52
lambdaBuy = np.array([kb / (level ** alphab) for level in priceLevels])
lambdaBuy[:5] = [1.85, 1.51, 1.09, 0.88, 0.77]


# Market order arrival rates (sell and buy side)
gammaSell = 0.94
gammaBuy = 0.94


# Per-order cancellation rates: 0.47 everywhere, except for the explicit
# values at the five closest distances.
thetaSell = np.full(N, 0.47)
thetaSell[:5] = [0.71, 0.81, 0.68, 0.56, 0.47]

thetaBuy = np.full(N, 0.47)
thetaBuy[:5] = [0.71, 0.81, 0.68, 0.56, 0.47]







# Running replications until enough of them end with both sides of the book populated

RequiredtotalSuccessfulRepNumber = 10
totalSuccessfulRepNumber = 0

BigStreamList = [1, 1, 1, 1, 1, 1, 1]   # Placeholder first row, removed after the loop

# Accepted replications are collected here and stacked once at the end;
# stacking after every replication copies everything gathered so far each time.
acceptedStreams = []

while totalSuccessfulRepNumber < RequiredtotalSuccessfulRepNumber:

    StreamList = [1, 1, 1, 1, 1, 1, 1]   # Placeholder first row of this replication
    eventRows = []                        # Event rows of this replication

    # Initiating current state
    # (s[i] < 0: -s[i] buy orders at level i, s[i] > 0: s[i] sell orders)

    s = np.zeros(N+2)

    # Including -1 and +1 artificial limits outside our price range

    s[0] = -1
    s[N+1] = +1

    # Initiating an artificial instance

    for i in priceLevels:
        if i <= N/4 :
            s[i] = -i
        elif i <= N/2 :
            s[i] = -(N/2 - i + 1)
        elif i <= 3*N/4 :
            s[i] = i-(N/2)
        else:
            s[i] = N-i+1

    # Running orders up to a specified time

    T = 100    # Final time
    t = 0      # Current time

    while t <= T:
        # Identifying the bid and ask price
        b = -1
        a = N+1

        for i in priceLevels:
            if s[i] < 0:
                b = i          # Bid price: highest level holding buy orders
            if s[N+1-i] > 0:
                a = N+1-i      # Ask price: lowest level holding sell orders

        # Leave the loop when one side of the book is empty; the replication
        # is then rejected by the check after the loop
        if b == -1 or a == N+1:
            break

        buyDistances = range(1, a)         # distance l from the ask -> level a-l
        sellDistances = range(1, N-b+1)    # distance l from the bid -> level b+l

        # Every event that can happen next, in the order used for the selection:
        # (rate, side, new/cancel flag, market/limit flag, distance, level, change of s[level])
        candidateEvents = []

        # New limit buy orders (one more buy order at level a-l)
        for l in buyDistances:
            candidateEvents.append((lambdaBuy[l-1], 0, 1, 0, l, a-l, -1))

        # New limit sell orders (one more sell order at level b+l)
        for l in sellDistances:
            candidateEvents.append((lambdaSell[l-1], 1, 1, 0, l, b+l, +1))

        # Market buy removes one sell order at the ask,
        # market sell removes one buy order at the bid
        candidateEvents.append((gammaBuy, 0, 1, 1, 0, a, -1))
        candidateEvents.append((gammaSell, 1, 1, 1, 0, b, +1))

        # Buy cancellations: rate proportional to the number of buy orders at the level
        for l in buyDistances:
            candidateEvents.append((thetaBuy[l-1] * (-s[a-l]), 0, 0, 0, l, a-l, +1))

        # Sell cancellations: rate proportional to the number of sell orders at the level
        for l in sellDistances:
            candidateEvents.append((thetaSell[l-1] * s[b+l], 1, 0, 0, l, b+l, -1))

        # Computing the rate of the next executed order
        orderExeRate = 0
        for event in candidateEvents:
            orderExeRate = orderExeRate + event[0]

        # Creating the next executed order time (exponential waiting time by
        # inversion of a uniform draw) and updating current time
        U = np.random.uniform(low = 0.0, high = 1.0, size = None)
        V = - math.log(1-U)

        t = t + V / orderExeRate

        # Deciding which event will be realized: the first event whose
        # cumulative rate exceeds a uniform draw on [0, orderExeRate)
        W = np.random.uniform(low = 0.0, high = 1.0, size = None) * orderExeRate

        orderExeRate2 = 0
        for rate, side, isNew, isMarket, distance, level, change in candidateEvents:
            orderExeRate2 = orderExeRate2 + rate
            if orderExeRate2 > W:
                # Updating the state and recording the event together with the
                # bid and ask price observed before the order arrived
                s[level] = s[level] + change
                currentStream = [b, a, side, isNew, isMarket, distance, 1]   # Volume is always 1
                eventRows.append(currentStream)
                break

    # Stacking the events of this replication under its placeholder row
    StreamList = np.vstack([StreamList] + eventRows)

    # Lets not include bad cases: only replications that still had a bid and
    # an ask at the last step are kept
    if b > -1 and a < N+1:
        # Dropping the first N**2 rows (the placeholder row and the earliest
        # events) -- presumably a warm-up period; TODO confirm
        StreamList = np.delete(StreamList, slice(0, N**2), axis=0)
        acceptedStreams.append(StreamList)
        totalSuccessfulRepNumber = totalSuccessfulRepNumber + 1

print(totalSuccessfulRepNumber)

# Stacking all accepted replications and removing the placeholder first row
BigStreamList = np.vstack([BigStreamList] + acceptedStreams)
BigStreamList = np.delete(BigStreamList, slice(0, 1), axis=0)

## Save the stacked event rows of all accepted replications to an xlsx file

filepath = 'Data_1.xlsx'

## the array is wrapped in a dataframe so pandas can write it; the index column is left out
df = pd.DataFrame(BigStreamList)
df.to_excel(filepath, index=False)


10


In [4]:
import pandas as pd
  
# Convert the Excel export to CSV: load the workbook into a dataframe ...
read_file = pd.read_excel("Data_1.xlsx")

# ... and write it back out as csv, with the header row and no index column
read_file.to_csv("Data_1.csv", index=None, header=True)
    


In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd
import torch
from torch import optim
from torch.nn import functional
from torch.nn import BatchNorm1d, Dropout, LeakyReLU, Linear, Module, ReLU, Sequential
from sklearn.exceptions import ConvergenceWarning
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils._testing import ignore_warnings

In [22]:
# Load the simulated event stream written by the csv conversion cell
data1 = pd.read_csv('Data_1.csv')

# Concatenating the (single) source frame gives a copy with a fresh 0..n-1 index
df = pd.concat(
    [data1],
    ignore_index=True,
)

In [23]:
# Size and column labels of the loaded data, followed by a preview of the frame
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])
print("Columns Names:", df.columns)
df

Number of rows: 34380
Number of columns: 7
Columns Names: Index(['0', '1', '2', '3', '4', '5', '6'], dtype='object')


Unnamed: 0,0,1,2,3,4,5,6
0,7,8,0,1,0,2,1
1,7,8,1,0,0,6,1
2,7,8,0,0,0,5,1
3,7,8,0,0,0,4,1
4,7,8,1,0,0,1,1
...,...,...,...,...,...,...,...
34375,7,8,0,0,0,7,1
34376,7,8,1,1,0,7,1
34377,7,8,1,1,0,13,1
34378,7,8,1,1,0,10,1


In [24]:
# Discard the events recorded while the spread (ask price in column '1' minus
# bid price in column '0') was 4 ticks or more.

# Output
print("Number of rows before discarding:", df.shape[0])

# Flag the rows to be discarded with one vectorized comparison. Looking rows up
# one by one with df['1'][i] is slow and raises KeyError when this cell is
# re-run, because the index labels of already dropped rows no longer exist.
wideSpread = (df['1'] - df['0']) >= 4

# Index labels of the rows which will be discarded
discard = df.index[wideSpread].tolist()

# Dropping those rows in a single call; the remaining rows keep their labels
df = df.drop(index=discard)

# Output
print("Number of rows after discarding:", df.shape[0])

Number of rows before discarding: 34380
Number of rows after discarding: 33870


In [25]:
# Checking the datatype and the non-null count of each column,
# and the memory used by the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33870 entries, 0 to 34379
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       33870 non-null  int64
 1   1       33870 non-null  int64
 2   2       33870 non-null  int64
 3   3       33870 non-null  int64
 4   4       33870 non-null  int64
 5   5       33870 non-null  int64
 6   6       33870 non-null  int64
dtypes: int64(7)
memory usage: 2.1 MB


In [26]:
# Columns '0' and '1' (the bid and ask price) are not relevant to our study,
# so both are removed in one call
df = df.drop(columns=['0', '1'])

In [27]:
# Value counts of every remaining column; each heading after the first one is
# preceded by an empty line
for position, column in enumerate(['2', '3', '4', '5', '6']):
    heading = "Data Distribution in column " + column + ": "
    if position > 0:
        heading = "\n" + heading
    print(heading)
    print(df[column].value_counts())

Data Distribution in column 2: 
0    16978
1    16892
Name: 2, dtype: int64

Data Distribution in column 3: 
1    18565
0    15305
Name: 3, dtype: int64

Data Distribution in column 4: 
0    32197
1     1673
Name: 4, dtype: int64

Data Distribution in column 5: 
1     5521
2     5194
3     3979
4     3129
5     2589
6     2483
7     2149
8     1907
0     1673
9     1553
10    1155
11     942
12     619
13     433
14     247
15     147
16      78
17      54
18      16
19       2
Name: 5, dtype: int64

Data Distribution in column 6: 
1    33870
Name: 6, dtype: int64


In [28]:
# Value counts of every remaining column; each heading after the first one is
# preceded by an empty line
for position, column in enumerate(['2', '3', '4', '5', '6']):
    heading = "Data Distribution in column " + column + ": "
    if position > 0:
        heading = "\n" + heading
    print(heading)
    print(df[column].value_counts())

# Counts per category of the three binary columns
realCol2 = df['2'].value_counts()
realCol3 = df['3'].value_counts()
realCol4 = df['4'].value_counts()

# Share of rows whose flag is 0 in each binary column. `.get(label, 0)` counts
# a category that never occurs as zero instead of raising KeyError.
realCol2Ratio = realCol2.get(0, 0) / ( realCol2.get(0, 0) + realCol2.get(1, 0) )
realCol3Ratio = realCol3.get(0, 0) / ( realCol3.get(0, 0) + realCol3.get(1, 0) )
realCol4Ratio = realCol4.get(0, 0) / ( realCol4.get(0, 0) + realCol4.get(1, 0) )


# Flag 0 means buy in column 2, cancellation in column 3 and limit order in
# column 4 (see the stream representation), so the column 3 ratio is the
# share of cancellations, not of new orders.
print("Column 2 (Buy): Real Ratio:", realCol2Ratio)
print("Column 3 (Cancel (not new)):  Real Ratio: ", realCol3Ratio)
print("Column 4 (Limit): Real Ratio: ", realCol4Ratio)

Data Distribution in column 2: 
0    16978
1    16892
Name: 2, dtype: int64

Data Distribution in column 3: 
1    18565
0    15305
Name: 3, dtype: int64

Data Distribution in column 4: 
0    32197
1     1673
Name: 4, dtype: int64

Data Distribution in column 5: 
1     5521
2     5194
3     3979
4     3129
5     2589
6     2483
7     2149
8     1907
0     1673
9     1553
10    1155
11     942
12     619
13     433
14     247
15     147
16      78
17      54
18      16
19       2
Name: 5, dtype: int64

Data Distribution in column 6: 
1    33870
Name: 6, dtype: int64
Column 2 (Buy): Real Ratio: 0.501269560082669
Column 3 (New (not cancel)):  Real Ratio:  0.45187481547091823
Column 4 (Limit): Real Ratio:  0.9506052553882491


In [29]:
class ConditionalGenerator(object):
    """Samples one-hot condition vectors over the stand-alone softmax columns.

    ``output_info`` is a sequence of ``(width, activation)`` items describing
    consecutive column groups of ``data``. A ``'tanh'`` item and the
    ``'softmax'`` item right after it are skipped; every other ``'softmax'``
    item is treated as one discrete column whose categories are the columns of
    its group.

    Attributes set by the constructor:
        model: per discrete column, the argmax category of every data row.
        p: array (number of discrete columns, widest group) with the category
            probabilities of each discrete column, zero padded on the right.
        interval: array of ``(offset, width)`` per discrete column, where
            offset counts categories of the preceding discrete columns.
        n_col: number of discrete columns.
        n_opt: total number of categories over all discrete columns.
    """

    def __init__(self, data, output_info, log_frequency):
        """Scan ``output_info`` and estimate category probabilities from ``data``.

        Args:
            data: 2-D array whose column count equals the sum of all widths.
            output_info: sequence of ``(width, activation)`` items.
            log_frequency: if true, probabilities are proportional to
                ``log(count + 1)`` instead of the raw category counts.
        """
        self.model = []

        # Locate the stand-alone softmax groups and store the category picked
        # by every row.
        spans = []
        cursor = 0
        after_tanh = False
        for item in output_info:
            width, activation = item[0], item[1]
            if activation == 'tanh':
                after_tanh = True
            elif activation == 'softmax':
                if after_tanh:
                    # softmax paired with the preceding tanh item: not a discrete column
                    after_tanh = False
                else:
                    self.model.append(
                        np.argmax(data[:, cursor:cursor + width], axis=-1))
                    spans.append((cursor, width))
            else:
                assert 0
            cursor += width

        assert cursor == data.shape[1]

        # Category probabilities of each discrete column, one row per column.
        self.p = np.zeros(
            (len(spans), max([width for _, width in spans], default=0)))
        self.interval = []
        self.n_opt = 0
        for column, (offset, width) in enumerate(spans):
            frequency = np.sum(data[:, offset:offset + width], axis=0)
            if log_frequency:
                frequency = np.log(frequency + 1)
            self.p[column, :width] = frequency / np.sum(frequency)
            self.interval.append((self.n_opt, width))
            self.n_opt += width
        self.n_col = len(spans)

        self.interval = np.asarray(self.interval)

    def random_choice_prob_index(self, idx):
        """Draw one category per entry of ``idx`` from the rows ``self.p[idx]``.

        A uniform number is drawn per row and the first category whose
        cumulative probability exceeds it is returned.
        """
        probabilities = self.p[idx]
        thresholds = np.random.rand(probabilities.shape[0])[:, np.newaxis]
        return np.argmax(np.cumsum(probabilities, axis=1) > thresholds, axis=1)

    def sample(self, batch):
        """Sample ``batch`` conditions using the stored probabilities ``self.p``.

        Returns:
            ``None`` if there is no discrete column, otherwise a tuple
            ``(vec, mask, column, category)``: the one-hot condition vectors of
            shape (batch, n_opt), the one-hot column masks of shape
            (batch, n_col), the chosen column per row and the chosen category
            within that column per row.
        """
        if self.n_col == 0:
            return None

        rows = np.arange(batch)
        column = np.random.choice(np.arange(self.n_col), batch)

        mask = np.zeros((batch, self.n_col), dtype='float32')
        mask[rows, column] = 1

        category = self.random_choice_prob_index(column)

        vec = np.zeros((batch, self.n_opt), dtype='float32')
        vec[rows, self.interval[column, 0] + category] = 1

        return vec, mask, column, category

    def sample_zero(self, batch):
        """Sample ``batch`` one-hot conditions from the categories seen in the data.

        For each row a discrete column is chosen uniformly and its category is
        copied from a uniformly chosen data row. Returns ``None`` if there is
        no discrete column.
        """
        if self.n_col == 0:
            return None

        vec = np.zeros((batch, self.n_opt), dtype='float32')
        columns = np.random.choice(np.arange(self.n_col), batch)
        for row, col in enumerate(columns[:batch]):
            category = int(np.random.choice(self.model[col]))
            vec[row, category + self.interval[col, 0]] = 1

        return vec

class Discriminator(Module):
    """Fully connected critic that scores groups of ``pack`` rows at once.

    Each hidden layer is ``Linear -> LeakyReLU(0.2) -> Dropout(0.5)``; the last
    layer maps to a single score per group.
    """

    def __init__(self, input_dim, dis_dims, pack=10):
        """Build the network.

        Args:
            input_dim: number of features of a single row.
            dis_dims: sizes of the hidden layers, in order.
            pack: number of rows concatenated into one network input.
        """
        super().__init__()
        self.pack = pack
        self.packdim = input_dim * pack

        layers = []
        width = self.packdim
        for hidden in list(dis_dims):
            layers.extend([Linear(width, hidden), LeakyReLU(0.2), Dropout(0.5)])
            width = hidden
        layers.append(Linear(width, 1))

        self.seq = Sequential(*layers)

    def forward(self, input):
        """Score ``input``; its number of rows must be a multiple of ``self.pack``."""
        assert input.size()[0] % self.pack == 0
        packed = input.view(-1, self.packdim)
        return self.seq(packed)

    def calc_gradient_penalty(self, real_data, fake_data, device='cpu', pac=10, lambda_=10):
        """Return ``lambda_`` times the mean squared deviation from 1 of the gradient norm.

        The gradient of the critic output is taken with respect to random
        interpolations between ``real_data`` and ``fake_data``; one mixing
        weight is drawn per group of ``pac`` rows. ``pac`` is a separate
        argument and is not read from ``self.pack``.
        """
        n_features = real_data.size(1)

        # One mixing weight per group, repeated over the rows and features of the group
        alpha = torch.rand(real_data.size(0) // pac, 1, 1, device=device)
        alpha = alpha.repeat(1, pac, n_features).view(-1, n_features)

        interpolates = alpha * real_data + ((1 - alpha) * fake_data)
        disc_interpolates = self(interpolates)

        gradients, = torch.autograd.grad(
            outputs=disc_interpolates,
            inputs=interpolates,
            grad_outputs=torch.ones(disc_interpolates.size(), device=device),
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
        )

        # Gradient norm per group of ``pac`` rows
        group_norms = gradients.view(-1, pac * n_features).norm(2, dim=1)
        gradient_penalty = ((group_norms - 1) ** 2).mean() * lambda_

        return gradient_penalty


class Residual(Module):
    """``Linear -> BatchNorm1d -> ReLU`` block whose output is concatenated with its input."""

    def __init__(self, i, o):
        """Create the block for ``i`` input features and ``o`` new features."""
        super().__init__()
        self.fc = Linear(i, o)
        self.bn = BatchNorm1d(o)
        self.relu = ReLU()

    def forward(self, input):
        """Return the ``o`` new features followed by the unchanged ``input`` (``o + i`` columns)."""
        transformed = self.relu(self.bn(self.fc(input)))
        return torch.cat([transformed, input], dim=1)


class Generator(Module):
    """Stack of ``Residual`` blocks followed by a ``Linear`` output layer."""

    def __init__(self, embedding_dim, gen_dims, data_dim):
        """Build the network.

        Args:
            embedding_dim: number of input features.
            gen_dims: number of features added by each ``Residual`` block, in order.
            data_dim: number of output features.
        """
        super().__init__()

        blocks = []
        width = embedding_dim
        for added in list(gen_dims):
            blocks.append(Residual(width, added))
            # a Residual block outputs its input next to the new features
            width += added
        blocks.append(Linear(width, data_dim))

        self.seq = Sequential(*blocks)

    def forward(self, input):
        """Run ``input`` through the whole stack and return the result."""
        return self.seq(input)

class Sampler(object):
    """Draws rows of ``data``, optionally restricted to a category of a discrete column.

    ``output_info`` is a sequence of ``(width, activation)`` items describing
    consecutive column groups of ``data``. A ``'tanh'`` item and the
    ``'softmax'`` item right after it are skipped; every other ``'softmax'``
    item is a discrete column. ``self.model[c][o]`` holds the indices of the
    rows with a non-zero entry for category ``o`` of discrete column ``c``.
    """

    def __init__(self, data, output_info):
        """Index the rows of ``data`` by discrete column and category."""
        super(Sampler, self).__init__()
        self.data = data
        self.n = len(data)
        self.model = []

        offset = 0
        after_tanh = False
        for item in output_info:
            width, activation = item[0], item[1]
            if activation == 'tanh':
                after_tanh = True
            elif activation == 'softmax':
                if after_tanh:
                    # softmax paired with the preceding tanh item: not a discrete column
                    after_tanh = False
                else:
                    self.model.append(
                        [np.nonzero(data[:, offset + j])[0] for j in range(width)])
            else:
                assert 0
            offset += width

        assert offset == data.shape[1]

    def sample(self, n, col, opt):
        """Return sampled rows of ``data``.

        If ``col`` is ``None``, ``n`` rows are drawn uniformly with
        replacement. Otherwise one row is drawn for every pair of ``col`` and
        ``opt``, uniformly among the rows indexed under category ``opt[k]`` of
        discrete column ``col[k]``; ``n`` is not used in that case.
        """
        if col is None:
            return self.data[np.random.choice(np.arange(self.n), n)]

        chosen = [np.random.choice(self.model[c][o]) for c, o in zip(col, opt)]
        return self.data[chosen]

class DataTransformer(object):
    """Encode tabular data for the GAN and decode generated rows back.

    Continuous columns are modelled with a Bayesian Gaussian mixture and encoded
    as one scalar (``tanh`` slot) followed by a one-hot component indicator
    (``softmax`` slot). Discrete columns are one-hot encoded (``softmax`` slot).

    Args:
        n_clusters: number of mixture components fitted per continuous column.
        epsilon: a mixture component is kept only if its fitted weight is
            strictly greater than this value.
    """

    def __init__(self, n_clusters=10, epsilon=0.005):
        self.n_clusters = n_clusters
        self.epsilon = epsilon

    @ignore_warnings(category=ConvergenceWarning)
    def _fit_continuous(self, column, data):
        """Fit the mixture model of one continuous column and return its metadata."""
        gm = BayesianGaussianMixture(
            self.n_clusters,
            weight_concentration_prior_type='dirichlet_process',
            weight_concentration_prior=0.001,
            n_init=1
        )
        gm.fit(data)
        # Boolean mask over the n_clusters components that carry enough weight.
        components = gm.weights_ > self.epsilon
        num_components = components.sum()

        return {
            'name': column,
            'model': gm,
            'components': components,
            'output_info': [(1, 'tanh'), (num_components, 'softmax')],
            'output_dimensions': 1 + num_components,
        }

    def _fit_discrete(self, column, data):
        """Fit a one-hot encoder for one discrete column and return its metadata."""
        # NOTE(review): `sparse=` was renamed `sparse_output=` in newer
        # scikit-learn releases; confirm against the installed version.
        ohe = OneHotEncoder(sparse=False)
        ohe.fit(data)
        categories = len(ohe.categories_[0])

        return {
            'name': column,
            'encoder': ohe,
            'output_info': [(categories, 'softmax')],
            'output_dimensions': categories
        }

    def fit(self, data, discrete_columns=tuple()):
        """Fit one encoder per column.

        Args:
            data: ``pandas.DataFrame`` or anything ``pandas.DataFrame`` accepts.
            discrete_columns: column labels to one-hot encode; every other
                column is treated as continuous.

        Sets ``output_info``, ``output_dimensions``, ``meta`` and ``dataframe``
        (whether the input was already a DataFrame).
        """
        self.output_info = []
        self.output_dimensions = 0

        if not isinstance(data, pd.DataFrame):
            self.dataframe = False
            data = pd.DataFrame(data)
        else:
            self.dataframe = True

        self.meta = []
        for column in data.columns:
            column_data = data[[column]].values
            if column in discrete_columns:
                meta = self._fit_discrete(column, column_data)
            else:
                meta = self._fit_continuous(column, column_data)

            self.output_info += meta['output_info']
            self.output_dimensions += meta['output_dimensions']
            self.meta.append(meta)

    def _transform_continuous(self, column_meta, data):
        """Encode one continuous column as ``[scalar, one-hot component]``.

        For every row a mixture component is sampled according to the model's
        posterior (restricted to the kept components); the value is normalised
        with that component's mean and ``4 * std`` and clipped to +/-0.99.
        """
        components = column_meta['components']
        model = column_meta['model']

        means = model.means_.reshape((1, self.n_clusters))
        stds = np.sqrt(model.covariances_).reshape((1, self.n_clusters))
        features = (data - means) / (4 * stds)

        probs = model.predict_proba(data)

        n_opts = components.sum()
        features = features[:, components]
        probs = probs[:, components]

        opt_sel = np.zeros(len(data), dtype='int')
        for i in range(len(data)):
            # Small offset keeps the distribution valid when all probs are ~0.
            pp = probs[i] + 1e-6
            pp = pp / pp.sum()
            opt_sel[i] = np.random.choice(np.arange(n_opts), p=pp)

        idx = np.arange((len(features)))
        features = features[idx, opt_sel].reshape([-1, 1])
        features = np.clip(features, -.99, .99)

        probs_onehot = np.zeros_like(probs)
        probs_onehot[np.arange(len(probs)), opt_sel] = 1
        return [features, probs_onehot]

    def _transform_discrete(self, column_meta, data):
        """One-hot encode one discrete column with its fitted encoder."""
        encoder = column_meta['encoder']
        return encoder.transform(data)

    def transform(self, data):
        """Encode ``data`` column by column and return one float matrix."""
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame(data)

        values = []
        for meta in self.meta:
            column_data = data[[meta['name']]].values
            if 'model' in meta:
                values += self._transform_continuous(meta, column_data)
            else:
                values.append(self._transform_discrete(meta, column_data))

        return np.concatenate(values, axis=1).astype(float)

    def _inverse_transform_continuous(self, meta, data, sigma):
        """Decode one continuous column from ``[scalar, component scores]``.

        If ``sigma`` is given, Gaussian noise with that scale is added to the
        scalar before it is clipped to [-1, 1] and de-normalised.
        """
        model = meta['model']
        components = meta['components']

        u = data[:, 0]
        v = data[:, 1:]

        if sigma is not None:
            u = np.random.normal(u, sigma)

        u = np.clip(u, -1, 1)
        # Components that were dropped at fit time get a very low score so that
        # argmax can never select them.
        v_t = np.ones((len(data), self.n_clusters)) * -100
        v_t[:, components] = v
        v = v_t
        means = model.means_.reshape([-1])
        stds = np.sqrt(model.covariances_).reshape([-1])
        p_argmax = np.argmax(v, axis=1)
        std_t = stds[p_argmax]
        mean_t = means[p_argmax]
        column = u * 4 * std_t + mean_t

        return column

    def _inverse_transform_discrete(self, meta, data):
        """Decode one discrete column with its fitted one-hot encoder."""
        encoder = meta['encoder']
        return encoder.inverse_transform(data)

    def _drop_invalid_orders(self, output):
        """Remove rows with column '3' == 0, column '4' == 1 and column '2' in {0, 1}.

        Going by the notebook's stream description these look like
        cancellations (3 == 0) of market orders (4 == 1) -- TODO confirm.
        The filter is skipped when ``output`` is not a DataFrame or does not
        contain all three columns, instead of raising.
        """
        if not isinstance(output, pd.DataFrame):
            return output
        if not all(name in output.columns for name in ('2', '3', '4')):
            return output

        invalid = (
            output['2'].isin([0, 1])
            & (output['3'] == 0)
            & (output['4'] == 1)
        )
        # One vectorised selection instead of dropping the rows one by one;
        # the surviving rows keep their original index labels.
        return output.loc[~invalid]

    def inverse_transform(self, data, sigmas):
        """Decode a generated matrix back to the original column layout.

        Args:
            data: 2-D array laid out like the output of ``transform``.
            sigmas: optional sequence indexed by encoded-column offset that
                gives the noise scale for continuous columns; ``None`` or an
                empty sequence disables the noise.

        Returns:
            A DataFrame (when ``fit`` received a DataFrame) or an ndarray, with
            the rows matched by ``_drop_invalid_orders`` removed when the
            columns '2', '3' and '4' are available.
        """
        # Explicit emptiness test so that a numpy array of sigmas works too.
        use_sigmas = sigmas is not None and len(sigmas) > 0

        start = 0
        output = []
        column_names = []
        for meta in self.meta:
            dimensions = meta['output_dimensions']
            columns_data = data[:, start:start + dimensions]

            if 'model' in meta:
                sigma = sigmas[start] if use_sigmas else None
                inverted = self._inverse_transform_continuous(meta, columns_data, sigma)
            else:
                inverted = self._inverse_transform_discrete(meta, columns_data)

            output.append(inverted)
            column_names.append(meta['name'])
            start += dimensions

        output = np.column_stack(output)
        if self.dataframe:
            output = pd.DataFrame(output, columns=column_names)

        return self._drop_invalid_orders(output)

class PSGANS(object):
    """Conditional GAN synthesizer for tabular data.

    Args:
        embedding_dim: size of the noise vector fed to the generator.
        gen_dim: output sizes of the generator's residual blocks.
        dis_dim: layer sizes passed to the discriminator.
        l2scale: weight decay used by the generator optimizer.
        batch_size: rows per training / sampling batch (must be even).
    """

    def __init__(self, embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256),
                 l2scale=1e-6, batch_size=500):

        self.embedding_dim = embedding_dim
        self.gen_dim = gen_dim
        self.dis_dim = dis_dim

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.trained_epoches = 0

    def _apply_activate(self, data):
        """Apply tanh / gumbel-softmax to each slot described by ``output_info``."""
        data_t = []
        st = 0
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                ed = st + item[0]
                data_t.append(torch.tanh(data[:, st:ed]))
                st = ed
            elif item[1] == 'softmax':
                ed = st + item[0]
                data_t.append(functional.gumbel_softmax(data[:, st:ed], tau=0.2))
                st = ed
            else:
                assert 0

        return torch.cat(data_t, dim=1)

    def _cond_loss(self, data, c, m):
        """Cross-entropy between generated discrete slots and the condition.

        Args:
            data: raw generator output (logits), laid out per ``output_info``.
            c: condition vector; its columns cover only the softmax slots that
                do not directly follow a tanh slot.
            m: per-row mask with one column per such slot.

        Returns:
            Masked cross-entropy summed over slots and averaged over the batch.
        """
        loss = []
        st = 0
        st_c = 0
        skip = False
        for item in self.transformer.output_info:
            if item[1] == 'tanh':
                st += item[0]
                skip = True

            elif item[1] == 'softmax':
                if skip:
                    # Softmax slot paired with a tanh slot: not conditioned on.
                    skip = False
                    st += item[0]
                    continue

                ed = st + item[0]
                ed_c = st_c + item[0]
                tmp = functional.cross_entropy(
                    data[:, st:ed],
                    torch.argmax(c[:, st_c:ed_c], dim=1),
                    reduction='none'
                )
                loss.append(tmp)
                st = ed
                st_c = ed_c

            else:
                assert 0

        loss = torch.stack(loss, dim=1)

        return (loss * m).sum() / data.size()[0]

    def _draw_condition(self):
        """Sample a conditioning batch and move its tensors to ``self.device``.

        Returns:
            ``(c1, m1, col, opt)`` with ``c1``/``m1`` as tensors, or four
            ``None`` values when the conditional generator returns ``None``.
        """
        condvec = self.cond_generator.sample(self.batch_size)
        if condvec is None:
            return None, None, None, None

        c1, m1, col, opt = condvec
        c1 = torch.from_numpy(c1).to(self.device)
        m1 = torch.from_numpy(m1).to(self.device)
        return c1, m1, col, opt

    def fit(self, train_data, discrete_columns=tuple(), epochs=300, log_frequency=True):
        """Train the generator and discriminator on ``train_data``.

        The transformer, networks and optimizers are created on the first call
        only, so calling ``fit`` again continues training the existing model.

        Args:
            train_data: training table (DataFrame or array-like).
            discrete_columns: labels of the columns to treat as discrete.
            epochs: number of passes to run in this call.
            log_frequency: forwarded to ``ConditionalGenerator``.
        """
        if not hasattr(self, "transformer"):
            self.transformer = DataTransformer()
            self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)

        data_sampler = Sampler(train_data, self.transformer.output_info)

        data_dim = self.transformer.output_dimensions

        if not hasattr(self, "cond_generator"):
            self.cond_generator = ConditionalGenerator(
                train_data,
                self.transformer.output_info,
                log_frequency
            )

        if not hasattr(self, "generator"):
            self.generator = Generator(
                self.embedding_dim + self.cond_generator.n_opt,
                self.gen_dim,
                data_dim
            ).to(self.device)

        if not hasattr(self, "discriminator"):
            self.discriminator = Discriminator(
                data_dim + self.cond_generator.n_opt,
                self.dis_dim
            ).to(self.device)

        if not hasattr(self, "optimizerG"):
            self.optimizerG = optim.Adam(
                self.generator.parameters(), lr=2e-4, betas=(0.5, 0.9),
                weight_decay=self.l2scale
            )

        if not hasattr(self, "optimizerD"):
            self.optimizerD = optim.Adam(
                self.discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9))

        assert self.batch_size % 2 == 0
        mean = torch.zeros(self.batch_size, self.embedding_dim, device=self.device)
        std = mean + 1

        steps_per_epoch = max(len(train_data) // self.batch_size, 1)
        for _ in range(epochs):
            self.trained_epoches += 1
            for _step in range(steps_per_epoch):
                # ---- discriminator update ----
                fakez = torch.normal(mean=mean, std=std)

                c1, m1, col, opt = self._draw_condition()
                if c1 is None:
                    real = data_sampler.sample(self.batch_size, col, opt)
                else:
                    fakez = torch.cat([fakez, c1], dim=1)

                    # Real rows are drawn for a shuffled copy of the conditions.
                    perm = np.arange(self.batch_size)
                    np.random.shuffle(perm)
                    real = data_sampler.sample(self.batch_size, col[perm], opt[perm])
                    c2 = c1[perm]

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                real = torch.from_numpy(real.astype('float32')).to(self.device)

                if c1 is not None:
                    fake_cat = torch.cat([fakeact, c1], dim=1)
                    real_cat = torch.cat([real, c2], dim=1)
                else:
                    real_cat = real
                    # The discriminator must see the activated sample, exactly
                    # as in the conditional branch and in the generator step
                    # (the original passed the raw generator output here).
                    fake_cat = fakeact

                y_fake = self.discriminator(fake_cat)
                y_real = self.discriminator(real_cat)

                pen = self.discriminator.calc_gradient_penalty(
                    real_cat, fake_cat, self.device)
                loss_d = -(torch.mean(y_real) - torch.mean(y_fake))

                self.optimizerD.zero_grad()
                pen.backward(retain_graph=True)
                loss_d.backward()
                self.optimizerD.step()

                # ---- generator update ----
                fakez = torch.normal(mean=mean, std=std)
                c1, m1, col, opt = self._draw_condition()
                if c1 is not None:
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)

                if c1 is not None:
                    y_fake = self.discriminator(torch.cat([fakeact, c1], dim=1))
                    cross_entropy = self._cond_loss(fake, c1, m1)
                else:
                    y_fake = self.discriminator(fakeact)
                    cross_entropy = 0

                loss_g = -torch.mean(y_fake) + cross_entropy

                self.optimizerG.zero_grad()
                loss_g.backward()
                self.optimizerG.step()

            # Losses of the last step of the epoch.
            print("Epoch %d, Loss Generator: %.4f, Loss Discriminator: %.4f" %
                  (self.trained_epoches, loss_g.item(), loss_d.item()), flush=True)

    def sample(self, n, condition_column=None, condition_value=None):
        """Generate ``n`` rows and decode them with the fitted transformer.

        Args:
            n: number of rows to generate before the transformer's filtering.
            condition_column: optional column to condition on.
            condition_value: value of ``condition_column`` to condition on.

        Returns:
            The result of ``self.transformer.inverse_transform`` on the
            generated rows.

        NOTE(review): conditional sampling calls
        ``transformer.covert_column_name_value_to_id`` (sic); the
        ``DataTransformer`` visible in this notebook does not define such a
        method, so passing both condition arguments is expected to raise
        ``AttributeError`` -- confirm before relying on it.
        """
        if condition_column is not None and condition_value is not None:
            condition_info = self.transformer.covert_column_name_value_to_id(
                condition_column, condition_value)
            global_condition_vec = self.cond_generator.generate_cond_from_condition_column_info(
                condition_info, self.batch_size)
        else:
            global_condition_vec = None

        # ceil(n / batch_size) batches are enough; always run at least one so
        # that n == 0 still yields an empty, correctly shaped matrix.
        steps = max(-(-n // self.batch_size), 1)

        mean = torch.zeros(self.batch_size, self.embedding_dim)
        std = mean + 1

        data = []
        # No gradients are needed while sampling.
        with torch.no_grad():
            for _ in range(steps):
                fakez = torch.normal(mean=mean, std=std).to(self.device)

                if global_condition_vec is not None:
                    condvec = global_condition_vec.copy()
                else:
                    condvec = self.cond_generator.sample_zero(self.batch_size)

                if condvec is not None:
                    c1 = torch.from_numpy(condvec).to(self.device)
                    fakez = torch.cat([fakez, c1], dim=1)

                fake = self.generator(fakez)
                fakeact = self._apply_activate(fake)
                data.append(fakeact.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:n]

        return self.transformer.inverse_transform(data, None)

    def save(self, path):
        """Pickle the whole model to ``path`` with its networks moved to CPU.

        The networks are moved back to the original device afterwards.
        """
        assert hasattr(self, "generator")
        assert hasattr(self, "discriminator")
        assert hasattr(self, "transformer")

        device_bak = self.device
        self.device = torch.device("cpu")
        self.generator.to(self.device)
        self.discriminator.to(self.device)

        torch.save(self, path)

        self.device = device_bak
        self.generator.to(self.device)
        self.discriminator.to(self.device)

    @classmethod
    def load(cls, path):
        """Load a model written by ``save`` and move it to the available device.

        SECURITY: ``torch.load`` unpickles the file, which can execute arbitrary
        code; only load checkpoints from trusted sources.
        """
        model = torch.load(path)
        model.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model.generator.to(model.device)
        model.discriminator.to(model.device)
        return model

In [30]:
# Every column of the dataframe is treated as a discrete column
discrete_cols = df.columns.values
# Build a PSGANS synthesizer with its default hyperparameters
model = PSGANS()
# Train the synthesizer for 100 epochs
model.fit(df, discrete_columns=discrete_cols, epochs=100)

Epoch 1, Loss Generator: 0.5926, Loss Discriminator: -0.0462
Epoch 2, Loss Generator: 0.4152, Loss Discriminator: -0.0308
Epoch 3, Loss Generator: 0.1879, Loss Discriminator: -0.0324
Epoch 4, Loss Generator: 0.1707, Loss Discriminator: -0.0375
Epoch 5, Loss Generator: 0.1892, Loss Discriminator: 0.0205
Epoch 6, Loss Generator: 0.0555, Loss Discriminator: -0.1240
Epoch 7, Loss Generator: 0.0510, Loss Discriminator: -0.0306
Epoch 8, Loss Generator: -0.1899, Loss Discriminator: -0.0104
Epoch 9, Loss Generator: -0.1805, Loss Discriminator: -0.1138
Epoch 10, Loss Generator: -0.2340, Loss Discriminator: -0.0835
Epoch 11, Loss Generator: -0.1612, Loss Discriminator: -0.0323
Epoch 12, Loss Generator: -0.2830, Loss Discriminator: -0.0160
Epoch 13, Loss Generator: -0.2712, Loss Discriminator: -0.0053
Epoch 14, Loss Generator: -0.4481, Loss Discriminator: -0.0989
Epoch 15, Loss Generator: -0.3588, Loss Discriminator: -0.0550
Epoch 16, Loss Generator: -0.6495, Loss Discriminator: 0.0794
Epoch 17, 

In [31]:
# Generating 1000 Fake Data
samples = model.sample(1000)  # Change the parameter to get desired number of fake data

# Saving the generated (fake) data into a csv file.
# The original cell wrote `df` (the real training data) to Fake_Data.csv.
samples.to_csv('Fake_Data.csv')

In [32]:
# Rows whose columns 2, 3, 4 hold the combination "1 0 1"
samples.loc[samples['2'].eq(1) & samples['3'].eq(0) & samples['4'].eq(1)]

Unnamed: 0,2,3,4,5,6


In [33]:
# Rows whose columns 2, 3, 4 hold the combination "0 0 1"
samples.loc[samples['2'].eq(0) & samples['3'].eq(0) & samples['4'].eq(1)]

Unnamed: 0,2,3,4,5,6


In [34]:
# Value counts of every generated column, one header + table per column
for col_position, col_name in enumerate(['2', '3', '4', '5', '6']):
    header = "Fake Data Distribution in column " + col_name + ": "
    # Every header except the first one is preceded by a blank line.
    print(header if col_position == 0 else "\n" + header)
    print(samples[col_name].value_counts())

Fake Data Distribution in column 2: 
0    522
1    473
Name: 2, dtype: int64

Fake Data Distribution in column 3: 
1    656
0    339
Name: 3, dtype: int64

Fake Data Distribution in column 4: 
0    885
1    110
Name: 4, dtype: int64

Fake Data Distribution in column 5: 
1     127
3     116
5     108
0      96
6      95
2      94
4      79
7      59
8      54
11     43
9      39
10     35
12     20
13     13
14     12
15      3
19      1
16      1
Name: 5, dtype: int64

Fake Data Distribution in column 6: 
1    995
Name: 6, dtype: int64


In [35]:
# Value counts of every real-data column, one header + table per column
for col_position, col_name in enumerate(['2', '3', '4', '5', '6']):
    header = "Data Distribution in column " + col_name + ": "
    # Every header except the first one is preceded by a blank line.
    print(header if col_position == 0 else "\n" + header)
    print(df[col_name].value_counts())

Data Distribution in column 2: 
0    16978
1    16892
Name: 2, dtype: int64

Data Distribution in column 3: 
1    18565
0    15305
Name: 3, dtype: int64

Data Distribution in column 4: 
0    32197
1     1673
Name: 4, dtype: int64

Data Distribution in column 5: 
1     5521
2     5194
3     3979
4     3129
5     2589
6     2483
7     2149
8     1907
0     1673
9     1553
10    1155
11     942
12     619
13     433
14     247
15     147
16      78
17      54
18      16
19       2
Name: 5, dtype: int64

Data Distribution in column 6: 
1    33870
Name: 6, dtype: int64


In [36]:
def zero_share(counts):
    """Return count(0) / (count(0) + count(1)) for a ``value_counts`` Series.

    A value missing from ``counts`` is treated as a count of 0 instead of
    raising ``KeyError``; NaN is returned when neither value is present.
    """
    zeros = counts.get(0, 0)
    ones = counts.get(1, 0)
    total = zeros + ones
    if total == 0:
        return float('nan')
    return zeros / total


# Value counts of the binary columns in the generated and the real data
fakeCol2 = samples['2'].value_counts()
fakeCol3 = samples['3'].value_counts()
fakeCol4 = samples['4'].value_counts()

realCol2 = df['2'].value_counts()
realCol3 = df['3'].value_counts()
realCol4 = df['4'].value_counts()

# Share of the value 0 in each column
fakeCol2Ratio = zero_share(fakeCol2)
fakeCol3Ratio = zero_share(fakeCol3)
fakeCol4Ratio = zero_share(fakeCol4)

realCol2Ratio = zero_share(realCol2)
realCol3Ratio = zero_share(realCol3)
realCol4Ratio = zero_share(realCol4)


print("Column 2 (Buy): Fake Ratio: ", fakeCol2Ratio, "Real Ratio: ", realCol2Ratio)
# NOTE(review): the printed number is the share of 0 in column 3; per the
# stream description at the top of the notebook 0 means "Cancel", so the
# "New (not cancel)" label looks inverted -- confirm the intended wording.
print("Column 3 (New (not cancel)): Fake Ratio: ", fakeCol3Ratio, "Real Ratio: ", realCol3Ratio)
print("Column 4 (Limit): Fake Ratio: ", fakeCol4Ratio, "Real Ratio: ", realCol4Ratio)

Column 2 (Buy): Fake Ratio:  0.5246231155778894 Real Ratio:  0.501269560082669
Column 3 (New (not cancel)): Fake Ratio:  0.3407035175879397 Real Ratio:  0.45187481547091823
Column 4 (Limit): Fake Ratio:  0.8894472361809045 Real Ratio:  0.9506052553882491
