In [1]:
import re
from decimal import Decimal

import pandas as pd
from ctgan import CTGAN
from sklearn.preprocessing import LabelEncoder

# Load the training table from a CSV file in the current working directory.
train = pd.read_csv(filepath_or_buffer='./train.csv')

In [2]:
# X is another name for the same DataFrame object as `train` (no copy is made);
# Y is its 'Education' column.
# NOTE(review): neither name is used again in the visible cells — confirm they are still needed.
X = train
Y = train['Education']

In [3]:
def string_to_number(x):
    """Convert an amount written with a unit word into a plain number.

    The first numeric token in the string is scaled by the unit it is
    written with: ``'Crore'`` (x 10,000,000), ``'Lac'`` (x 100,000),
    ``'Thou'`` (x 1,000) or ``'Hund'`` (x 100). A string without any of
    these words is taken at face value. Thousands separators (``','``) are
    ignored and a decimal point is honoured, so ``'1.5 Crore'`` becomes
    ``15000000`` rather than being read as 15 Crore.

    Args:
        x: The value to convert. Anything that is not a ``str`` is returned
            unchanged.

    Returns:
        An ``int`` when the scaled amount is a whole number, otherwise a
        ``float``; or ``x`` itself when it is not a string.

    Raises:
        ValueError: If ``x`` is a string that contains no digits.
    """
    if not isinstance(x, str):
        return x

    # Strip thousands separators first so '1,00,000' is read as one number.
    number_match = re.search(r'\d+(?:\.\d+)?', x.replace(',', ''))
    if number_match is None:
        raise ValueError(f'no numeric value found in {x!r}')

    # The first unit word found wins; the order is largest to smallest.
    unit_factors = (
        ('Crore', 10000000),
        ('Lac', 100000),
        ('Thou', 1000),
        ('Hund', 100),
    )
    multiplier = 1
    for unit_word, factor in unit_factors:
        if unit_word in x:
            multiplier = factor
            break

    # Decimal keeps fractional amounts exact (no binary float rounding).
    amount = Decimal(number_match.group()) * multiplier
    if amount == amount.to_integral_value():
        return int(amount)
    return float(amount)

def Doctor_Check(x):
    """Return True when ``x`` is a string containing the title ``'Dr.'``.

    Non-string input (for example a missing name that pandas reads as NaN)
    yields False instead of raising ``TypeError``.

    Args:
        x: The value to inspect, normally a candidate name.

    Returns:
        bool: Whether ``'Dr.'`` occurs in ``x``.
    """
    return isinstance(x, str) and 'Dr.' in x
def Advocate_Check(x):
    """Return True when ``x`` is a string containing the title ``'Adv.'``.

    Non-string input (for example a missing name that pandas reads as NaN)
    yields False instead of raising ``TypeError``.

    Args:
        x: The value to inspect, normally a candidate name.

    Returns:
        bool: Whether ``'Adv.'`` occurs in ``x``.
    """
    return isinstance(x, str) and 'Adv.' in x

In [4]:
# Replace the textual amounts in both money columns with plain numbers.
for amount_column in ('Total Assets', 'Liabilities'):
    train[amount_column] = train[amount_column].apply(string_to_number)

In [5]:
# Flag constituencies tagged 'SC' / 'ST' (presumably reserved-seat markers —
# confirm against the data source). The tags are matched as standalone tokens:
# a plain substring test also fires on any name that merely contains those
# letters (e.g. the 'ST' inside 'MUSTAFABAD'). na=False keeps both columns
# strictly boolean when a constituency is missing.
# NOTE(review): assumes the tag appears as its own token, e.g. '(SC)' — confirm against train.csv.
train['SC'] = train['Constituency ∇'].str.contains(r'\bSC\b', na=False)
train['ST'] = train['Constituency ∇'].str.contains(r'\bST\b', na=False)

# Title flags derived from the candidate's name.
train['Doctor'] = train['Candidate'].apply(Doctor_Check)
train['Advocate'] = train['Candidate'].apply(Advocate_Check)

# Drop the free-text columns and the row identifier once the flags exist.
train = train.drop(columns=['Constituency ∇', 'Candidate', 'ID'])
train

Unnamed: 0,Party,Criminal Case,Total Assets,Liabilities,state,Education,SC,ST,Doctor,Advocate
0,DMK,4,2110000000,20000000,TAMIL NADU,8th Pass,False,False,False,False
1,BJP,0,10000000,0,MADHYA PRADESH,12th Pass,True,False,False,False
2,INC,0,70000000,2200000,KARNATAKA,Post Graduate,False,False,True,False
3,BJP,0,90000000,2400000,BIHAR,Post Graduate,False,False,False,False
4,BJP,2,20000000,6100000,WEST BENGAL,8th Pass,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
2054,CPI,1,6100000,1000000,KERALA,Graduate Professional,False,False,False,False
2055,INC,0,20000000,800000,RAJASTHAN,10th Pass,False,False,False,False
2056,BJP,0,130000000,8500000,UTTAR PRADESH,Graduate,False,False,True,False
2057,NCP,1,250000000,9400000,MAHARASHTRA,12th Pass,False,False,False,False


In [6]:
# One encoder per categorical column, kept under separate names so the codes
# can be mapped back to labels later.
encoder1, encoder2, encoder3 = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Replace each categorical column with its integer codes.
for encoded_column, column_encoder in (
    ('Party', encoder1),
    ('state', encoder2),
    ('Education', encoder3),
):
    train[encoded_column] = column_encoder.fit_transform(train[encoded_column])

# Every column name of the prepared frame.
columns = train.columns

In [7]:
# Train the synthesizer on the prepared frame; `columns` is passed as the list
# of discrete columns.
# NOTE(review): `columns` holds every column of `train`, so the numeric ones
# ('Criminal Case', 'Total Assets', 'Liabilities') are treated as categories
# too — confirm this is intended. No random seed is set, so runs will differ.
ctgan = CTGAN(epochs=5000)
ctgan.fit(train, discrete_columns=columns)

In [8]:
# Draw 2000 synthetic rows from the fitted model.
deep_data = ctgan.sample(2000)

# Turn the integer codes back into their original labels, using the encoder
# that was fitted on each column.
for decoded_column, fitted_encoder in (
    ('Party', encoder1),
    ('state', encoder2),
    ('Education', encoder3),
):
    deep_data[decoded_column] = fitted_encoder.inverse_transform(deep_data[decoded_column])

deep_data

Unnamed: 0,Party,Criminal Case,Total Assets,Liabilities,state,Education,SC,ST,Doctor,Advocate
0,SP,12,8500000,5100000,HARYANA,10th Pass,True,False,False,False
1,RJD,49,6200000,180000000,ARUNACHAL PRADESH,Others,True,True,True,False
2,NPP,5,1200000,2500000,KERALA,Literate,True,False,True,False
3,DMK,8,100000000,7200000,UTTAR PRADESH,Graduate,False,False,True,True
4,SP,26,12670000000,46000,MEGHALAYA,Doctorate,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
1995,TDP,18,1060000000,33000,UTTAR PRADESH,Literate,False,False,False,True
1996,NDPP,1,2200000,1000000,UTTAR PRADESH,10th Pass,False,False,True,True
1997,SP,11,1890000000,370000000,UTTAR PRADESH,Graduate,False,False,False,True
1998,TDP,17,440000000,8900000,MADHYA PRADESH,5th Pass,True,True,False,False


In [9]:
# Write the synthetic rows to disk without the index column.
deep_data.to_csv(path_or_buf='deep_data_temp.csv', index=False)