In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [3]:
pd_options = pd.options.display # Display-options object used to configure the environment: number of rows, columns, precision, etc.

In [4]:
# Display limits: show at most 50 columns and 50 rows.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)
# Number of decimal places, followed by an explicit float formatter
# (thousands separator, two decimals).
pd.set_option("display.precision", 3)
pd.set_option("display.float_format", "{:,.2f}".format)

In [None]:
# Show at most 10 rows and render floats with one decimal place.
# When cells run top to bottom, these two settings replace the max_rows /
# float_format values configured in the previous cell.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

In [5]:
# `display` and `HTML` are part of the public IPython.display API; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
import base64

In [6]:
!pip install facets-overview==1.0.0

Collecting facets-overview==1.0.0
  Downloading https://files.pythonhosted.org/packages/df/8a/0042de5450dbd9e7e0773de93fe84c999b5b078b1f60b4c19ac76b5dd889/facets_overview-1.0.0-py2.py3-none-any.whl
Installing collected packages: facets-overview
Successfully installed facets-overview-1.0.0


In [7]:
from facets_overview.feature_statistics_generator import FeatureStatisticsGenerator

In [8]:
# Training set: read the CSV from GitHub and index the rows by their 'id' column.
url_train = 'https://raw.githubusercontent.com/cryssoga/DSWP/master/Li%C3%A7%C3%A3o/desafio_train.csv'
df_train = pd.read_csv(url_train).set_index('id')

In [9]:
# Test set: read the CSV from GitHub and index the rows by their 'id' column.
url_test = 'https://raw.githubusercontent.com/cryssoga/DSWP/master/Li%C3%A7%C3%A3o/desafio_test.csv'
df_test = pd.read_csv(url_test).set_index('id')

In [10]:
f'"df_train.shape:":{df_train.shape}, "df_test.shape:": {df_test.shape}'

'"df_train.shape:":(11033, 62), "df_test.shape:": (1000, 61)'

In [11]:
# Stack the training rows followed by the test rows into a single frame.
# The test file has one column fewer than the training file (61 vs 62 in the shapes
# printed above), so that column (presumably 'target') is NaN for the test rows.
df_total = pd.concat([df_train,df_test])
f'"df_total.shape:":{df_total.shape}'

'"df_total.shape:":(12033, 62)'

In [12]:
def mostra_dados(df):
    """Render a Facets Overview widget for ``df`` in the cell output.

    The dataframe is summarised by ``FeatureStatisticsGenerator`` under the name
    'dados de treinamento', the resulting proto is serialised and base64-encoded,
    and the encoded string is injected into an HTML snippet that loads the
    Facets web component and is then displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.

    Returns
    -------
    None
    """
    tabelas = [{'table': df, 'name': 'dados de treinamento'}]
    estatisticas = FeatureStatisticsGenerator().ProtoFromDataFrames(tabelas)

    # The <facets-overview> element takes the statistics as a base64 text payload.
    protostr = base64.b64encode(estatisticas.SerializeToString()).decode("utf-8")

    HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
        <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
        <facets-overview id="elem"></facets-overview>
        <script>
          document.querySelector("#elem").protoInput = "{protostr}";
        </script>"""
    display(HTML(HTML_TEMPLATE.format(protostr=protostr)))

In [14]:
mostra_dados(df_train)

In [15]:
!pip install pycaret

Collecting pycaret
[?25l  Downloading https://files.pythonhosted.org/packages/33/4d/792832e86c34eb7f8c06f1805f19ef72a2d38b11435502b69fca3409b84c/pycaret-2.2.2-py3-none-any.whl (249kB)
[K     |█▎                              | 10kB 15.6MB/s eta 0:00:01[K     |██▋                             | 20kB 20.6MB/s eta 0:00:01[K     |████                            | 30kB 23.3MB/s eta 0:00:01[K     |█████▎                          | 40kB 13.2MB/s eta 0:00:01[K     |██████▋                         | 51kB 8.9MB/s eta 0:00:01[K     |███████▉                        | 61kB 7.8MB/s eta 0:00:01[K     |█████████▏                      | 71kB 8.4MB/s eta 0:00:01[K     |██████████▌                     | 81kB 8.8MB/s eta 0:00:01[K     |███████████▉                    | 92kB 8.7MB/s eta 0:00:01[K     |█████████████▏                  | 102kB 8.6MB/s eta 0:00:01[K     |██████████████▍                 | 112kB 8.6MB/s eta 0:00:01[K     |███████████████▊                | 122kB 8.6MB/s eta

In [16]:
import pycaret
from pycaret import regression, classification

In [None]:
from pycaret.utils import enable_colab
enable_colab()

In [None]:
def pycaret_classificacao(df, target):
    """Initialise a PyCaret classification experiment for ``target``.

    Prints two header lines and then calls ``classification.setup`` with a fixed
    configuration (80% train split, robust normalisation, feature selection,
    feature interaction/ratio, rare-level combining, multicollinearity removal,
    profiling enabled, no imbalance fix).

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to ``setup`` as ``data``.
    target : str
        Name of the target column.

    Returns
    -------
    object
        Whatever ``classification.setup`` returns.
    """
    print(f'Modelo: ml_{target}')
    print('Este é um problema de classificação')

    configuracao = dict(
        data=df,
        target=target,
        session_id=20111974,  # fixed seed so runs are repeatable
        feature_selection=True,
        train_size=0.8,
        normalize=True,
        normalize_method='robust',
        feature_interaction=True,
        feature_ratio=True,
        combine_rare_levels=True,
        remove_multicollinearity=True,
        profile=True,
        fix_imbalance=False,
    )
    return classification.setup(**configuracao)

In [None]:
def pycaret_regressao(df, target):
    """Initialise a PyCaret regression experiment for ``target``.

    Prints two header lines and then calls ``regression.setup`` with a fixed
    configuration (80% train split, robust normalisation, feature selection,
    feature interaction/ratio, rare-level combining, multicollinearity removal,
    profiling enabled, simple imputation).

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to ``setup`` as ``data``.
    target : str
        Name of the target column.

    Returns
    -------
    object
        Whatever ``regression.setup`` returns.
    """
    print(f'Modelo: ml_{target}')
    print('Este é um problema de regressão')

    ml = regression.setup(data = df,
                          target = target,
                          session_id = 20111974,  # fixed seed so runs are repeatable
                          feature_selection = True,
                          train_size = 0.8,
                          normalize = True, normalize_method = 'robust',
                          feature_interaction = True,
                          feature_ratio = True,
                          combine_rare_levels = True,
                          remove_multicollinearity = True,
                          profile = True,
                          # These were pasted as signature text ("name: type=value"), which is
                          # not valid call syntax; they are now passed as keyword arguments.
                          imputation_type = 'simple',
                          iterative_imputation_iters = 5)
    return ml

In [None]:
# Alternative imputation settings (iterative imputation with KNN estimators), kept as a
# dict so they can be unpacked into a setup() call: setup(..., **imputacao_iterativa).
# The original cell was a bare fragment of keyword arguments and raised IndentationError.
imputacao_iterativa = dict(
    imputation_type = 'iterative',
    iterative_imputation_iters = 20,
    categorical_iterative_imputer = 'knn',
    numeric_iterative_imputer = 'knn',
)

In [None]:
def prepara_dataframes(df, target, l_colunas_mv):
    """Build the two frames used to model a feature that has missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Must contain ``target``, a 'Churn' column and every column
        named in ``l_colunas_mv``. Not modified.
    target : str
        Feature whose missing values are being handled. Must be in ``l_colunas_mv``.
    l_colunas_mv : list of str
        Names of all features that contain missing values. Not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df2``: rows of ``df`` without any missing value, minus 'Churn' and minus the
        other features listed in ``l_colunas_mv`` (``target`` is kept).
        ``df3``: a copy of ``df`` with an extra ``<target>_mv`` column holding 1 where
        ``target`` is missing and 0 elsewhere.

    Raises
    ------
    ValueError
        If ``target`` is not in ``l_colunas_mv``.
    KeyError
        If 'Churn' or one of the listed columns is absent from ``df``.
    """
    # Work on a private copy: removing the target from the caller's own list would
    # silently shrink it on every call.
    outras_colunas_mv = list(l_colunas_mv)
    outras_colunas_mv.remove(target)
    print(f'Features ignoradas: {outras_colunas_mv}')

    # Keep complete rows only, then drop 'Churn'.
    df2 = df.dropna().drop(columns = ['Churn'])

    # To reduce bias, drop the other features that have missing values.
    df2 = df2.drop(columns = outras_colunas_mv)

    # Flag the missing values of the target on a copy of the original dataframe.
    df3 = df.copy()
    df3[target+'_mv'] = np.where(df3[target].isna(), 1, 0)

    return df2, df3

In [None]:
# Features treated as having missing values.
# NOTE(review): the same list is declared again at the top of the next cell.
l_colunas_mv = ['TotalCharges2', 'tenure', 'PaymentMethod', 'Dependents']

In [None]:
# Features treated as having missing values; each one is used as `target` in turn.
l_colunas_mv = ['TotalCharges2', 'tenure', 'PaymentMethod', 'Dependents']

for target in l_colunas_mv:
    # A fresh list is built on every iteration so that any in-place change made to it
    # by prepara_dataframes cannot affect later iterations.
    l_colunas_mv_2 = ['TotalCharges2', 'tenure', 'PaymentMethod', 'Dependents']
    # exec() creates one global per target (df_<target>_sem_mv) and rebinds df_T to the
    # flagged frame returned by prepara_dataframes.
    # NOTE(review): df_T is not defined in any cell visible here, and these column names
    # do not appear in df_total.columns — confirm this cell is still needed.
    exec(f"df_{target}_sem_mv, df_T = prepara_dataframes(df_T, target, l_colunas_mv_2)")

Tratamento dos CNAE

In [None]:
# cnae.csv: CSV file holding the CNAE division number ('cnae') and the letter of the
# CNAE section it belongs to ('secao'). For example:
#   section C => manufacturing industry
#       division 10 => manufacture of food products
#       division 11 => manufacture of beverages
# There are 21 sections (A-U) covering 99 divisions in total.
# df_cnae: dataframe indexed by the division number 'cnae'.
url_cnae = 'https://raw.githubusercontent.com/cryssoga/DSWP/master/Dataframes/cnae.csv'
df_cnae = pd.read_csv(url_cnae).set_index(['cnae'])

In [None]:
df_cnae.dtypes

secao    object
dtype: object

In [None]:
df_cnae.index

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
            52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
            69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
            86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
           dtype='int64', name='cnae')

In [None]:
# Build d_cnae from df_cnae: maps every CNAE division number (the index of df_cnae)
# to the letter of its section.
# Series.to_dict() covers every index entry; looping over np.arange(1, len(df_cnae))
# stops one short and leaves the last division (99) unmapped.
d_cnae = df_cnae['secao'].to_dict()

In [None]:
d_cnae

In [None]:
# Adds to df the column 'cnae_secao' with the section matching the existing 'cnae2' column:
def cria_cnae_secao(df):
    """Add a 'cnae_secao' column derived from 'cnae2' via the global ``d_cnae`` mapping.

    Values of 'cnae2' that are keys of ``d_cnae`` are replaced by the mapped value;
    other values are kept unchanged. ``df`` is modified in place and also returned.
    """
    secoes = df['cnae2'].replace(d_cnae)
    df['cnae_secao'] = secoes
    return df

In [None]:
# Lists of related variables/columns:
l_categoricas = ['cnae2', 'rf2', 'cnae_secao']
# ind01..ind20, ind28..ind37, ind40, ind41
l_ind_1 = [f'ind{n:02d}' for n in (*range(1, 21), *range(28, 38), 40, 41)]
# ind21..ind27, ind38, ind39, ind42, ind43
l_ind_2 = [f'ind{n:02d}' for n in (*range(21, 28), 38, 39, 42, 43)]
l_mc_1 = [f'mc{n}' for n in (1, 2, 3)]
l_mc_2 = ['mc4']

Tratar NaNs

Grupo l_ind_1

In [None]:
# Handles NaNs in the l_ind_1 columns (mode imputation) and creates the 'l_ind_1_mv' column:
def trata_l_ind_1(df):
    """Impute missing values in the ``l_ind_1`` columns with each column's mode.

    Adds ``l_ind_1_mv`` (1 where 'ind01' is missing before imputation, 0 otherwise),
    then fills the NaNs of every column in the group with that column's first mode.
    A column without any non-missing value has no mode and is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``l_ind_1``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    l_ind_1 = ['ind01', 'ind02', 'ind03', 'ind04', 'ind05', 'ind06', 'ind07', 'ind08', 'ind09', 'ind10',
               'ind11', 'ind12', 'ind13', 'ind14', 'ind15', 'ind16', 'ind17', 'ind18', 'ind19', 'ind20',
               'ind28', 'ind29', 'ind30', 'ind31', 'ind32', 'ind33', 'ind34', 'ind35', 'ind36', 'ind37',
               'ind40', 'ind41']
    # Flag the rows before filling; only 'ind01' is inspected for the whole group.
    df['l_ind_1_mv'] = df['ind01'].isna().astype('int64')
    for coluna in l_ind_1:
        modas = df[coluna].mode()
        if modas.empty:
            continue
        # Assign the filled column back: chained `df[c][mask] = ...` and
        # `df[c].fillna(..., inplace=True)` act on a temporary under copy-on-write.
        df[coluna] = df[coluna].fillna(modas.iloc[0])
    return df

Grupo l_ind_2

In [None]:
# Handles NaNs in the l_ind_2 columns (mode imputation) and creates the 'l_ind_2_mv' column:
def trata_l_ind_2(df):
    """Impute missing values in the ``l_ind_2`` columns with each column's mode.

    Adds ``l_ind_2_mv`` (1 where 'ind21' is missing before imputation, 0 otherwise),
    then fills the NaNs of every column in the group with that column's first mode.
    A column without any non-missing value has no mode and is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``l_ind_2``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    l_ind_2 = ['ind21', 'ind22', 'ind23', 'ind24', 'ind25', 'ind26', 'ind27',
               'ind38', 'ind39',
               'ind42', 'ind43']
    # Flag the rows before filling; only 'ind21' is inspected for the whole group.
    df['l_ind_2_mv'] = df['ind21'].isna().astype('int64')
    for coluna in l_ind_2:
        modas = df[coluna].mode()
        if modas.empty:
            continue
        # Assign the filled column back: chained `df[c][mask] = ...` and
        # `df[c].fillna(..., inplace=True)` act on a temporary under copy-on-write.
        df[coluna] = df[coluna].fillna(modas.iloc[0])
    return df

Grupo l_mc_1

In [None]:
# Handles NaNs in the l_mc_1 columns (median imputation) and creates the 'l_mc_1_mv' column:
def trata_l_mc_1(df):
    """Impute missing values in 'mc1', 'mc2' and 'mc3' with each column's median.

    Adds ``l_mc_1_mv`` (1 where 'mc1' is missing before imputation, 0 otherwise),
    then fills the NaNs of each column with that column's median.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'mc1', 'mc2' and 'mc3'. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    l_mc_1 = ['mc1','mc2','mc3']
    # Flag the rows before filling; only 'mc1' is inspected for the whole group.
    df['l_mc_1_mv'] = df['mc1'].isna().astype('int64')
    for coluna in l_mc_1:
        mediana = df[coluna].median()
        # Assign the filled column back: chained `df[c][mask] = ...` and
        # `df[c].fillna(..., inplace=True)` act on a temporary under copy-on-write.
        df[coluna] = df[coluna].fillna(mediana)
    return df

Grupo l_md_1

In [None]:
df_total['mc4'].describe()

count   12033.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         1.0
Name: mc4, dtype: float64

In [None]:
# For every md*/mc* column, print its name, number of distinct values and maximum.
colunas_md_mc = [f'md{n}' for n in range(1, 13)] + [f'mc{n}' for n in range(1, 5)]
for i in colunas_md_mc:
    serie = df_total[i]
    print(i, len(serie.unique()), serie.max())

md1 9597 1.0
md2 11961 1.0
md3 11961 1.0000000000000002
md4 8901 1.0
md5 6107 1.0
md6 412 1.0
md7 11966 1.0
md8 11964 1.0
md9 9024 1.0
md10 6276 1.0
md11 370 1.0
md12 11966 1.0
mc1 10834 1.0
mc2 1632 1.0
mc3 8924 1.0
mc4 11413 1.0


In [None]:
# Copies md1..md12 and mc1..mc4 into new '<name>_o' columns:
def cria_colunas_o(df):
    """Duplicate the md1..md12 and mc1..mc4 columns under the names '<column>_o'.

    The originals are left untouched. ``df`` is modified in place and also returned.
    """
    originais = [f'md{n}' for n in range(1, 13)] + [f'mc{n}' for n in range(1, 5)]
    copias = [f'{nome}_o' for nome in originais]
    df[copias] = df[originais].copy()
    return df

In [None]:
len(df_total.columns)

82

In [None]:
# DO NOT USE (original author's note):
# caps the outliers of the columns created by cria_colunas_o(df):

def trata_outliers(df):
    """Cap outliers of the '*_o' columns at the 1.5 * IQR fences.

    For each column, values below ``Q1 - 1.5 * IQR`` are raised to that limit and
    values above ``Q3 + 1.5 * IQR`` are lowered to that limit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``l_mc_outliers``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    l_mc_outliers = ['md1_o', 'md2_o', 'md3_o', 'md4_o', 'md5_o', 'md6_o',
                     'md7_o', 'md8_o', 'md9_o', 'md10_o', 'md11_o', 'md12_o',
                     'mc1_o','mc2_o','mc3_o','mc4_o']
    for coluna in l_mc_outliers:
        q1, q3 = np.percentile(df[coluna], [25, 75])
        iqr = q3 - q1
        lim_inf = q1 - 1.5 * iqr
        lim_sup = q3 + 1.5 * iqr
        # clip + assignment instead of chained `df[c][mask] = limit`, which writes to
        # a temporary under copy-on-write and leaves df unchanged.
        df[coluna] = df[coluna].clip(lower=lim_inf, upper=lim_sup)
    return df

In [None]:
# DO NOT USE (original author's note; f_trata_col still calls this function):

def f_normaliza(df_norm):
    """Rescale every column of ``df_norm`` with scikit-learn's ``MinMaxScaler``.

    The scaler is created with its default settings and fitted on the data it
    transforms.

    Parameters
    ----------
    df_norm : array-like of shape (n_samples, n_features)
        Numeric data to rescale.

    Returns
    -------
    The result of ``MinMaxScaler().fit_transform(df_norm)``; by default scikit-learn
    returns a NumPy array here, not a DataFrame.
    """
    # Function-scope import, as in the original; only the scaler actually used is imported.
    from sklearn.preprocessing import MinMaxScaler

    modelo = MinMaxScaler()
    return modelo.fit_transform(df_norm)

In [None]:
# DO NOT USE (original author's note).
# NOTE(review): the preprocessing pipeline cell further down still calls f_trata_col.
l_mc_outliers = ['md1_o', 'md2_o', 'md3_o', 'md4_o', 'md5_o', 'md6_o',
                 'md7_o', 'md8_o', 'md9_o', 'md10_o', 'md11_o', 'md12_o',
                 'mc1_o','mc2_o','mc3_o','mc4_o']
def f_trata_col(df):
    """Cap, rescale and prune the '*_o' columns, then mark the categorical columns.

    Steps, in order:
    1. cap each column of ``l_mc_outliers`` at the ``Q1 - 1.5*IQR`` / ``Q3 + 1.5*IQR`` fences;
    2. replace those columns with the output of ``f_normaliza``;
    3. drop every one of them that ended up entirely 0 or entirely 1;
    4. cast 'rf2', 'cnae2' and 'cnae_secao' to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns in ``l_mc_outliers`` plus 'rf2', 'cnae2' and
        'cnae_secao'. Steps 1-2 modify it in place.

    Returns
    -------
    pandas.DataFrame
        The processed frame. It is a new object when step 3 drops a column;
        otherwise it is ``df`` itself.
    """
    for coluna in l_mc_outliers:
        q1, q3 = np.percentile(df[coluna], [25, 75])
        iqr = q3 - q1
        lim_inf = q1 - 1.5 * iqr
        lim_sup = q3 + 1.5 * iqr
        # clip + assignment instead of chained `df[c][mask] = limit`, which writes to
        # a temporary under copy-on-write and leaves df unchanged.
        df[coluna] = df[coluna].clip(lower=lim_inf, upper=lim_sup)

    df[l_mc_outliers] = f_normaliza(df[l_mc_outliers])

    # Columns that are constant at 0 or at 1 after rescaling carry no information.
    constantes = [c for c in l_mc_outliers
                  if (df[c] == 0).all() or (df[c] == 1).all()]
    if constantes:
        df = df.drop(columns = constantes)

    for coluna in ('rf2', 'cnae2', 'cnae_secao'):
        df[coluna] = df[coluna].astype('category')
    #df = pd.get_dummies(df)
    return df

In [None]:
# Stale cell left over from out-of-order execution, disabled on purpose.
# f_trata_col reads the '*_o' columns and 'cnae_secao', which are only created by the
# pipeline cell that follows (cria_colunas_o / cria_cnae_secao), so calling it here
# raises KeyError on a fresh kernel. The pipeline cell already applies f_trata_col.
# df_total = f_trata_col(df_total)

In [None]:
# Preprocessing pipeline, applied in order: add the CNAE section, impute the three
# column groups, duplicate the md*/mc* columns as '*_o', then cap/scale/prune them.
df_total = (
    df_total
    .pipe(cria_cnae_secao)
    .pipe(trata_l_ind_1)
    .pipe(trata_l_ind_2)
    .pipe(trata_l_mc_1)
    .pipe(cria_colunas_o)
    .pipe(f_trata_col)
)

In [None]:
df_total.shape

(12033, 79)

In [None]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12033 entries, 0 to 8582
Data columns (total 79 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   cnae2       12033 non-null  category
 1   rf2         12033 non-null  category
 2   md1         12033 non-null  float64 
 3   md2         12033 non-null  float64 
 4   md3         12033 non-null  float64 
 5   md4         12033 non-null  float64 
 6   md5         12033 non-null  float64 
 7   md6         12033 non-null  float64 
 8   md7         12033 non-null  float64 
 9   md8         12033 non-null  float64 
 10  md9         12033 non-null  float64 
 11  md10        12033 non-null  float64 
 12  md11        12033 non-null  float64 
 13  md12        12033 non-null  float64 
 14  mc1         12033 non-null  float64 
 15  mc2         12033 non-null  float64 
 16  mc3         12033 non-null  float64 
 17  mc4         12033 non-null  float64 
 18  ind01       12033 non-null  float64 
 19  ind02

In [None]:
df_total.columns

Index(['cnae2', 'rf2', 'md1', 'md2', 'md3', 'md4', 'md5', 'md6', 'md7', 'md8',
       'md9', 'md10', 'md11', 'md12', 'mc1', 'mc2', 'mc3', 'mc4', 'ind01',
       'ind02', 'ind03', 'ind04', 'ind05', 'ind06', 'ind07', 'ind08', 'ind09',
       'ind10', 'ind11', 'ind12', 'ind13', 'ind14', 'ind15', 'ind16', 'ind17',
       'ind18', 'ind19', 'ind20', 'ind21', 'ind22', 'ind23', 'ind24', 'ind25',
       'ind26', 'ind27', 'ind28', 'ind29', 'ind30', 'ind31', 'ind32', 'ind33',
       'ind34', 'ind35', 'ind36', 'ind37', 'ind38', 'ind39', 'ind40', 'ind41',
       'ind42', 'ind43', 'target', 'cnae_secao', 'l_ind_1_mv', 'l_ind_2_mv',
       'l_mc_1_mv', 'md1_o', 'md2_o', 'md3_o', 'md4_o', 'md5_o', 'md7_o',
       'md8_o', 'md9_o', 'md10_o', 'md12_o', 'mc1_o', 'mc3_o', 'mc4_o'],
      dtype='object')

In [None]:
# Work on a copy so that df_total keeps every processed column.
df_t = df_total.copy()

In [None]:
# Remove the raw categorical codes and five indicator columns from the modelling frame.
colunas_descartadas = ['cnae2', 'rf2', 'ind29', 'ind30', 'ind33', 'ind40', 'ind41']
df_t = df_t.drop(columns=colunas_descartadas)

In [None]:
df_t

Unnamed: 0_level_0,md1,md2,md3,md4,md5,md6,md7,md8,md9,md10,md11,md12,mc1,mc2,mc3,mc4,ind01,ind02,ind03,ind04,ind05,ind06,ind07,ind08,ind09,ind10,ind11,ind12,ind13,ind14,ind15,ind16,ind17,ind18,ind19,ind20,ind21,ind22,ind23,ind24,ind25,ind26,ind27,ind28,ind31,ind32,ind34,ind35,ind36,ind37,ind38,ind39,ind42,ind43,target,cnae_secao,l_ind_1_mv,l_ind_2_mv,l_mc_1_mv,md1_o,md2_o,md3_o,md4_o,md5_o,md7_o,md8_o,md9_o,md10_o,md12_o,mc1_o,mc3_o,mc4_o
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
0,0.090339,0.011256,0.111095,0.003233,0.003233,0.0,0.016366,0.021082,0.004541,0.004541,0.0,0.130930,0.009247,0.001729,0.000138,0.004412,0.0000,0.0281,0.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,1.0000,0.0000,0.00,0.00,0.0000,0.0000,0.0976,0.0333,0.1000,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,True,Q,0,0,0,1.000000,0.398255,0.407658,0.114456,0.308309,0.453559,0.384122,0.111722,0.249513,0.393614,1.000000,0.044187,1.000000
1,0.005996,0.019476,0.124770,0.000000,0.038168,0.0,0.029214,0.046445,0.000000,0.049919,0.0,0.149741,0.003186,0.007671,0.006943,0.003589,0.0000,0.0000,1.0,0.4167,0.4194,0.7068,0.0076,0.0076,0.0,0.0,0.0,0.0856,0.0286,0.0,0.0,0.0,0.0,0.0,0.0000,0.7625,1.00,1.00,0.1429,0.2857,0.0000,0.4444,0.2222,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,False,C,0,0,0,0.397077,0.689104,0.844217,0.000000,1.000000,0.809609,0.908028,0.000000,1.000000,1.000000,0.807546,1.000000,1.000000
2,0.000006,0.002902,0.110160,0.002274,0.002274,0.0,0.002902,0.020058,0.003131,0.003131,0.0,0.130405,0.000015,0.000000,0.000050,0.000209,0.0000,0.0000,1.0,1.0000,1.0000,1.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,1.0000,1.0000,1.00,1.00,1.0000,1.0000,0.0000,1.0000,1.0000,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,False,M,0,0,0,0.000383,0.102683,0.377814,0.080514,0.216878,0.080427,0.362960,0.077037,0.172051,0.352255,0.003804,0.015874,0.062344
3,0.000009,0.014526,0.120351,0.000000,0.000000,0.0,0.014526,0.032017,0.000000,0.000000,0.0,0.142568,0.000438,0.000000,0.000042,0.000115,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.2309,0.2309,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,1.0000,0.25,0.25,0.9167,0.9167,0.7857,0.1667,0.1667,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,False,H,0,0,0,0.000577,0.513947,0.703152,0.000000,0.000000,0.402548,0.609998,0.000000,0.000000,1.000000,0.111085,0.013452,0.034155
4,0.000191,0.004042,0.111078,0.001121,0.001121,0.0,0.006063,0.023705,0.011886,0.011886,0.0,0.131285,0.001925,0.004829,0.006355,0.000719,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,1.0000,1.0000,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.00,0.00,0.0000,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.5,False,G,0,0,0,0.012644,0.143017,0.407120,0.039687,0.106904,0.168026,0.438292,0.292442,0.653124,0.421618,0.487878,1.000000,0.214273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8609,0.000000,0.001109,0.109456,0.000000,0.000000,0.0,0.000832,0.018566,0.000000,0.000000,0.0,0.130563,0.000115,0.000000,0.000000,0.000111,0.0833,0.0833,1.0,0.2500,0.2500,0.0000,1.0000,1.0000,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0406,1.0000,1.00,1.00,0.2222,0.2222,0.0000,0.2500,0.1667,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,H,0,0,0,0.000000,0.039231,0.355359,0.000000,0.000000,0.023046,0.332143,0.000000,0.000000,0.364700,0.029202,0.000000,0.033027
3384,0.000349,0.002435,0.110167,0.000726,0.001945,0.0,0.003653,0.021337,0.002113,0.004956,0.0,0.131340,0.004253,0.000160,0.003621,0.003736,0.0000,0.6888,0.0,0.8333,0.8333,1.0000,0.0000,0.0000,0.0,0.0,0.0,0.1738,0.0797,0.0,0.0,0.0,0.0,0.0,0.0000,1.0000,0.25,0.25,0.5000,0.5000,0.1250,1.0000,1.0000,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,G,0,0,0,0.023108,0.086158,0.378035,0.025699,0.185466,0.101224,0.389382,0.051982,0.272333,0.425957,1.000000,1.000000,1.000000
8297,0.008200,0.006795,0.112855,0.007905,0.000000,0.0,0.007726,0.024137,0.011176,0.000000,0.0,0.130824,0.001282,0.000000,0.000257,0.000851,0.0000,0.0000,1.0,1.0000,1.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0699,0.0,0.0,0.0,0.0,0.0,1.0000,1.0000,1.00,1.00,1.0000,1.0000,0.0000,1.0000,0.8333,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,,F,0,0,0,0.542978,0.240417,0.463862,0.279860,0.000000,0.214098,0.447212,0.274964,0.000000,0.385308,0.324917,0.082238,0.253791
7556,0.036538,0.017085,0.121296,0.000064,0.000064,0.0,0.017085,0.031785,0.002432,0.000000,0.0,0.142918,0.011101,0.000000,0.000973,0.001536,0.0000,0.4692,1.0,1.0000,1.0000,0.0877,1.0000,1.0000,0.0,0.0,0.0,0.0209,1.0000,0.0,0.0,0.0,0.0,0.0,0.0000,1.0000,1.00,1.00,1.0000,1.0000,0.0000,0.7222,1.0000,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.5,,Q,0,0,0,1.000000,0.604485,0.733313,0.002280,0.006141,0.473462,0.605210,0.059826,0.000000,1.000000,1.000000,0.311687,0.458007


In [None]:
# One-hot encode the remaining non-numeric columns (get_dummies with default arguments).
# NOTE(review): besides 'cnae_secao', 'target' may be object-typed here (True/False in the
# training rows, NaN in the test rows) and would then be expanded into dummy columns too —
# confirm this is intended, or pass columns=['cnae_secao'].
df_t = pd.get_dummies(df_t)

In [None]:
df_t.shape

(12033, 91)

In [None]:
# df_total was built as concat([df_train, df_test]) and no later step drops or reorders
# rows, so the first len(df_train) rows are the training set and the rest the test set.
# Deriving the boundary from df_train avoids hard-coding the 11033 row count.
n_treino = len(df_train)
train_3 = df_t.iloc[:n_treino]
test_3 = df_t.iloc[n_treino:]

In [None]:
# Write both splits to the working directory as comma-separated files, keeping the
# 'id' index as the first column (to_csv defaults: index=True, sep=',').
train_3.to_csv('train_3.csv')
test_3.to_csv('test_3.csv')