# In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats.mstats import winsorize
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# In [2]:
def obtenerDataTrain():
    """Load the training set from CSV files in the working directory.

    Both files are read with ``building_id`` as the index.

    Returns:
        A ``(train_values, train_labels)`` tuple of DataFrames.
    """
    indice = 'building_id'
    etiquetas = pd.read_csv('train_labels.csv', index_col=indice)
    valores = pd.read_csv('train_values.csv', index_col=indice)
    return valores, etiquetas

def obtenerDataTest():
    """Load ``test_values.csv`` from the working directory.

    Returns:
        A DataFrame indexed by ``building_id``.
    """
    datos_test = pd.read_csv('test_values.csv', index_col='building_id')
    return datos_test

def conversionNumericaOneHot(df_datos):
    """Turn ``df_datos`` into a purely numeric 2-D array.

    The eight categorical columns listed below are one-hot encoded through
    ``codificacionOneHot``; every other column is kept as is.

    Args:
        df_datos: DataFrame containing the categorical columns listed below.

    Returns:
        A NumPy array with the encoded columns first, followed by the
        remaining columns in their original order.
    """
    columnas_categoricas = [
        'land_surface_condition', 'foundation_type', 'roof_type',
        'ground_floor_type', 'other_floor_type', 'position',
        'legal_ownership_status', 'plan_configuration',
    ]

    # The generated feature names are not needed here, only the matrix.
    codificados, _ = codificacionOneHot(df_datos[columnas_categoricas])

    columnas_restantes = [
        columna for columna in df_datos.columns
        if columna not in columnas_categoricas
    ]
    restantes = np.array(df_datos[columnas_restantes])

    return np.hstack((codificados, restantes))

def codificacionOneHot(datos_a_codificar):
    """One-hot encode every column of ``datos_a_codificar``.

    The first category of each column is dropped (``drop='first'``) and the
    result is returned as a dense array.

    Args:
        datos_a_codificar: DataFrame holding only the categorical columns
            to encode.

    Returns:
        A ``(datos_codificados, nombres_de_los_features)`` tuple: the dense
        encoded matrix and the names generated for its columns.
    """
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and later
    # removed the old keyword; try the new spelling first and fall back so
    # the function works on both old and new releases.
    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop='first', sparse=False)

    datos_codificados = encoder.fit_transform(datos_a_codificar)

    # ``get_feature_names`` was replaced by ``get_feature_names_out`` and
    # removed in scikit-learn 1.2.
    if hasattr(encoder, 'get_feature_names_out'):
        nombres_de_los_features = encoder.get_feature_names_out(datos_a_codificar.columns)
    else:
        nombres_de_los_features = encoder.get_feature_names(datos_a_codificar.columns)

    return datos_codificados, nombres_de_los_features

def agregarColumnaBarroMaderaCementoPiedra(data):
    """Collapse superstructure flags into mud / wood / stone indicators.

    Three 0/1 columns are added (``construido_con_barro``,
    ``construido_con_madera``, ``construido_con_piedra``); each is 1 when at
    least one of its source flags is positive. The source flag columns are
    then removed.

    Args:
        data: DataFrame containing the ``has_superstructure_*`` columns
            listed below. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    grupos = {
        'construido_con_barro': [
            'has_superstructure_mud_mortar_brick',
            'has_superstructure_adobe_mud',
            'has_superstructure_mud_mortar_stone',
        ],
        'construido_con_madera': [
            'has_superstructure_timber',
            'has_superstructure_bamboo',
        ],
        'construido_con_piedra': [
            'has_superstructure_mud_mortar_stone',
            'has_superstructure_stone_flag',
            'has_superstructure_cement_mortar_stone',
        ],
    }

    for nueva_columna, columnas_origen in grupos.items():
        # skipna=False keeps the semantics of chained ``+``: a missing flag
        # makes the sum NaN, which compares as "not > 0" and yields 0.
        total = data[columnas_origen].sum(axis=1, skipna=False)
        data[nueva_columna] = total.gt(0).astype('int64')

    # A flag can feed more than one group; list each label only once.
    columnas_usadas = list(dict.fromkeys(
        columna for columnas in grupos.values() for columna in columnas
    ))
    data.drop(columns=columnas_usadas, inplace=True)

    return data

def corregirAge(df_values):
    """Winsorize the upper tail of the ``age`` column.

    ``winsorize`` is called with limits ``(0, 0.05)``: nothing is clipped at
    the low end and the highest 5% of values are clipped.

    Args:
        df_values: DataFrame with an ``age`` column. Modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    limites = (0, 0.05)
    df_values['age'] = winsorize(df_values['age'], limites)
    return df_values

def corregirArea(df_values):
    """Winsorize the upper tail of the ``area_percentage`` column.

    ``winsorize`` is called with limits ``(0, 0.055)``: nothing is clipped
    at the low end and the highest 5.5% of values are clipped.

    Args:
        df_values: DataFrame with an ``area_percentage`` column. Modified
            in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    limites = (0, 0.055)
    df_values['area_percentage'] = winsorize(df_values['area_percentage'], limites)
    return df_values

def corregiraAltura(df_values):
    """Winsorize the upper tail of the ``height_percentage`` column.

    ``winsorize`` is called with limits ``(0, 0.04)``: nothing is clipped at
    the low end and the highest 4% of values are clipped.

    Args:
        df_values: DataFrame with a ``height_percentage`` column. Modified
            in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    limites = (0, 0.04)
    df_values['height_percentage'] = winsorize(df_values['height_percentage'], limites)
    return df_values

def limpiarColumnsSec(values):
    """Collapse the secondary-use flags into a single ``uso_secundario`` flag.

    ``uso_secundario`` is 1 when at least one ``has_secondary_use*`` column
    is positive and 0 otherwise. The source flags and ``count_families`` are
    then removed.

    Args:
        values: DataFrame containing the ``has_secondary_use*`` columns
            listed below and ``count_families``. Modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Each flag appears once; counting one of them twice cannot change
    # whether the total is positive.
    columnas_uso = [
        'has_secondary_use',
        'has_secondary_use_agriculture',
        'has_secondary_use_hotel',
        'has_secondary_use_gov_office',
        'has_secondary_use_rental',
        'has_secondary_use_institution',
        'has_secondary_use_school',
        'has_secondary_use_industry',
        'has_secondary_use_health_post',
        'has_secondary_use_use_police',
        'has_secondary_use_other',
    ]

    # skipna=False keeps the semantics of chained ``+``: a missing flag
    # makes the sum NaN, which compares as "not > 0" and yields 0.
    total = values[columnas_uso].sum(axis=1, skipna=False)
    values['uso_secundario'] = total.gt(0).astype('int64')

    values.drop(columns=columnas_uso + ['count_families'], inplace=True)
    return values

    
def var(x, y):
    """Return the squared difference between ``x`` and ``y``."""
    diferencia = x - y
    return diferencia ** 2

def agregarVarianzaId1(df_values):
    """Add the squared deviation of ``geo_level_1_id`` from its mean.

    Despite the name, the new ``var_geo_level_1_id`` column holds one value
    per row, ``(geo_level_1_id - mean) ** 2``, not a single variance.

    Args:
        df_values: DataFrame with a ``geo_level_1_id`` column. Modified in
            place.

    Returns:
        The same DataFrame object that was passed in.
    """
    geo_nivel_1 = df_values['geo_level_1_id']
    # Vectorised over the whole column instead of a per-row Python loop.
    df_values['var_geo_level_1_id'] = (geo_nivel_1 - geo_nivel_1.mean()) ** 2
    return df_values


def agregarFeatures(data):
    """Add aggregate features and drop the superstructure flags they replace.

    New columns, in order: ``geolevel_sum1``, ``geolevel_sum2``, one
    ``*_count`` column per group in ``conteos``, ``family_per_floor`` and
    ``volumen_percentage``. Every superstructure flag that belongs to a
    material group is then removed; flags used only by
    ``superstructure_count`` are kept.

    Args:
        data: DataFrame with the geo-level, superstructure, family/floor
            count and area/height columns referenced below. Modified in
            place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Output column -> flags summed row-wise to build it (insertion order
    # is the order in which the columns are appended).
    conteos = {
        'wood_count': [
            'has_superstructure_timber', 'has_superstructure_bamboo',
        ],
        # TODO: check whether summing all superstructure flags is meaningful.
        'superstructure_count': [
            'has_superstructure_adobe_mud',
            'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
            'has_superstructure_cement_mortar_stone',
            'has_superstructure_mud_mortar_brick',
            'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
            'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
            'has_superstructure_rc_engineered', 'has_superstructure_other',
        ],
        # TODO: check whether summing the mortar flags is meaningful.
        'mortar_count': [
            'has_superstructure_mud_mortar_stone', 'has_superstructure_cement_mortar_stone',
            'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick',
        ],
        'mud_count': [
            'has_superstructure_adobe_mud', 'has_superstructure_mud_mortar_stone',
            'has_superstructure_mud_mortar_brick',
        ],
        'stone_count': [
            'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
            'has_superstructure_cement_mortar_stone',
        ],
        'cement_count': [
            'has_superstructure_cement_mortar_stone', 'has_superstructure_cement_mortar_brick',
        ],
        'brick_count': [
            'has_superstructure_mud_mortar_brick', 'has_superstructure_cement_mortar_brick',
        ],
    }

    data['geolevel_sum1'] = data[['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']].sum(axis=1)
    data['geolevel_sum2'] = data[['geo_level_2_id', 'geo_level_3_id']].sum(axis=1)

    for columna_nueva, banderas in conteos.items():
        data[columna_nueva] = data[banderas].sum(axis=1)

    # TODO: check the importance of this feature.
    data['family_per_floor'] = data['count_families'] / data['count_floors_pre_eq']
    data['volumen_percentage'] = data['area_percentage'] * data['height_percentage']

    # Only the flags that belong to a material group are removed.
    agrupadas = {
        bandera
        for columna_nueva, banderas in conteos.items()
        if columna_nueva != 'superstructure_count'
        for bandera in banderas
    }
    data.drop(columns=[c for c in data.columns if c in agrupadas], inplace=True)

    return data

def dummyEncoding(data):
    """Dummy-encode five categorical columns and discard three others.

    ``pd.get_dummies`` is applied with ``drop_first=True`` to the columns in
    ``a_codificar``; ``position``, ``legal_ownership_status`` and
    ``plan_configuration`` are removed without being encoded.

    Args:
        data: DataFrame containing all eight categorical columns.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    a_codificar = [
        'land_surface_condition', 'foundation_type', 'roof_type',
        'ground_floor_type', 'other_floor_type',
    ]
    descartadas = ['position', 'legal_ownership_status', 'plan_configuration']

    codificado = pd.get_dummies(data, columns=a_codificar, drop_first=True)
    return codificado.drop(columns=descartadas)

def preprocess_numeric_data(data):
    """Standardize the numeric feature columns with ``StandardScaler``.

    Args:
        data: DataFrame containing ``age``, ``area_percentage``,
            ``height_percentage`` and ``volumen_percentage``. Modified in
            place.

    Returns:
        The same DataFrame object that was passed in, with the four columns
        replaced by their scaled values.
    """
    numerical_features = ['age', 'area_percentage', 'height_percentage', 'volumen_percentage']
    scaler = StandardScaler()
    escalado = scaler.fit_transform(data[numerical_features])

    # Rebuild the frame on ``data``'s own index. With a default RangeIndex
    # the assignment below would align by label and fill the columns with
    # NaN whenever ``data`` is indexed by something else (e.g. building_id).
    df = pd.DataFrame(escalado, columns=numerical_features, index=data.index)
    data[numerical_features] = df[numerical_features]
    return data

def agregarClusters(data):
    """Add KMeans cluster labels for k = 2, 4 and 10.

    The columns ``2_clusters``, ``4_clusters`` and ``10_clusters`` are
    appended, each produced by ``KMeans(n_clusters=k, random_state=0)``.

    Args:
        data: Fully numeric DataFrame. Modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Snapshot the features before adding any column: otherwise the labels
    # of an earlier clustering become input features of the later ones.
    features = data.to_numpy(copy=True)

    for k in (2, 4, 10):
        data[f'{k}_clusters'] = KMeans(n_clusters=k, random_state=0).fit_predict(features)
    return data

# In [3]:
def expandirDataSet(df_values):
    """Run the feature-expansion steps over ``df_values``.

    The steps are applied in this order, each receiving the previous step's
    result: ``agregarFeatures``, ``agregarVarianzaId1``,
    ``limpiarColumnsSec``, ``agregarClusters``.

    Args:
        df_values: DataFrame of building features.

    Returns:
        The DataFrame returned by the last step.
    """
    pasos = (agregarFeatures, agregarVarianzaId1, limpiarColumnsSec, agregarClusters)

    resultado = df_values
    for paso in pasos:
        resultado = paso(resultado)
    return resultado

def prepararDataSet(df_values):
    """Run the outlier-correction steps over ``df_values``.

    The steps are applied in this order, each receiving the previous step's
    result: ``corregirAge``, ``corregirArea``, ``corregiraAltura``.

    Args:
        df_values: DataFrame of building features.

    Returns:
        The DataFrame returned by the last step.
    """
    resultado = df_values
    for paso in (corregirAge, corregirArea, corregiraAltura):
        resultado = paso(resultado)
    return resultado