In [2]:
import pandas as pd

In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

def cart_feature_selection(df, target_column, n_features=5):
    """Rank features with a CART decision tree and return the top ones.

    A ``DecisionTreeClassifier`` (seed 42) is fitted on the 80 % training
    part of a seeded split of ``df``; the feature columns are then ordered by
    the fitted tree's ``feature_importances_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus ``target_column``.
    target_column : str
        Name of the label column; every other column is used as a feature.
    n_features : int, default 5
        Number of leading entries of the ranking to return.

    Returns
    -------
    list of str
        Feature names, most important first. A new list is built on every
        call, so the caller may modify it freely.
    """
    X = df.drop(columns=target_column)
    y = df[target_column]

    # Only the training part is used for the ranking. The split itself is kept
    # (same size, same seed) so the tree sees exactly the same rows as before;
    # the held-out part is simply discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    cart = DecisionTreeClassifier(random_state=42)
    cart.fit(X_train, y_train)

    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': cart.feature_importances_
    }).sort_values('importance', ascending=False)

    return feature_importance['feature'][:n_features].tolist()



In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from deap import creator, base, tools, algorithms

def genetic_feature_selection(df, target_column, n_generations=50, population_size=50,
                              random_state=None):
    """Select a feature subset with a genetic algorithm (DEAP ``eaSimple``).

    Each individual is a 0/1 mask over the feature columns of ``df``. Its
    fitness is the mean 5-fold cross-validation score of a
    ``RandomForestClassifier`` (100 trees, seed 42) trained on the masked
    columns; an all-zero mask scores 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus ``target_column``.
    target_column : str
        Name of the label column; every other column is a candidate feature.
    n_generations : int, default 50
        Number of generations passed to ``eaSimple``.
    population_size : int, default 50
        Number of individuals in the population.
    random_state : int or None, default None
        When given, seeds both the stdlib ``random`` module and
        ``numpy.random`` so the search is repeatable. ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    list of str
        Names of the columns switched on in the best mask evaluated during
        the whole run.
    """
    import random  # DEAP's crossover/mutation/selection operators use the stdlib RNG

    X = df.drop(columns=target_column)
    y = df[target_column]

    # With fewer than two candidate columns there is nothing to search over.
    if X.shape[1] < 2:
        return X.columns.tolist()

    if random_state is not None:
        random.seed(random_state)
        np.random.seed(random_state)

    # creator.create() registers classes on the deap.creator module itself, so
    # calling it again on a second invocation would re-register them.
    if not hasattr(creator, "FitnessMax"):
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register("attr_bool", np.random.randint, 0, 2)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    # The score of a mask is deterministic (fixed forest seed, cv=5), so masks
    # that reappear in later generations reuse the stored score instead of
    # paying for another cross-validation.
    score_by_mask = {}

    def evaluate(individual):
        """Return the one-element fitness tuple DEAP expects for ``individual``."""
        mask = np.array(individual, dtype=bool)
        key = mask.tobytes()
        if key not in score_by_mask:
            if not mask.any():
                # No column selected: nothing to train on.
                score_by_mask[key] = 0
            else:
                clf = RandomForestClassifier(n_estimators=100, random_state=42)
                scores = cross_val_score(clf, X[X.columns[mask]], y, cv=5)
                score_by_mask[key] = np.mean(scores)
        return score_by_mask[key],

    toolbox.register("evaluate", evaluate)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    population = toolbox.population(n=population_size)

    # eaSimple replaces the whole population each generation without elitism,
    # so the final generation may no longer contain the best mask found.
    # The hall of fame keeps the best individual seen across all generations.
    hall_of_fame = tools.HallOfFame(1)
    algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=n_generations,
                        halloffame=hall_of_fame, verbose=False)

    best_individual = hall_of_fame[0]
    selected_features = X.columns[np.array(best_individual, dtype=bool)].tolist()

    return selected_features

In [179]:
# Load the two auxiliary tables, remove their 'Valor' column and parse
# 'Fecha Evento' as day/month/year; values that do not parse become NaT
# (errors='coerce').
# NOTE(review): db_1 is read from 'New_DB_2.csv' while db_0 is read from
# 'New_DB_0.csv' — confirm that the file name for db_1 is the intended one.
db_1 = (
    pd.read_csv('New_DB_2.csv')
    .drop(columns='Valor')
    .assign(**{
        'Fecha Evento': lambda frame: pd.to_datetime(
            frame['Fecha Evento'], format='%d/%m/%Y', errors='coerce'
        )
    })
)
db_0 = (
    pd.read_csv('New_DB_0.csv')
    .drop(columns='Valor')
    .assign(**{
        'Fecha Evento': lambda frame: pd.to_datetime(
            frame['Fecha Evento'], format='%d/%m/%Y', errors='coerce'
        )
    })
)

In [180]:
# Load the processed events, keep one row per (Latitud, Longitud, Fecha Evento)
# and renumber the rows. reset_index() is called without drop=True, so the
# previous index is kept as an extra 'index' column. The date is parsed last,
# i.e. de-duplication compares the date values as read from the CSV.
raw = (
    pd.read_csv('data_processed.csv')
    .drop_duplicates(subset=['Latitud', 'Longitud', 'Fecha Evento'])
    .reset_index()
    .assign(**{
        'Fecha Evento': lambda frame: pd.to_datetime(
            frame['Fecha Evento'], format='%d/%m/%Y', errors='coerce'
        )
    })
)

In [181]:
# Load both sample sets, drop the leftover 'Unnamed' index columns, label them
# (Valor = 1 / Valor = 0) and attach the event date from `raw`.
# 'Fecha Evento' is copied from `raw` by index label, and it is already a
# datetime column there, so no further parsing is done here.
# NOTE(review): this assumes row i of each file corresponds to row i of `raw`;
# rows without a matching index label in `raw` receive NaT — TODO confirm.
df_1 = (
    pd.read_csv('test_data_1')
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
    .assign(Valor=1, **{'Fecha Evento': raw['Fecha Evento']})
)
df_0 = (
    pd.read_csv('test_data_0_2')
    .drop(columns=['Unnamed: 0'])
    .assign(Valor=0, **{'Fecha Evento': raw['Fecha Evento']})
)
df_1.shape, df_0.shape


((602, 137), (602, 137))

In [182]:
# Inner-join each sample set with its auxiliary table on location and date;
# rows without a match on all three keys are dropped.
# Note: the cell reassigns df_0 / df_1, so running it twice joins the already
# joined frames again.
df_0 = df_0.merge(db_0, how='inner', on=['Latitud', 'Longitud', 'Fecha Evento'])
df_1 = df_1.merge(db_1, how='inner', on=['Latitud', 'Longitud', 'Fecha Evento'])

In [183]:
# The same columns are removed from both frames, so the list is written once.
columns_to_drop = [
    'valor_humedad_suelo2', 'valor_humedad_suelo3', 'valor_humedad_suelo4',
    'Tipo Remoción en masa', 'Unnamed: 0', 'Región', 'Comuna', 'Factor desencadenante',
    'Sistema Georeferencia', 'Cota (m.s.n.m)', 'Fecha Evento',
]
df_1 = df_1.drop(columns=columns_to_drop)
df_0 = df_0.drop(columns=columns_to_drop)

# Fill missing values with the column mean of the same frame.
# numeric_only=True keeps this working if a non-numeric column is still
# present: recent pandas versions raise instead of silently skipping it.
# NOTE(review): df_1 holds only Valor == 1 rows and df_0 only Valor == 0 rows,
# so the fill value depends on the label and is computed before any
# train/test split — consider imputing inside the modelling pipeline instead.
df_1 = df_1.fillna(df_1.mean(numeric_only=True))
df_0 = df_0.fillna(df_0.mean(numeric_only=True))

In [184]:
# Stack negatives and positives into one table. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the row labels of df_0 and df_1
# would both be kept and repeat, making label-based lookups ambiguous.
df = pd.concat([df_0, df_1], ignore_index=True)
# Coordinates were only needed as join keys; they are not used as features.
df = df.drop(columns=['Longitud', 'Latitud'])
df_x = df.drop(columns='Valor')
df_y = df.Valor
df_x.shape, df_y.shape

((1070, 136), (1070,))

In [185]:
# Use CART to select features (top 5 by tree importance, computed on df).
cart_features = cart_feature_selection(df, 'Valor', n_features=5)
print("Características seleccionadas por CART:", cart_features)

# Use the genetic algorithm to select features (default generations and
# population size; each fitness evaluation runs a 5-fold cross-validation).
genetic_features = genetic_feature_selection(df, 'Valor')
print("Características seleccionadas por el algoritmo genético:", genetic_features)

Características seleccionadas por CART: ['PIRange_Bulkd.5-15cm.tif', 'PIRange_Clay.0-5cm.tif', 'ksat_30-60cm.tif', 'valor_humedad_suelo1', 'PIRange_Sand.5-15cm.tif']




Características seleccionadas por el algoritmo genético: ['PIRange_Bulkd.0-5cm.tif', 'PIRange_Bulkd.100-200cm.tif', 'PIRange_Bulkd.15-30cm.tif', 'PIRange_Bulkd.30-60cm.tif', 'PIRange_Bulkd.60-100cm.tif', 'PIRange_Clay.0-5cm.tif', 'PIRange_Clay.15-30cm.tif', 'PIRange_Clay.30-60cm.tif', 'PIRange_Sand.0-5cm.tif', 'PIRange_Sand.100-200cm.tif', 'PIRange_Sand.60-100cm.tif', 'alpha_30-60cm.tif', 'alpha_60-100cm.tif', 'AvMoist.0-5cm.tif', 'AWC_0-5cm.tif', 'AWC_100-200cm.tif', 'AWC_5-15cm.tif', 'AWC_60-100cm.tif', 'FC.0-5cm.tif', 'FC.100-200cm.tif', 'FC.15-30cm.tif', 'FC.30-60cm.tif', 'FC.60-100cm.tif', 'ksat_15-30cm.tif', 'ksat_30-60cm.tif', 'ksat_60-100cm.tif', 'n_15-30cm.tif', 'n_5-15cm.tif', 'PWP.100-200cm.tif', 'PWP.15-30cm.tif', 'PWP.5-15cm.tif', 'theta_r_0-5cm.tif', 'theta_r_30-60cm.tif', 'theta_s_5-15cm.tif', 'theta_s_60-100cm.tif', 'alpha.100-200cm.tif', 'alpha.15-30cm.tif', 'alpha.30-60cm.tif', 'alpha.5-15cm.tif', 'alpha.60-100cm.tif', 'ksat.100-200cm.tif', 'ksat.30-60cm.tif', 'ksat.6

In [187]:
# Show the feature names returned by the genetic search.
genetic_features

['PIRange_Bulkd.0-5cm.tif',
 'PIRange_Bulkd.100-200cm.tif',
 'PIRange_Bulkd.15-30cm.tif',
 'PIRange_Bulkd.30-60cm.tif',
 'PIRange_Bulkd.60-100cm.tif',
 'PIRange_Clay.0-5cm.tif',
 'PIRange_Clay.15-30cm.tif',
 'PIRange_Clay.30-60cm.tif',
 'PIRange_Sand.0-5cm.tif',
 'PIRange_Sand.100-200cm.tif',
 'PIRange_Sand.60-100cm.tif',
 'alpha_30-60cm.tif',
 'alpha_60-100cm.tif',
 'AvMoist.0-5cm.tif',
 'AWC_0-5cm.tif',
 'AWC_100-200cm.tif',
 'AWC_5-15cm.tif',
 'AWC_60-100cm.tif',
 'FC.0-5cm.tif',
 'FC.100-200cm.tif',
 'FC.15-30cm.tif',
 'FC.30-60cm.tif',
 'FC.60-100cm.tif',
 'ksat_15-30cm.tif',
 'ksat_30-60cm.tif',
 'ksat_60-100cm.tif',
 'n_15-30cm.tif',
 'n_5-15cm.tif',
 'PWP.100-200cm.tif',
 'PWP.15-30cm.tif',
 'PWP.5-15cm.tif',
 'theta_r_0-5cm.tif',
 'theta_r_30-60cm.tif',
 'theta_s_5-15cm.tif',
 'theta_s_60-100cm.tif',
 'alpha.100-200cm.tif',
 'alpha.15-30cm.tif',
 'alpha.30-60cm.tif',
 'alpha.5-15cm.tif',
 'alpha.60-100cm.tif',
 'ksat.100-200cm.tif',
 'ksat.30-60cm.tif',
 'ksat.60-100cm.tif',
 

In [201]:
# Run the CART-based selection on df again (same call as in the earlier cell);
# cart_feature_selection builds a new list on every call.
cart_features = cart_feature_selection(df, 'Valor', n_features=5)
print("Características seleccionadas por CART:", cart_features)

Características seleccionadas por CART: ['PIRange_Bulkd.5-15cm.tif', 'PIRange_Clay.0-5cm.tif', 'ksat_30-60cm.tif', 'valor_humedad_suelo1', 'PIRange_Sand.5-15cm.tif']


In [202]:
# new_cart_features is the same list object as cart_features (no copy is
# made), so the additions below are visible through both names; the cell that
# builds X indexes df with cart_features and relies on that.
new_cart_features = cart_features
# Add the extra predictors only if they are not present yet, so running this
# cell more than once does not produce duplicated column names.
for _extra_feature in ('PP', 'slope'):
    if _extra_feature not in new_cart_features:
        new_cart_features.append(_extra_feature)

In [203]:
# Display cart_features. It is the same list object as new_cart_features, so
# the appended 'PP' and 'slope' entries show up here as well.
cart_features

['PIRange_Bulkd.5-15cm.tif',
 'PIRange_Clay.0-5cm.tif',
 'ksat_30-60cm.tif',
 'valor_humedad_suelo1',
 'PIRange_Sand.5-15cm.tif',
 'PP',
 'slope']

In [204]:
# Model inputs: the columns listed in cart_features; target: the 'Valor' label.
X = df.loc[:, cart_features]
y = df['Valor']

In [205]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier

# Candidate values sampled by RandomizedSearchCV for the XGBoost classifier.
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0, 1.0, 10.0]
}

# Seeded, stratified 80/20 split: the same rows land in the test set on every
# run, and both parts keep the class proportions of y.
# NOTE(review): cart_features was chosen using all rows of df, i.e. including
# the rows held out here, so the test score below is likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f'Numero de entrenamiendo: {X_train.shape}, Numero de test: {X_test.shape}')

# Seed the model as well: subsample / colsample_bytree values below 1.0 make
# training stochastic.
model = XGBClassifier(random_state=42)
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=50, scoring='accuracy', cv=5, verbose=1, random_state=42)
random_search.fit(X_train, y_train)

# Evaluate the best configuration found by the search on the held-out rows.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Precisión de {acc*100}')

Numero de entrenamiendo: (856, 7), Numero de test: (214, 7)
Fitting 5 folds for each of 50 candidates, totalling 250 fits
