In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings("ignore")

In [59]:
DATA_PATH = './data/'
ORIGINAL_DATA_PATH = DATA_PATH
SUBMISSIONS_PATH = './submissions/'
MODELS_PATH = './trained_models/'
TEMP_PATH = './temp/'

# Load the data
train = pd.read_csv(DATA_PATH + 'train.csv', index_col='id')
test = pd.read_csv(DATA_PATH + 'test.csv', index_col='id')
original = pd.read_csv(ORIGINAL_DATA_PATH + 'flood.csv')
original.index.rename('id', inplace=True)
new_train = pd.concat([train, original], axis=0)

In [60]:
from scipy.stats import hmean, gmean
from sklearn.preprocessing import PolynomialFeatures

NON_FEATURES = ['id', 'FloodProbability', 'fold']
BASE_FEATURES = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors']

def add_features(df):
    df['total'] = df[BASE_FEATURES].sum(axis=1)
    df['amplified_sum'] = (df[BASE_FEATURES] ** 1.5).sum(axis=1)
    df['fskew'] = df[BASE_FEATURES].skew(axis=1)
    df['fkurtosis'] = df[BASE_FEATURES].kurtosis(axis=1)
    df['std'] = df[BASE_FEATURES].std(axis=1)
    df['max'] = df[BASE_FEATURES].max(axis=1)
    df['min'] = df[BASE_FEATURES].min(axis=1)
    df['range'] = df['max'] - df['min']
    df['median'] = df[BASE_FEATURES].median(axis=1)
    df['ptp'] = df[BASE_FEATURES].values.ptp(axis=1)
    df['q25'] = df[BASE_FEATURES].quantile(0.25, axis=1)
    df['q75'] = df[BASE_FEATURES].quantile(0.75, axis=1)
    
    
    return df

train = add_features(train)
FEATURES = [col for col in train.columns if col not in NON_FEATURES]
train = train[FEATURES + ['FloodProbability']]
test = add_features(test)

In [61]:
# print correlation sorted by target
correlation = train.corr()
correlation = correlation['FloodProbability'].sort_values(ascending=False)

print(correlation)

TypeError: DataFrame.sort_values() missing 1 required positional argument: 'by'

In [None]:
import torch 
import torch.nn as nn

X = train[FEATURES].values
y = train['FloodProbability'].values

X = torch.tensor(X, dtype=torch.float32)#.to('cuda')
Q = 25
U,S,V = torch.pca_lowrank(X, q=Q)

In [None]:
# reconstuct the dataframe 
U = U.detach().numpy()
X_pca = pd.DataFrame(U, columns=[f'pca_{i}' for i in range(Q)])

In [None]:
# split the data to train and validation
from sklearn.model_selection import train_test_split
X_train_pca, X_val_pca, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# define the model
lgbm_params = {
    'boosting_type': 'gbdt', 
    'num_leaves': 227, 
    'subsample_for_bin': 204195, 
    'min_child_samples': 98, 
    'max_depth': 14, 
    'learning_rate': 0.008725580097840743, 
    'n_estimators': 1486, 
    'subsample': 0.6924206162743796, 
    'colsample_bytree': 0.608985636026134, 
    'reg_alpha': 0.000982304606619489, 
    'reg_lambda': 4.733782716082672
    }

model = LGBMRegressor(device='cuda', verbose=-1)

model.fit(X_train_pca, y_train)
pca_preds  = model.predict(X_val_pca)
print('pca preds:', r2_score(y_val, pca_preds))

model.fit(X_train, y_train)
preds = model.predict(X_val)
print('original preds:', r2_score(y_val, preds))


pca preds: 0.8642885617977527
original preds: 0.8684897199424749


In [None]:
# # transform the test data
# X_test = test[FEATURES].values
# X_test = torch.tensor(X_test, dtype=torch.float32)
# # centerr the data 
# X_test = X_test - X.mean(0)
# # apply the pca
# X_test_pca = torch.mm(X_test, V)

# model.fit(X_pca, y)
# preds = model.predict(X_test_pca)

# submission = pd.DataFrame({'id': test.index, 'FloodProbability': preds})
# submission.to_csv(SUBMISSIONS_PATH + 'lgbm_pca.csv', index=False)