In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import Lasso
import seaborn as sns
import sys
import warnings
from scipy.stats import skew , kurtosis
from sklearn.cluster import FeatureAgglomeration
from sklearn.model_selection import train_test_split
import lightgbm as lgb
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the entries of the ../input directory.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

### Loading train and test data

In [None]:
# Read the training and test tables from the input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Store log(1 + target) next to the raw target; it is the label used for modelling.
train['log_target'] = np.log1p(train['target'])

print(train.shape, test.shape)

In [None]:
# Remove the identifier column from the test frame. Reassigning with
# errors="ignore" (instead of dropping in place) keeps this cell safe to
# re-run: an in-place drop raises KeyError once "ID" is already gone.
test = test.drop(columns="ID", errors="ignore")

# Labels: the log1p-transformed target.
y = train["log_target"].values

# Every training column that is neither the identifier nor a target is a feature.
NON_FEATURE_COLS = {"ID", "target", "log_target"}
X_cols = [col for col in train.columns if col not in NON_FEATURE_COLS]

### Applying a log transformation to features

In [None]:
# A feature is selected for the log transform when its value range
# (max - min) in the training set exceeds 1000.
feats_to_convert = [
    name for name in X_cols
    if train[name].max() - train[name].min() > 1000
]
print(len(feats_to_convert))

# Apply log1p to the selected columns of both frames; the selection itself
# is based on the training data only.
for frame in (train, test):
    frame[feats_to_convert] = np.log1p(frame[feats_to_convert].values)

### Lasso Feature Selection 

In [None]:
# L1-regularised linear regression on the log target. Features whose
# coefficient ends up exactly zero are discarded; the rest are kept.
# `normalize` is deliberately not passed: False was its default value, and
# the argument was removed in scikit-learn 1.2, where passing it raises
# TypeError.
model = Lasso(alpha=0.031, max_iter=1000, fit_intercept=True, random_state=42)
model.fit(X=train[X_cols].values, y=train.log_target.values)

# Positions of the non-zero coefficients, mapped back to column names.
imp_feats_indexes = np.nonzero(model.coef_)[0]
imp_feats = np.array(X_cols)[imp_feats_indexes]
print('Number of important features selected by lasso:', len(imp_feats))

### Add Statistical Features

In [None]:
def _with_row_stats(selected, full, nonzero):
    """Return a new frame: `selected` plus per-row summary statistics.

    selected -- columns to carry over unchanged.
    full     -- frame whose rows are summed (zeros included) for "sum".
    nonzero  -- `full` with zeros masked to NaN; pandas reductions skip NaN,
                so var/mean/std/skew/kurtosis describe only the non-zero
                entries of each row.
    """
    # DataFrame.assign builds a new frame, so the statistics are never written
    # into a slice of train/test (adding columns to such a slice directly is
    # the pattern pandas reports as SettingWithCopyWarning).
    return selected.assign(**{
        "sum": full.sum(axis=1),
        "var": nonzero.var(axis=1),
        "mean": nonzero.mean(axis=1),
        "std": nonzero.std(axis=1),
        "skew": nonzero.skew(axis=1),
        "kurtosis": nonzero.kurtosis(axis=1),
    })


# All feature columns of the training set (also used by later cells).
d = train[X_cols]

# Copies in which every zero is replaced by NaN.
tmp_train = d[d != 0]
tmp_test = test[test != 0]

# Lasso-selected columns plus row statistics, for train and test respectively.
TF = _with_row_stats(train[imp_feats], d, tmp_train)
TFT = _with_row_stats(test[imp_feats], test, tmp_test)

### Feature Agglomeration

In [None]:
# Group the training feature columns into 60 clusters.
agglo = FeatureAgglomeration(n_clusters=60)
agglo.fit(X=d, y=y)

In [None]:
# Map both the training features and the test frame through the fitted
# agglomeration, producing one column per cluster.
X_new, X_TNEW = (agglo.transform(frame) for frame in (d, test))

In [None]:
# Column names for the agglomerated features: feat_agg1 ... feat_agg60.
# The count must match the n_clusters given to FeatureAgglomeration.
cols = ['feat_agg' + str(i) for i in range(1, 61)]

In [None]:
# Wrap the transformed arrays in DataFrames labelled with the names in `cols`.
agglo_df_train = pd.DataFrame(data=X_new, columns=cols)
agglo_df_test = pd.DataFrame(data=X_TNEW, columns=cols)

In [None]:
# Combined training features: TF side by side with the agglomerated columns.
Comb_feats = pd.concat(objs=[TF, agglo_df_train], axis="columns")
Comb_feats.shape

In [None]:
# Combined test features, built the same way as the training ones.
Comb_feats_test = pd.concat(objs=[TFT, agglo_df_test], axis="columns")
Comb_feats_test.shape

## Modelling

In [None]:
# Hold out 25% of the training rows for validation; the seed is fixed so the
# split is the same on every run.
dev_X, val_X, dev_y, val_y = train_test_split(
    Comb_feats,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    """Train a LightGBM regressor with early stopping and predict on test_X.

    Parameters
    ----------
    train_X, train_y : features and labels used for fitting.
    val_X, val_y : features and labels of the validation set passed to
        lgb.train alongside the training set.
    test_X : features to predict once training has finished.

    Returns
    -------
    pred_test_y : predictions for test_X at the best iteration, passed
        through np.expm1 (i.e. labels are expected on a log1p scale).
    model : the object returned by lgb.train.
    evals_result : dict populated by lgb.train with the evaluation history.
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "num_leaves": 80,
        "max_depth": 7,
        "bagging_fraction": 0.5,
        "feature_fraction": 0.7,
        # LightGBM's parameter name is "bagging_freq". The key
        # "bagging_frequency" is not recognised, which leaves bagging disabled
        # and makes bagging_fraction a no-op.
        "bagging_freq": 6,
        "max_bin": 130,
        "bagging_seed": 42,
        "verbosity": -1,
        "seed": 42,
        # NOTE(review): no value was ever supplied for lambda_l1. 0.0 is
        # LightGBM's default (no L1 regularisation); tune if one was intended.
        "lambda_l1": 0.0,
    }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    # Up to 5000 boosting rounds, stopping early after 100 rounds without
    # improvement; progress is logged every 100 rounds.
    # NOTE(review): early_stopping_rounds / verbose_eval / evals_result were
    # removed from lgb.train in LightGBM 4.0 in favour of callbacks; this call
    # targets the older API the notebook was written against.
    model = lgb.train(params, lgtrain, 5000,
                      valid_sets=[lgtrain, lgval],
                      early_stopping_rounds=100,
                      verbose_eval=100,
                      evals_result=evals_result)

    # Undo the log1p scale of the labels on the way out.
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

In [None]:
# Fit on the development split, monitor the validation split, and predict
# the combined test features.
pred_test, model, evals_result = run_lgb(
    train_X=dev_X,
    train_y=dev_y,
    val_X=val_X,
    val_y=val_y,
    test_X=Comb_feats_test,
)
print("LightGBM Training Completed...")

In [None]:
# Pair the predictions with the IDs from the sample submission file and
# write them out as sub_lgb.csv (columns: target, ID).
sub = pd.read_csv('../input/sample_submission.csv')
sub_lgb = pd.DataFrame({"target": pred_test})
sub_lgb["ID"] = sub["ID"]
sub_lgb.to_csv("sub_lgb.csv", index=False)
print(pred_test)
