In [75]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split,StratifiedKFold # Model evaluation
from sklearn.preprocessing import LabelEncoder, RobustScaler, OneHotEncoder, StandardScaler # Preprocessing
from sklearn.linear_model import Lasso, Ridge, ElasticNet,  LassoLarsIC, RANSACRegressor, SGDRegressor, HuberRegressor, BayesianRidge # Linear models
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor  # Ensemble methods
from xgboost import XGBRegressor, plot_importance # XGBoost
from sklearn.svm import SVR, SVC, LinearSVC  # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline # Streaming pipelines
from sklearn.decomposition import KernelPCA, PCA # Dimensionality reduction
from sklearn.feature_selection import SelectFromModel # Dimensionality reduction
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV # Model evaluation
from sklearn.base import clone, BaseEstimator, TransformerMixin, RegressorMixin # Clone estimator
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import explained_variance_score, roc_auc_score, median_absolute_error, r2_score, mean_squared_error #To evaluate our model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split 
from scipy.stats import norm, skew

In [91]:
# Read the raw train / test tables from the input directory.
train = pd.read_csv("../input/train.csv")
categorical_cols = train.select_dtypes(include="object").columns  # string-typed columns of train

test = pd.read_csv("../input/test.csv")
testID = test["id"]  # kept aside: the `id` column is dropped from the features later

In [92]:
# Split the label off the training table.
y = train["target"]
train = train.drop(columns=["target"])

# Stack train on top of test (train rows first) so that both receive the same
# preprocessing, and discard the identifier column.
features = pd.concat([train, test]).drop(columns=["id"])

In [93]:
# Switch that the later `if skew:` check is apparently meant to read.
# NOTE(review): the name `skew` is also imported from scipy.stats in the import
# cell, and it is rebound once more by `def skew(...)` in the next cell, so this
# False value is not what `if skew:` sees there.
skew = False

In [94]:
def skew(features, threshold=0.75, lam=0.15):
    """Apply ``boxcox1p`` to the strongly skewed numeric columns of ``features``.

    The sample skewness of every numeric column is computed on its non-null
    values. Each column whose absolute skewness is greater than ``threshold``
    is replaced by ``boxcox1p(column, lam)``; all other columns are left
    untouched.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame to transform. It is modified in place and also returned.
    threshold : float, default 0.75
        Absolute skewness above which a column is transformed.
    lam : float, default 0.15
        Lambda passed to ``scipy.special.boxcox1p``.

    Returns
    -------
    pandas.DataFrame
        The same ``features`` object, with the selected columns transformed.
    """
    # Local imports: this function is itself called `skew`, so scipy's function
    # of the same name has to be brought in under a different name.
    from scipy.special import boxcox1p
    from scipy.stats import skew as sample_skew

    numeric_feats = features.select_dtypes(include="number").columns

    # Skewness per numeric column, ignoring missing values. Built from a dict
    # so that a frame without numeric columns yields an empty Series.
    skewness = pd.Series(
        {feat: sample_skew(features[feat].dropna()) for feat in numeric_feats},
        dtype=float,
    ).sort_values(ascending=False)

    # Filter the Series itself so that columns at or below the threshold are
    # really dropped. Masking a one-column DataFrame with a boolean DataFrame
    # only blanks the values to NaN and keeps every row, which would make
    # every numeric column count as "skewed".
    skewed_features = skewness[skewness.abs() > threshold].index
    print("There are {} skewed numerical features to Box Cox transform".format(len(skewed_features)))

    for feat in skewed_features:
        features[feat] = boxcox1p(features[feat], lam)
    return features

# NOTE(review): at this point `skew` is the function defined above (the `def`
# rebinds the earlier `skew = False` flag), and a function object is truthy,
# so this branch runs regardless of the flag's value.
if skew:
    features = skew(features)

There are 11 skewed numerical features to Box Cox transform


In [95]:
# Switches read by the categorical-encoding cell that follows.
overfit = False        # with one-hot encoding: also drop columns that are zero in >99% of rows
label_encoder = False  # encode each categorical column with LabelEncoder
one_hot = True         # expand the frame with pd.get_dummies

In [96]:
# Encode the categorical columns with exactly one strategy.
# The branches are mutually exclusive (`elif`): pd.get_dummies replaces the
# categorical columns with indicator columns, so label-encoding them afterwards
# would fail with a KeyError if both switches were on. One-hot wins in that case.
if one_hot:
    features = pd.get_dummies(features)
    if overfit:
        # Drop near-constant columns: those that are zero in more than 99% of rows.
        zero_share = features.eq(0).mean()
        drop_cols = zero_share[zero_share > .99].index
        features = features.drop(columns=drop_cols)
elif label_encoder:
    le = LabelEncoder()
    for col in categorical_cols:
        # fit_transform refits the encoder, so each column gets its own mapping.
        features[col] = le.fit_transform(features[col])


In [97]:
# Split the stacked frame back by position: the first len(y) rows are the
# training rows, everything after them is the test set.
n_train = len(y)
train, test = features.iloc[:n_train], features.iloc[n_train:]

In [98]:
# Candidate models, keyed by a short name.
#
# Logistic regression fits a linear function of the (scaled) features and
# passes it through a sigmoid to obtain a class probability; the coefficients
# are chosen by (regularised) maximum likelihood.
pipelines = {
    "logistic_regression": Pipeline([
        ('Scaler', StandardScaler()),
        ('classifier', LogisticRegression()),
    ]),

    # 'reg:squarederror' is the current name of the deprecated 'reg:linear'
    # objective; the loss is the same.
    # NOTE(review): the step is named 'classifier' but wraps a regressor with a
    # squared-error loss - confirm that this is intended.
    "xgb": Pipeline([
        ('Scaler', StandardScaler()),
        ('classifier', XGBRegressor(objective='reg:squarederror',
                                    n_estimators=10, seed=123)),
    ]),

    "xgb2": xgb.XGBClassifier(
        n_estimators=5,
        max_depth=10,
        learning_rate=0.5,
        seed=123,
    ),
}

In [99]:
# LightGBM hyper-parameters shared by every fold.
params = {
    'metric': 'auc',
    'reg_alpha': 6.010538011450937,
    'reg_lambda': 0.031702113663443346,
    'colsample_bytree': 0.27,
    'subsample': 0.6,
    'learning_rate': 0.05,
    'max_depth': 100,
    'num_leaves': 100,
    'min_child_samples': 216,
    'cat_smooth': 87,
    'random_state': 48,
    'n_estimators': 20000,
}

preds = np.zeros(test.shape[0])  # running average of the test-set probabilities
# Stratified folds: the author notes the target is unbalanced, and stratifying
# keeps a fold from ending up with only zeros.
kf = StratifiedKFold(n_splits=5, random_state=48, shuffle=True)
auc = []  # validation AUC of each fold
n = 0     # number of folds finished so far

for trn_idx, test_idx in kf.split(train, y):
    X_tr, y_tr = train.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = train.iloc[test_idx], y.iloc[test_idx]

    model = LGBMClassifier(**params)
    # NOTE(review): newer LightGBM releases expect early stopping / verbosity
    # to be passed through `callbacks=` instead of these fit keyword arguments -
    # confirm against the installed version.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)

    # Each fold contributes 1/n_splits of the final test prediction.
    preds += model.predict_proba(test)[:, 1] / kf.n_splits

    fold_auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
    auc.append(fold_auc)
    n += 1
    print(n, fold_auc)

1 0.8942635164251782
2 0.8960967723185278
3 0.894482208007798
4 0.8956907416756221
5 0.8965596568800058


In [74]:
# Pair every test id with its fold-averaged probability and write the
# submission file (without the index column).
sub = pd.DataFrame({"id": testID}).assign(target=preds)
sub.to_csv("../output/LGB_sub2.csv", index=False)

In [76]:
# Load a submission file from the output directory.
# NOTE(review): this reads LGB_sub.csv, whereas the previous cell writes
# LGB_sub2.csv - confirm which file is meant.
k = pd.read_csv("../output/LGB_sub.csv")


0         0.115566
1         0.447129
2         0.010611
3         0.239745
4         0.101847
            ...   
199995    0.896518
199996    0.049339
199997    0.701568
199998    0.115219
199999    0.526594
Name: target, Length: 200000, dtype: float64