In [1]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split,StratifiedKFold # Model evaluation
from sklearn.preprocessing import LabelEncoder, RobustScaler, OneHotEncoder, StandardScaler # Preprocessing
from sklearn.linear_model import Lasso, Ridge, ElasticNet,  LassoLarsIC, RANSACRegressor, SGDRegressor, HuberRegressor, BayesianRidge # Linear models
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor  # Ensemble methods
from xgboost import XGBRegressor, plot_importance # XGBoost
from sklearn.svm import SVR, SVC, LinearSVC  # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline # Streaming pipelines
from sklearn.decomposition import KernelPCA, PCA # Dimensionality reduction
from sklearn.feature_selection import SelectFromModel # Dimensionality reduction
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV # Model evaluation
from sklearn.base import clone, BaseEstimator, TransformerMixin, RegressorMixin # Clone estimator
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import explained_variance_score, roc_auc_score, median_absolute_error, r2_score, mean_squared_error #To evaluate our model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split 

In [2]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
testID = test.id
categorical_cols = train.select_dtypes("object").columns

In [3]:
y = train.target
train = train.drop(["target"],axis=1)
features = pd.concat([train,test])
features = features.drop(["id"],axis=1)

In [4]:
from scipy.stats import norm, skew
numeric_feats = features.dtypes[features.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})


skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    features[feat] = boxcox1p(features[feat], lam)

There are 11 skewed numerical features to Box Cox transform


In [5]:
one_hot = True
label_encoder = False

In [6]:
if one_hot:
    features = pd.get_dummies(features)
    
    
    
if label_encoder:
    train = pd.read_csv("../input/train.csv")
    test = pd.read_csv("../input/test.csv")
    testID = test.id


    all_df = pd.concat([train , test]).reset_index(drop = True)

    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    for col in categorical_cols:
        all_df[col] = le.fit_transform(all_df[col])

    train = all_df[:train.shape[0]]
    test = all_df[train.shape[0]:].reset_index(drop = True)

In [7]:
x = features == 0
x = (x.sum()/len(features)>.99).to_frame()
drop_cols = x[x.iloc[:,0]].index
features = features.drop(drop_cols,axis=1)

In [8]:
train = features[:len(y)]
test = features[len(y):]

In [9]:
#logistic regression works like linear regression but shifts the line forward depending on where the switch is until it maximizes likelihood

pipelines = {
    "logistic_regression": Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
    ]),
     
    
    "xgb": Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', XGBRegressor(objective ='reg:linear', 
                  n_estimators = 10, seed = 123)),
    ]),
    
    "xgb2": xgb.XGBClassifier(n_estimators=5,
    max_depth=10,
    learning_rate=0.5,
    seed=123)
    
    
}

In [156]:
clf = pipelines["logistic_regression"]
clf.fit(train,y)
clf.score(train,y)

In [None]:
seed = 5
n_folds =5
scoring='auc'
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
np.sqrt(-cross_val_score(clf, train, y, cv= kfold,
                                 scoring=scoring, n_jobs=1))







In [13]:
params={'metric': 'auc', 'reg_alpha': 6.010538011450937, 'reg_lambda': 0.031702113663443346, 'colsample_bytree': 0.27,
   'subsample': 0.6, 'learning_rate': 0.05, 'max_depth': 100, 'num_leaves': 100, 'min_child_samples': 216,
   'cat_smooth': 87, 'random_state': 48,'n_estimators': 100}
preds = np.zeros(test.shape[0])        
kf = StratifiedKFold(n_splits=5,random_state=48,shuffle=True) #As we can see the data is unbalanced that's why I'll use StratifiedKFold to split data: Don't want all zeros in a split                 
auc=[]   # list contains AUC for each fold  
n=0   
for trainVals, testVals in kf.split(train,y):
    train_split = train.iloc[trainVals]
    y_split = y.iloc[trainVals]
    val_split = train.iloc[testVals]
    y_val_split = y.iloc[testVals]
    model = LGBMClassifier(**params) 
    model.fit(train_split,y_split, eval_set=[(val_split,y_val_split)], early_stopping_rounds=100)
    score = roc_auc_score(y_val_split, model.predict_proba(val_split)[:, 1])
    print(score)
    preds += model.predict_proba(test)[:, 1]/kf.n_splits 
    

[1]	valid_0's auc: 0.844622
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.854938
[3]	valid_0's auc: 0.851165
[4]	valid_0's auc: 0.864183
[5]	valid_0's auc: 0.869804
[6]	valid_0's auc: 0.870141
[7]	valid_0's auc: 0.871167
[8]	valid_0's auc: 0.871555
[9]	valid_0's auc: 0.872293
[10]	valid_0's auc: 0.875212
[11]	valid_0's auc: 0.876374
[12]	valid_0's auc: 0.877178
[13]	valid_0's auc: 0.877234
[14]	valid_0's auc: 0.877757
[15]	valid_0's auc: 0.878176
[16]	valid_0's auc: 0.878569
[17]	valid_0's auc: 0.878475
[18]	valid_0's auc: 0.878429
[19]	valid_0's auc: 0.878268
[20]	valid_0's auc: 0.878822
[21]	valid_0's auc: 0.879371
[22]	valid_0's auc: 0.879681
[23]	valid_0's auc: 0.879984
[24]	valid_0's auc: 0.879871
[25]	valid_0's auc: 0.880196
[26]	valid_0's auc: 0.880641
[27]	valid_0's auc: 0.880802
[28]	valid_0's auc: 0.880971
[29]	valid_0's auc: 0.881172
[30]	valid_0's auc: 0.881239
[31]	valid_0's auc: 0.881453
[32]	valid_0's auc: 0.881777
[33]	valid_0's auc:

[77]	valid_0's auc: 0.886581
[78]	valid_0's auc: 0.886639
[79]	valid_0's auc: 0.886665
[80]	valid_0's auc: 0.88677
[81]	valid_0's auc: 0.886823
[82]	valid_0's auc: 0.886906
[83]	valid_0's auc: 0.88702
[84]	valid_0's auc: 0.887041
[85]	valid_0's auc: 0.887092
[86]	valid_0's auc: 0.887152
[87]	valid_0's auc: 0.8872
[88]	valid_0's auc: 0.887275
[89]	valid_0's auc: 0.887394
[90]	valid_0's auc: 0.887464
[91]	valid_0's auc: 0.887514
[92]	valid_0's auc: 0.887555
[93]	valid_0's auc: 0.887594
[94]	valid_0's auc: 0.887672
[95]	valid_0's auc: 0.88771
[96]	valid_0's auc: 0.887785
[97]	valid_0's auc: 0.887836
[98]	valid_0's auc: 0.887888
[99]	valid_0's auc: 0.887943
[100]	valid_0's auc: 0.88801
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.88801
0.8880096690111776
[1]	valid_0's auc: 0.846649
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.85768
[3]	valid_0's auc: 0.854405
[4]	valid_0's auc: 0.867036
[5]	valid_0's auc: 0.871789
[6]	valid_0'

In [132]:
preds = clf.predict(test)

In [74]:
sub = pd.DataFrame({"id":testID,"target":preds})
sub.to_csv("../output/LGB_sub2.csv",index=False)

In [76]:
k = pd.read_csv("../output/LGB_sub.csv")


0         0.115566
1         0.447129
2         0.010611
3         0.239745
4         0.101847
            ...   
199995    0.896518
199996    0.049339
199997    0.701568
199998    0.115219
199999    0.526594
Name: target, Length: 200000, dtype: float64