In [36]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split,StratifiedKFold # Model evaluation
from sklearn.preprocessing import LabelEncoder, RobustScaler, OneHotEncoder, StandardScaler # Preprocessing
from sklearn.linear_model import Lasso, Ridge, ElasticNet,  LassoLarsIC, RANSACRegressor, SGDRegressor, HuberRegressor, BayesianRidge # Linear models
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor  # Ensemble methods
from xgboost import XGBRegressor, plot_importance # XGBoost
from sklearn.svm import SVR, SVC, LinearSVC  # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline # Streaming pipelines
from sklearn.decomposition import KernelPCA, PCA # Dimensionality reduction
from sklearn.feature_selection import SelectFromModel # Dimensionality reduction
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV # Model evaluation
from sklearn.base import clone, BaseEstimator, TransformerMixin, RegressorMixin # Clone estimator
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import explained_variance_score, roc_auc_score, median_absolute_error, r2_score, mean_squared_error #To evaluate our model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score #To evaluate our model
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split 
from scipy.stats import norm, skew

In [37]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
testID = test.id
categorical_cols = train.select_dtypes("object").columns
y = train.target
features = pd.concat([train,test])

In [38]:
skew = False

In [39]:
def skew(features):
    from scipy.stats import norm, skew
    numeric_feats = features.dtypes[features.dtypes != "object"].index

    # Check the skew of all numerical features
    skewed_feats = features[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew' :skewed_feats})


    skewness = skewness[abs(skewness) > 0.75]
    print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

    from scipy.special import boxcox1p
    skewed_features = skewness.index
    lam = 0.15
    for feat in skewed_features:
        features[feat] = boxcox1p(features[feat], lam)
    return features

if skew:
    features = skew(features)

There are 13 skewed numerical features to Box Cox transform


In [44]:
class spearman_corr:
    def __init__(self,train,test,features):
        self.features = features
        self.train = train
        self.test = test
        self.categorical_cols = self.test.select_dtypes("object").columns
        self.numerical_cols = self.test.select_dtypes(exclude="object").columns
    def make_key(self):
        self.key = pd.DataFrame(columns=["index"], index=pd.MultiIndex.from_tuples([], names=['variable','value']))

        for x in self.categorical_cols:
            kf = self.train.groupby(x)["target"].mean().to_frame().sort_values("target")
            kf = kf.reset_index().reset_index().drop(columns=["target"])
            kf["variable"] = x
            kf.index = kf.index + 1
            self.key = pd.concat([self.key, kf.rename(columns={x:"value"}).set_index(["variable","value"])])

    def encode(self):
        reshape_df = pd.melt(self.features, id_vars="id", value_vars=self.categorical_cols).merge(self.key, on=["variable","value"])
        qualDf= reshape_df.pivot(index='id', columns='variable')["index"]
        self.features = qualDf.merge(self.features[self.numerical_cols], on=["id"])
        
  
        self.new_train = self.features[:len(train)]
        self.new_test = self.features[len(train):]
    def get_key(self):
        return self.key
    def get_train(self):
        return self.new_train
    def get_test(self):
        return self.new_test
    def get_features(self):
        return self.features
    def get_y(self):
        return self.train.target

In [45]:
one_hot = False
label_encoder = False
overfit = False
spearman = True

In [46]:
if spearman:
    spearman = spearman_corr(train,test,features)
    spearman.make_key()
    spearman.encode()
    features = spearman.get_features()
    features[features.select_dtypes("object").columns] = features.select_dtypes("object").fillna(0).astype("int")
    
train = train.drop(["target"],axis=1)  
if one_hot:
    features = pd.get_dummies(features)
    if overfit:
        x = features == 0
        x = (x.sum()/len(features)>.99).to_frame()
        drop_cols = x[x.iloc[:,0]].index
        features = features.drop(drop_cols,axis=1)

    
if label_encoder:
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    for col in categorical_cols:
        features[col] = le.fit_transform(features[col])


In [64]:
features = features.drop(["id"],axis=1)
train = features[:len(y)]
test = features[len(y):]

In [48]:
#logistic regression works like linear regression but shifts the line forward depending on where the switch is until it maximizes likelihood

pipelines = {
    "logistic_regression": Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
    ]),
     
    
    "xgb": Pipeline([
    ('Scaler', StandardScaler()),
    ('classifier', XGBRegressor(objective ='reg:linear', 
                  n_estimators = 10, seed = 123)),
    ]),
    
    "xgb2": xgb.XGBClassifier(n_estimators=5,
    max_depth=10,
    learning_rate=0.5,
    seed=123)
    
    
}

In [49]:
params={'metric': 'auc', 'reg_alpha': 6.010538011450937, 'reg_lambda': 0.031702113663443346, 'colsample_bytree': 0.27,
   'subsample': 0.6, 'learning_rate': 0.05, 'max_depth': 100, 'num_leaves': 100, 'min_child_samples': 216,
   'cat_smooth': 87, 'random_state': 48,'n_estimators': 20000}
preds = np.zeros(test.shape[0])        
kf = StratifiedKFold(n_splits=5,random_state=48,shuffle=True) #As we can see the data is unbalanced that's why I'll use StratifiedKFold to split data: Don't want all zeros in a split                 
auc=[]   # list contains AUC for each fold  
n=0   
for train_idx, test_idx in kf.split(train,y):
    X_train,X_val=train.iloc[train_idx],train.iloc[test_idx]
    y_train,y_val=y.iloc[train_idx],y.iloc[test_idx]
    model = LGBMClassifier(**params) 
    model.fit(X_train,y_train,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False) 
    preds+=model.predict_proba(test)[:, 1]/kf.n_splits 
    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])) 
    print(n+1,auc[n])                                                                                       
    n+=1    
    

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: cat0, cat1, cat10, cat11, cat12, cat13, cat14, cat15, cat16, cat17, cat18, cat2, cat3, cat4, cat5, cat6, cat7, cat8, cat9

In [19]:
sub = pd.DataFrame({"id":testID,"target":preds})
sub.to_csv("../output/LGBM_labelEncoded_sub2.csv.csv",index=False)

In [76]:
k = pd.read_csv("../output/LGB_sub.csv")


0         0.115566
1         0.447129
2         0.010611
3         0.239745
4         0.101847
            ...   
199995    0.896518
199996    0.049339
199997    0.701568
199998    0.115219
199999    0.526594
Name: target, Length: 200000, dtype: float64