# Simulate Data

In [153]:
import numpy as np
from numpy import random as rd
import pandas as pd

In [155]:
# define X variables, 100,000 observations
rd.seed(22)
num_obs = 100000

X1 = rd.randn(num_obs)*2.0
X2 = rd.random(num_obs)*10+4
X3 = rd.choice([0,1], num_obs)
X4 = rd.randint(0,101,num_obs)
X5 = rd.rand(num_obs)*11-5
X6 = rd.lognormal(0,0.03,num_obs)
X7 = rd.choice(['male', 'female'], num_obs)





# define Y variable, 70% zero, 30% one
Y = np.floor(rd.rand(num_obs)/0.7)

df = pd.DataFrame(np.array([X1,X2,X3,X4,X5,X6,X7,Y]).transpose(),
                  columns=['X1','X2','X3','X4','X5','X6','X7','Y'])

df = df.astype(dtype= {"X1":"float64", "X2":"float64", "X3":"float64", "X4":"int32",
                      "X5":"float64", "X6":"float64", "Y":"float64"})

# missing values
df["X1"][(df["X4"] > 1) & (df["X4"] < 10)] = np.NAN

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["X1"][(df["X4"] > 1) & (df["X4"] < 10)] = np.NAN


# Data Check


In [154]:
df.shape

(100000, 8)

In [152]:
df.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,Y
0,,4.679251,0.0,9,-3.309713,0.996256,female,0.0
1,-2.926701,4.88184,1.0,27,-0.1312,1.002022,female,0.0
2,2.163583,8.385173,0.0,37,-2.130169,0.99291,female,1.0
3,-0.47865,9.881707,0.0,38,-3.724918,1.030024,male,0.0
4,-0.982258,8.630986,1.0,81,3.550159,0.990598,female,1.0
5,-2.004544,5.862744,0.0,21,2.54084,1.004915,female,0.0
6,1.837643,7.147812,1.0,98,-3.784828,0.968413,female,0.0
7,-2.207264,13.303804,0.0,42,-2.72201,1.030733,male,0.0
8,1.252987,13.513119,0.0,20,4.671809,0.968756,female,0.0
9,-1.123028,9.983652,1.0,31,0.557406,0.970501,male,1.0


In [156]:
np.transpose(df.describe())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
X1,92017.0,0.004896,2.001397,-10.126068,-1.347457,0.008635,1.364258,8.617236
X2,100000.0,9.007588,2.891063,4.000178,6.506862,9.012539,11.510775,13.999772
X3,100000.0,0.49743,0.499996,0.0,0.0,0.0,1.0,1.0
X4,100000.0,50.00276,29.149843,0.0,25.0,50.0,75.0,100.0
X5,100000.0,0.508507,3.174391,-4.999733,-2.241846,0.515214,3.251121,5.999996
X6,100000.0,1.000417,0.029994,0.881713,0.980033,0.999927,1.020329,1.141803
Y,100000.0,0.30028,0.458382,0.0,0.0,0.0,1.0,1.0


In [157]:
df.dtypes

X1    float64
X2    float64
X3    float64
X4      int32
X5    float64
X6    float64
X7     object
Y     float64
dtype: object

# Data preparation (ensemble version)

1. Missing imputation
2. Encoding
3. Make sure all attributes are numeric (sklearn)


In [89]:
df = df.fillna(-999999)

In [158]:
gender = pd.get_dummies(df.X7)
gender.drop(["male"], axis =1)
df.X7 = gender.female

In [104]:
df.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,Y
0,-999999.0,4.679251,0.0,9,-3.309713,0.996256,1,0.0
1,-2.926701,4.88184,1.0,27,-0.1312,1.002022,1,0.0
2,2.163583,8.385173,0.0,37,-2.130169,0.99291,1,1.0
3,-0.47865,9.881707,0.0,38,-3.724918,1.030024,0,0.0
4,-0.982258,8.630986,1.0,81,3.550159,0.990598,1,1.0
5,-2.004544,5.862744,0.0,21,2.54084,1.004915,1,0.0
6,1.837643,7.147812,1.0,98,-3.784828,0.968413,1,0.0
7,-2.207264,13.303804,0.0,42,-2.72201,1.030733,0,0.0
8,1.252987,13.513119,0.0,20,4.671809,0.968756,1,0.0
9,-1.123028,9.983652,1.0,31,0.557406,0.970501,0,1.0


# Sampling

In [105]:
from sklearn.model_selection import train_test_split

In [107]:
X = df.drop(["Y"], axis =1)
Y = df.Y
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Train model

In [159]:
from sklearn.ensemble import GradientBoostingClassifier

In [140]:
for trees in [20, 100, 200, 500]:
    for learnin_rate in [0.1, 0.2, 0.5]:
        gb_clf = GradientBoostingClassifier(n_estimators=trees, learning_rate=learnin_rate, 
                                    max_depth=4, random_state=0)
        DFDF
gb_clf.fit(X_train, y_train)

GradientBoostingClassifier(max_depth=4, n_estimators=200, random_state=0)

In [149]:
predicted_values = gb_clf.predict_proba(X_test)[:,1]

In [150]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, predicted_values)

0.5021794239716677