## Shortlist Promising Models

#### 0. Import modules and define parameters

In [1]:
from module.utils import general_utils
from module.utils import data_prepare_utils
from module.utils import model_performance_utils

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier

In [2]:
# --- Data source and target ---
train_df_file = "data/train_df.csv"
target_attr = "label"
# Forwarded to TargetEncoder(target_type=...) when the preprocessor is built.
target_type = "binary"

# --- Model selection ---
# Must be one of the keys of estimator_dict (it is used to index that dict).
estimator_name = "SGDClassifier"
# Passed as max_iter to SGDClassifier.
sgd_max_iter = 10_000

# --- Seeds, one per stochastic step ---
sample_random_state = 24                 # data_prepare_utils.sample_data
target_encoding_random_state = 42        # TargetEncoder
model_random_state = 42                  # every estimator in estimator_dict
permutation_importance_random_state = 0  # check_out_permutation_importance

#### 1. Import Data

In [3]:
# Load the training data from the path configured in the parameters cell.
train_df = general_utils.read_csv(
    train_df_file,
)


Read CSV file data/train_df.csv into DataFrame:
df.head(): 


Unnamed: 0,uid,task_id,adv_id,creat_type_cd,adv_prim_id,dev_id,inter_type_cd,slot_id,spread_app_id,tags,...,device_price,up_life_duration,up_membership_grade,membership_life_duration,consume_purchase,communication_onlinerate,communication_avgonline_30d,indu_name,pt_d,label
0,1641431,5177,1998,7,191,60,5,21,82,14,...,2,-1,-1,-1,2,-1,6,24,3,0
1,2021896,4628,4530,7,177,56,5,17,31,40,...,3,18,-1,-1,2,10^11^12^13^14^15^16^17^18^19^20^21^22^23,7,17,3,0
2,1790795,2709,1413,7,134,55,4,17,65,18,...,5,20,-1,-1,5,0^1^2^3^4^5^6^7^8^9^10^11^12^13^14^15^16^17^18...,13,14,1,0
3,1216709,1949,6143,7,150,17,5,21,11,39,...,2,16,-1,-1,2,8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23,10,36,3,0
4,1635521,4806,2176,7,206,64,5,15,22,39,...,4,20,-1,-1,2,6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23,10,36,1,0


df.shape: (838142, 36)


#### 2. Sample smaller training sets for fast exploration

In [4]:
# Fraction of the training data kept for fast exploration.
sample_fraction = 0.01
train_df_sample = data_prepare_utils.sample_data(
    train_df,
    sample_fraction,
    random_state=sample_random_state,
)


Sample 0.01 fraction from DataFrame:
sample_df.shape: (8381, 36)


#### 2. Check attribute types

In [5]:
# No attribute is listed as numerical, so every non-target column ends up
# in the categorical list.
numerical_attr_list = []
non_categorical_attrs = set(numerical_attr_list) | {target_attr}
categorical_attr_list = [
    column
    for column in train_df_sample.columns
    if column not in non_categorical_attrs
]
categorical_attr_list

['uid',
 'task_id',
 'adv_id',
 'creat_type_cd',
 'adv_prim_id',
 'dev_id',
 'inter_type_cd',
 'slot_id',
 'spread_app_id',
 'tags',
 'app_first_class',
 'app_second_class',
 'age',
 'city',
 'city_rank',
 'device_name',
 'device_size',
 'career',
 'gender',
 'net_type',
 'residence',
 'his_app_size',
 'his_on_shelf_time',
 'app_score',
 'emui_dev',
 'list_time',
 'device_price',
 'up_life_duration',
 'up_membership_grade',
 'membership_life_duration',
 'consume_purchase',
 'communication_onlinerate',
 'communication_avgonline_30d',
 'indu_name',
 'pt_d']

#### 2. Apply the data processing strategy predefined in prep.ipynb

In [6]:
# Remove duplicate observations; the helper's result replaces the sample.
train_df_sample = data_prepare_utils.drop_duplicate_obs(
    train_df_sample,
)


Drop duplicate observations:
df.shape:  (8381, 36)
drop_dup_df.shape:  (8380, 36)


In [7]:
# Attributes removed as the first step of each pipeline below.
attrs_to_drop = [
    "app_score",
    "his_on_shelf_time",
    "task_id",
    "spread_app_id",
    "tags",
    "dev_id",
    "app_second_class",
    "adv_prim_id",
    "device_price",
]

# Numerical branch: drop columns -> impute (SimpleImputer defaults) -> scale.
numerical_steps = [
    ("column_dropper", data_prepare_utils.DropColumnsTransformer(attrs_to_drop)),
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
]
numerical_transformer = Pipeline(numerical_steps)

# Categorical branch: drop columns -> impute with the most frequent value
# -> target-encode -> scale.
categorical_steps = [
    ("column_dropper", data_prepare_utils.DropColumnsTransformer(attrs_to_drop)),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "target_encoder",
        TargetEncoder(
            target_type=target_type,
            random_state=target_encoding_random_state,
        ),
    ),
    ("scaler", StandardScaler()),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each attribute list to its branch.
preprocessor = ColumnTransformer(
    [
        ("numerical", numerical_transformer, numerical_attr_list),
        ("categorical", categorical_transformer, categorical_attr_list),
    ]
)

preprocessor

#### 3. Define default models

In [8]:
# Depth-1 decision tree used as the base estimator for AdaBoost.
adaboost_base_tree = DecisionTreeClassifier(
    max_depth=1,
    criterion="log_loss",
    class_weight="balanced",
    random_state=model_random_state,
)

# Candidate models keyed by class name; estimator_name selects one of them.
# All share model_random_state as their seed.
estimator_dict = {
    "SGDClassifier": SGDClassifier(
        max_iter=sgd_max_iter,
        class_weight="balanced",
        loss="log_loss",
        random_state=model_random_state,
    ),
    "DecisionTreeClassifier": DecisionTreeClassifier(
        class_weight="balanced",
        criterion="log_loss",
        random_state=model_random_state,
    ),
    "RandomForestClassifier": RandomForestClassifier(
        class_weight="balanced_subsample",
        criterion="log_loss",
        random_state=model_random_state,
    ),
    "AdaBoostClassifier": AdaBoostClassifier(
        estimator=adaboost_base_tree,
        random_state=model_random_state,
    ),
    "GradientBoostingClassifier": GradientBoostingClassifier(
        loss="log_loss",
        random_state=model_random_state,
    ),
}

#### 4. Define composite model

In [9]:
# Chain the preprocessor with the estimator selected by estimator_name;
# the final pipeline step is named after that estimator.
selected_estimator = estimator_dict[estimator_name]
composite_estimator = Pipeline(
    [
        ("preprocessor", preprocessor),
        (estimator_name, selected_estimator),
    ]
)
composite_estimator

In [10]:
# Separate the target column from the predictors. The target is selected with
# a list of column names, so y_df keeps a single-column, two-dimensional form.
y_df = train_df_sample[[target_attr]]
cap_x_df = train_df_sample.drop(columns=target_attr)
# Removes the name train_df_sample: re-running this cell without first
# re-running the sampling cells raises NameError.
del train_df_sample

#### 5. Performance using N-fold cross validation

In [11]:
# Report cross-validation scores of the composite pipeline on the sample.
model_performance_utils.check_out_cross_val_score(
    composite_estimator,
    cap_x_df,
    y_df,
)

Cross Validation Scores: [0.56682578 0.40393795 0.42124105 0.54295943 0.42303103]
mean: 0.47159904534606206
standard deviation: 0.07686552964339113


#### 6. Check the most significant attributes

In [12]:
# Fit the pipeline on the whole sample; the target is flattened to 1-D first.
y_flat = y_df.values.ravel()
composite_estimator.fit(cap_x_df, y_flat)

# Permutation importance is computed on the same data the pipeline was just
# fitted on.
model_performance_utils.check_out_permutation_importance(
    composite_estimator, cap_x_df, y_df, permutation_importance_random_state
)


Permutation importance:

metric: sqrt_neg_mean_squared_error
    uid      0.172 +/- 0.031
    app_first_class 0.103 +/- 0.027
    his_app_size 0.101 +/- 0.047
    city_rank 0.068 +/- 0.042
    net_type 0.060 +/- 0.030
    gender   0.059 +/- 0.022
    list_time 0.056 +/- 0.032
    age      0.044 +/- 0.026
    membership_life_duration 0.025 +/- 0.010


#### 7. Analyze error types

In [13]:
# Report false-positive / false-negative counts for the fitted pipeline.
model_performance_utils.check_out_error_types(
    composite_estimator,
    cap_x_df,
    y_df,
)


Check out type I and type II errors
type I error (false positive): 3002
type II error (false negative): 1


#### 8. Feature selection and engineering

#### 9. Fit and evaluate again

#### 10. Check the most promising models