# Mounting GDrive directory

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/Colab Notebooks/UoA_MSDS/Course_8/Capstone1_Ad_Campaign_Recommender/'

Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks/UoA_MSDS/Course_8/Capstone1_Ad_Campaign_Recommender


# Import libraries

In [2]:
import os
import csv

from IPython.core.display import display, HTML

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')



from sklearn import config_context
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn import model_selection
from sklearn.model_selection import RandomizedSearchCV

# Reading data

In [3]:
# Load into dataframes, disble quote parsing inside strings when loading
# Ref: https://stackoverflow.com/a/29857126

# Scenario 1: devices with event data
segment_1_train = pd.read_csv('data/train/segment_1_train.csv')
segment_1_test = pd.read_csv('data/test/segment_1_test.csv')
# Scenario 2: devices without event data
segment_2_train = pd.read_csv('data/train/segment_2_train.csv')
segment_2_test = pd.read_csv('data/test/segment_2_test.csv')

# Subtask 5: Model building - Different models (Stacking)

## 1. Common functions for training models

In [4]:
from types import SimpleNamespace

# Generate stacking models
def gen_stacking_models():
    # Classifiers, using One-Vs-Rest strategy if target is multiclass problem
    clf1 = LogisticRegression(multi_class='ovr', random_state=0)
    clf2 = RandomForestClassifier(n_estimators=25, random_state=0) # Use small base models
    xgb = XGBClassifier(random_state=0)

    # Stacking model
    sclf = StackingClassifier(
        estimators=[
            ('LogisticRegression', clf1),
            ('RandomForest', clf2),
        ],
        final_estimator=xgb,
        stack_method='predict_proba',
        passthrough=True,
        cv=5,
        n_jobs=-1,
    )
    return SimpleNamespace(
        clf1=clf1,
        clf2=clf2,
        xgb=xgb,
        sclf=sclf,
    )

In [5]:
# Do CV to check performance of stacking model vs base models
def cross_val_check(stacking_models, X_train, y_train):
    for clf, label in zip(
        [
            stacking_models.clf1,
            stacking_models.clf2,
            stacking_models.sclf,
        ],
        [
            'LogisticRegression',
            'RandomForest',
            'StackingClassifier',
        ]
    ):
        scores = model_selection.cross_val_score(
            clf,
            X_train,
            y_train,
            cv=3, # Just do only 3 folds as we are limited in power
            scoring='roc_auc_ovr', # Scoring for both single-class and multi-class problem
        )
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [6]:
# Call `get_params()` on stacking classifier object to see the params that we can tune
gen_stacking_models().sclf.get_params()

{'cv': 5,
 'estimators': [('LogisticRegression',
   LogisticRegression(multi_class='ovr', random_state=0)),
  ('RandomForest', RandomForestClassifier(n_estimators=25, random_state=0))],
 'final_estimator__objective': 'binary:logistic',
 'final_estimator__base_score': None,
 'final_estimator__booster': None,
 'final_estimator__callbacks': None,
 'final_estimator__colsample_bylevel': None,
 'final_estimator__colsample_bynode': None,
 'final_estimator__colsample_bytree': None,
 'final_estimator__device': None,
 'final_estimator__early_stopping_rounds': None,
 'final_estimator__enable_categorical': False,
 'final_estimator__eval_metric': None,
 'final_estimator__feature_types': None,
 'final_estimator__gamma': None,
 'final_estimator__grow_policy': None,
 'final_estimator__importance_type': None,
 'final_estimator__interaction_constraints': None,
 'final_estimator__learning_rate': None,
 'final_estimator__max_bin': None,
 'final_estimator__max_cat_threshold': None,
 'final_estimator__max_c

In [7]:
# Tune stacking models
def tune_stacking_models(
    stacking_models,
    X_train,
    y_train,
    cv=3, # By default, just do only 3 folds as we are limited in power
    n_iter=10, # For quick result, just iterate 10 combinations
):
    params = {
        # Find best params for RandomForest
        'RandomForest__max_depth': range(10, 100, 10),
        'RandomForest__min_samples_leaf': range(1, 8, 3),
        'RandomForest__min_samples_split': range(2, 10, 2),
        'RandomForest__n_estimators': range(25, 150, 25),
        # Find best params for XGBoost
        'final_estimator__min_child_weight': [1, 5, 10],
        'final_estimator__gamma': [0.5, 1, 1.5, 2, 5],
        'final_estimator__subsample': [0.6, 0.8, 1.0],
        'final_estimator__colsample_bytree': [0.6, 0.8, 1.0],
        'final_estimator__max_depth': [3, 4, 5],
        'final_estimator__n_estimators': range(60, 360, 40),
        'final_estimator__learning_rate': [0.1, 0.01, 0.05]
    }

    # Randomly search 50 models as runtime of Google Colab is limited for free accounts
    # Should run grid search on powerful resource with longer runtime
    searchCV = RandomizedSearchCV(
        estimator=stacking_models.sclf,
        param_distributions=params,
        scoring='roc_auc_ovr', # Scoring for both single-class and multi-class problem
        cv=cv,
        n_iter=n_iter,
        refit=True,
        random_state=0,
        verbose=1,
    )
    searchCV.fit(X_train, y_train)

    return searchCV

In [8]:
def print_search_stats(searchCV):
    cv_keys = ('mean_test_score', 'std_test_score', 'params')

    for r, _ in enumerate(searchCV.cv_results_['mean_test_score']):
        print("%0.3f +/- %0.2f %r"
            % (searchCV.cv_results_[cv_keys[0]][r],
                searchCV.cv_results_[cv_keys[1]][r] / 2.0,
                searchCV.cv_results_[cv_keys[2]][r]))

    print('Best parameters: %s' % searchCV.best_params_)
    print('Accuracy: %.2f' % searchCV.best_score_)

## 2. Gender prediction model

### a) Scenario 1: Devices with event data

In [9]:
# Format train/test data
X_train = segment_1_train.drop(columns=['device_id', 'gender', 'age_group'])
y_train = segment_1_train['gender']

X_test = segment_1_test.drop(columns=['device_id', 'gender', 'age_group'])
y_test = segment_1_test[['device_id', 'gender']]

for df, label in zip(
    [
        X_train,
        y_train,
        X_test,
        y_test,
    ],
    [
        'X_train',
        'y_train',
        'X_test',
        'y_test',
    ],
):
    display(HTML(f'<h2>{label}</h2>'))
    display(df)

Unnamed: 0,average_daily_events,location_cluster_-1,location_cluster_0,location_cluster_1,location_cluster_2,location_cluster_3,location_cluster_4,location_cluster_5,location_cluster_6,location_cluster_7,...,app_categories_quality,app_categories_reading platform,app_categories_realistic style comic,app_categories_service,app_categories_show,app_categories_takeaway ordering,app_categories_tourism product,app_categories_travel,app_categories_video,app_categories_zombies game
0,0.380427,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.621224,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.669088,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.157734,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.123856,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2836,-0.623716,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2837,1.288079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2838,-0.705781,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2839,-0.026953,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0       1
1       1
2       1
3       1
4       0
       ..
2836    1
2837    1
2838    0
2839    1
2840    1
Name: gender, Length: 2841, dtype: int64

Unnamed: 0,average_daily_events,location_cluster_-1,location_cluster_0,location_cluster_1,location_cluster_2,location_cluster_3,location_cluster_4,location_cluster_5,location_cluster_6,location_cluster_7,...,app_categories_quality,app_categories_reading platform,app_categories_realistic style comic,app_categories_service,app_categories_show,app_categories_takeaway ordering,app_categories_tourism product,app_categories_travel,app_categories_video,app_categories_zombies game
0,2.356397,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.523348,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.073159,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.744514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.057478,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.480375,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1214,-0.893056,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1215,0.478324,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1216,-0.013296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,device_id,gender
0,-4968154927622700000,0
1,5164709194749140000,1
2,-446534884923407000,1
3,4929004728683190000,1
4,-6540623292245040000,0
...,...,...
1213,3558602119006800000,0
1214,-3049092807223440000,0
1215,4610975622206370000,1
1216,6339023951586040000,1


In [10]:
gender_models_sc1 = gen_stacking_models()
cross_val_check(gender_models_sc1, X_train, y_train)

Accuracy: 0.59 (+/- 0.01) [LogisticRegression]
Accuracy: 0.56 (+/- 0.02) [RandomForest]
Accuracy: 0.55 (+/- 0.02) [StackingClassifier]


In [11]:
# Tune stacking model
search_result_gender_sc1 = tune_stacking_models(
    gender_models_sc1,
    X_train,
    y_train,
    cv=3,
    n_iter=10, # Quick tune
)
print_search_stats(search_result_gender_sc1)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
0.603 +/- 0.01 {'final_estimator__subsample': 0.6, 'final_estimator__n_estimators': 260, 'final_estimator__min_child_weight': 1, 'final_estimator__max_depth': 4, 'final_estimator__learning_rate': 0.01, 'final_estimator__gamma': 5, 'final_estimator__colsample_bytree': 1.0, 'RandomForest__n_estimators': 75, 'RandomForest__min_samples_split': 2, 'RandomForest__min_samples_leaf': 7, 'RandomForest__max_depth': 80}
0.585 +/- 0.01 {'final_estimator__subsample': 1.0, 'final_estimator__n_estimators': 100, 'final_estimator__min_child_weight': 10, 'final_estimator__max_depth': 3, 'final_estimator__learning_rate': 0.05, 'final_estimator__gamma': 0.5, 'final_estimator__colsample_bytree': 1.0, 'RandomForest__n_estimators': 75, 'RandomForest__min_samples_split': 6, 'RandomForest__min_samples_leaf': 4, 'RandomForest__max_depth': 30}
0.600 +/- 0.01 {'final_estimator__subsample': 0.6, 'final_estimator__n_estimators': 60, 'final_estimator__min_

In [12]:
# # Fit best model on train data / predict on test data
gender_model_sc1 = search_result_gender_sc1.best_estimator_
gender_model_sc1_fit = gender_model_sc1.fit(X_train, y_train)
gender_preds_sc1 = gender_model_sc1_fit.predict_proba(X_test)

# Get IDs and predictions
preddf = y_test.copy()

for cls in search_result_gender_sc1.classes_:
    # Probabilities for classes (1,0)
    preddf['target_' + str(cls)] = [i[cls] for i in gender_preds_sc1]

# Look at predictions
preddf.head()

Unnamed: 0,device_id,gender,target_0,target_1
0,-4968154927622700000,0,0.260571,0.739429
1,5164709194749140000,1,0.173655,0.826345
2,-446534884923407000,1,0.338526,0.661474
3,4929004728683190000,1,0.275155,0.724845
4,-6540623292245040000,0,0.183865,0.816135


### b) Scenario 2: Devices without event data

In [13]:
# Format train/test data
X_train = segment_2_train.drop(columns=['device_id', 'gender', 'age_group'])
y_train = segment_2_train['gender']

X_test = segment_2_test.drop(columns=['device_id', 'gender', 'age_group'])
y_test = segment_2_test[['device_id', 'gender']]

for df, label in zip(
    [
        X_train,
        y_train,
        X_test,
        y_test,
    ],
    [
        'X_train',
        'y_train',
        'X_test',
        'y_test',
    ],
):
    display(HTML(f'<h2>{label}</h2>'))
    display(df)

Unnamed: 0,phone_brand_AUX,phone_brand_Bacardi,phone_brand_Bifer,phone_brand_CUBE,phone_brand_Changhong,phone_brand_Cong,phone_brand_Coolpad,phone_brand_Ctyon,phone_brand_Daq,phone_brand_Gionee,...,phone_brand_smallt,phone_brand_vivo,phone_brand_weimi,phone_brand_weitu,phone_brand_wpf,phone_brand_xiangmi,phone_brand_ximi,phone_brand_yougo,phone_brand_youmi,phone_brand_yuxin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0        1
1        1
2        1
3        1
4        1
        ..
49541    0
49542    1
49543    1
49544    1
49545    0
Name: gender, Length: 49546, dtype: int64

Unnamed: 0,phone_brand_AUX,phone_brand_Bacardi,phone_brand_Bifer,phone_brand_CUBE,phone_brand_Changhong,phone_brand_Cong,phone_brand_Coolpad,phone_brand_Ctyon,phone_brand_Daq,phone_brand_Gionee,...,phone_brand_smallt,phone_brand_vivo,phone_brand_weimi,phone_brand_weitu,phone_brand_wpf,phone_brand_xiangmi,phone_brand_ximi,phone_brand_yougo,phone_brand_youmi,phone_brand_yuxin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,device_id,gender
0,-191669847070955000,0
1,2403589567148540000,1
2,4604662545429270000,0
3,4019394794123470000,0
4,-1021613832219450000,1
...,...,...
21230,3557324664602540000,1
21231,-931978254629029000,0
21232,-2171067714073140000,0
21233,-7956453462733460000,1


In [14]:
gender_models_sc2 = gen_stacking_models()
cross_val_check(gender_models_sc2, X_train, y_train)

Accuracy: 0.57 (+/- 0.00) [LogisticRegression]
Accuracy: 0.56 (+/- 0.00) [RandomForest]
Accuracy: 0.55 (+/- 0.01) [StackingClassifier]


In [15]:
# Tune stacking model
search_result_gender_sc2 = tune_stacking_models(
    gender_models_sc2,
    X_train,
    y_train,
    cv=3,
    n_iter=10, # Quick tune
)
print_search_stats(search_result_gender_sc2)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
0.565 +/- 0.00 {'final_estimator__subsample': 0.6, 'final_estimator__n_estimators': 260, 'final_estimator__min_child_weight': 1, 'final_estimator__max_depth': 4, 'final_estimator__learning_rate': 0.01, 'final_estimator__gamma': 5, 'final_estimator__colsample_bytree': 1.0, 'RandomForest__n_estimators': 75, 'RandomForest__min_samples_split': 2, 'RandomForest__min_samples_leaf': 7, 'RandomForest__max_depth': 80}
0.564 +/- 0.00 {'final_estimator__subsample': 1.0, 'final_estimator__n_estimators': 100, 'final_estimator__min_child_weight': 10, 'final_estimator__max_depth': 3, 'final_estimator__learning_rate': 0.05, 'final_estimator__gamma': 0.5, 'final_estimator__colsample_bytree': 1.0, 'RandomForest__n_estimators': 75, 'RandomForest__min_samples_split': 6, 'RandomForest__min_samples_leaf': 4, 'RandomForest__max_depth': 30}
0.564 +/- 0.00 {'final_estimator__subsample': 0.6, 'final_estimator__n_estimators': 60, 'final_estimator__min_

In [16]:
# # Fit best model on train data / predict on test data
gender_model_sc2 = search_result_gender_sc2.best_estimator_
gender_model_sc2_fit = gender_model_sc2.fit(X_train, y_train)
gender_preds_sc2 = gender_model_sc2_fit.predict_proba(X_test)

# Get IDs and predictions
preddf = y_test.copy()

for cls in search_result_gender_sc2.classes_:
    # Probabilities for classes (1,0)
    preddf['target_' + str(cls)] = [i[cls] for i in gender_preds_sc2]

# Look at predictions
preddf.head()

Unnamed: 0,device_id,gender,target_0,target_1
0,-191669847070955000,0,0.360289,0.639711
1,2403589567148540000,1,0.398954,0.601046
2,4604662545429270000,0,0.459624,0.540376
3,4019394794123470000,0,0.440108,0.559892
4,-1021613832219450000,1,0.29691,0.70309


## 3. Age group prediction model

### a) Scenario 1: Devices with event data

In [17]:
# Format train/test data
X_train = segment_1_train.drop(columns=['device_id', 'gender', 'age_group'])
y_train = segment_1_train['age_group']

X_test = segment_1_test.drop(columns=['device_id', 'gender', 'age_group'])
y_test = segment_1_test[['device_id', 'age_group']]

for df, label in zip(
    [
        X_train,
        y_train,
        X_test,
        y_test,
    ],
    [
        'X_train',
        'y_train',
        'X_test',
        'y_test',
    ],
):
    display(HTML(f'<h2>{label}</h2>'))
    display(df.info())
    display(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2841 entries, 0 to 2840
Columns: 1856 entries, average_daily_events to app_categories_zombies game
dtypes: float64(1856)
memory usage: 40.2 MB


None

Unnamed: 0,average_daily_events,location_cluster_-1,location_cluster_0,location_cluster_1,location_cluster_2,location_cluster_3,location_cluster_4,location_cluster_5,location_cluster_6,location_cluster_7,...,app_categories_quality,app_categories_reading platform,app_categories_realistic style comic,app_categories_service,app_categories_show,app_categories_takeaway ordering,app_categories_tourism product,app_categories_travel,app_categories_video,app_categories_zombies game
0,0.380427,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.621224,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.669088,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.157734,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.123856,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2836,-0.623716,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2837,1.288079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2838,-0.705781,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2839,-0.026953,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<class 'pandas.core.series.Series'>
RangeIndex: 2841 entries, 0 to 2840
Series name: age_group
Non-Null Count  Dtype
--------------  -----
2841 non-null   int64
dtypes: int64(1)
memory usage: 22.3 KB


None

0       2
1       2
2       0
3       2
4       1
       ..
2836    2
2837    1
2838    0
2839    2
2840    2
Name: age_group, Length: 2841, dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Columns: 1856 entries, average_daily_events to app_categories_zombies game
dtypes: float64(1856)
memory usage: 17.2 MB


None

Unnamed: 0,average_daily_events,location_cluster_-1,location_cluster_0,location_cluster_1,location_cluster_2,location_cluster_3,location_cluster_4,location_cluster_5,location_cluster_6,location_cluster_7,...,app_categories_quality,app_categories_reading platform,app_categories_realistic style comic,app_categories_service,app_categories_show,app_categories_takeaway ordering,app_categories_tourism product,app_categories_travel,app_categories_video,app_categories_zombies game
0,2.356397,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.523348,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.073159,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.744514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.057478,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213,0.480375,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1214,-0.893056,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1215,0.478324,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1216,-0.013296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1218 entries, 0 to 1217
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   device_id  1218 non-null   int64
 1   age_group  1218 non-null   int64
dtypes: int64(2)
memory usage: 19.2 KB


None

Unnamed: 0,device_id,age_group
0,-4968154927622700000,3
1,5164709194749140000,1
2,-446534884923407000,1
3,4929004728683190000,3
4,-6540623292245040000,2
...,...,...
1213,3558602119006800000,1
1214,-3049092807223440000,2
1215,4610975622206370000,1
1216,6339023951586040000,1


In [18]:
age_models_sc1 = gen_stacking_models()
cross_val_check(age_models_sc1, X_train, y_train)

Accuracy: 0.61 (+/- 0.00) [LogisticRegression]
Accuracy: 0.58 (+/- 0.01) [RandomForest]
Accuracy: 0.58 (+/- 0.00) [StackingClassifier]


In [19]:
# Tune XGBoost meta learner
search_result_age_sc1 = tune_stacking_models(
    age_models_sc1,
    X_train,
    y_train,
    cv=3,
    n_iter=10, # Quick tune
)
print_search_stats(search_result_age_sc1)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
0.613 +/- 0.01 {'final_estimator__subsample': 0.6, 'final_estimator__n_estimators': 260, 'final_estimator__min_child_weight': 1, 'final_estimator__max_depth': 4, 'final_estimator__learning_rate': 0.01, 'final_estimator__gamma': 5, 'final_estimator__colsample_bytree': 1.0, 'RandomForest__n_estimators': 75, 'RandomForest__min_samples_split': 2, 'RandomForest__min_samples_leaf': 7, 'RandomForest__max_depth': 80}
0.602 +/- 0.01 {'final_estimator__subsample': 1.0, 'final_estimator__n_estimators': 100, 'final_estimator__min_child_weight': 10, 'final_estimator__max_depth': 3, 'final_estimator__learning_rate': 0.05, 'final_estimator__gamma': 0.5, 'final_estimator__colsample_bytree': 1.0, 'RandomForest__n_estimators': 75, 'RandomForest__min_samples_split': 6, 'RandomForest__min_samples_leaf': 4, 'RandomForest__max_depth': 30}
0.614 +/- 0.01 {'final_estimator__subsample': 0.6, 'final_estimator__n_estimators': 60, 'final_estimator__min_

In [20]:
# # Fit best model on train data / predict on test data
age_model_sc1 = search_result_age_sc1.best_estimator_
age_model_sc1_fit = age_model_sc1.fit(X_train, y_train)
age_preds_sc1 = age_model_sc1_fit.predict_proba(X_test)

# Get IDs and predictions
preddf = y_test.copy()

for cls in search_result_age_sc1.classes_:
    # Probabilities for classes (1,0)
    preddf['target_' + str(cls)] = [i[cls] for i in age_preds_sc1]

# Look at predictions
preddf.head()

Unnamed: 0,device_id,age_group,target_0,target_1,target_2,target_3
0,-4968154927622700000,3,0.161505,0.29781,0.319064,0.22162
1,5164709194749140000,1,0.254838,0.34933,0.230596,0.165237
2,-446534884923407000,1,0.256015,0.322561,0.247072,0.174353
3,4929004728683190000,3,0.164227,0.28116,0.326102,0.228511
4,-6540623292245040000,2,0.173422,0.312385,0.316651,0.197542


### b) Scenario 2: Devices without event data

In [21]:
# Format train/test data
X_train = segment_2_train.drop(columns=['device_id', 'gender', 'age_group'])
y_train = segment_2_train['age_group']

X_test = segment_2_test.drop(columns=['device_id', 'gender', 'age_group'])
y_test = segment_2_test[['device_id', 'age_group']]

for df, label in zip(
    [
        X_train,
        y_train,
        X_test,
        y_test,
    ],
    [
        'X_train',
        'y_train',
        'X_test',
        'y_test',
    ],
):
    display(HTML(f'<h2>{label}</h2>'))
    display(df)

Unnamed: 0,phone_brand_AUX,phone_brand_Bacardi,phone_brand_Bifer,phone_brand_CUBE,phone_brand_Changhong,phone_brand_Cong,phone_brand_Coolpad,phone_brand_Ctyon,phone_brand_Daq,phone_brand_Gionee,...,phone_brand_smallt,phone_brand_vivo,phone_brand_weimi,phone_brand_weitu,phone_brand_wpf,phone_brand_xiangmi,phone_brand_ximi,phone_brand_yougo,phone_brand_youmi,phone_brand_yuxin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0        1
1        0
2        1
3        1
4        1
        ..
49541    2
49542    0
49543    0
49544    1
49545    2
Name: age_group, Length: 49546, dtype: int64

Unnamed: 0,phone_brand_AUX,phone_brand_Bacardi,phone_brand_Bifer,phone_brand_CUBE,phone_brand_Changhong,phone_brand_Cong,phone_brand_Coolpad,phone_brand_Ctyon,phone_brand_Daq,phone_brand_Gionee,...,phone_brand_smallt,phone_brand_vivo,phone_brand_weimi,phone_brand_weitu,phone_brand_wpf,phone_brand_xiangmi,phone_brand_ximi,phone_brand_yougo,phone_brand_youmi,phone_brand_yuxin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,device_id,age_group
0,-191669847070955000,3
1,2403589567148540000,2
2,4604662545429270000,1
3,4019394794123470000,2
4,-1021613832219450000,2
...,...,...
21230,3557324664602540000,1
21231,-931978254629029000,1
21232,-2171067714073140000,0
21233,-7956453462733460000,0


In [22]:
age_models_sc2 = gen_stacking_models()
cross_val_check(age_models_sc2, X_train, y_train)

Accuracy: 0.56 (+/- 0.00) [LogisticRegression]
Accuracy: 0.56 (+/- 0.00) [RandomForest]
Accuracy: 0.56 (+/- 0.00) [StackingClassifier]


In [23]:
# Tune stacking model
search_result_age_sc2 = tune_stacking_models(
    age_models_sc2,
    X_train,
    y_train,
    cv=3,
    n_iter=10, # Quick tune
)
print_search_stats(search_result_age_sc2)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
0.562 +/- 0.00 {'final_estimator__subsample': 0.6, 'final_estimator__n_estimators': 260, 'final_estimator__min_child_weight': 1, 'final_estimator__max_depth': 4, 'final_estimator__learning_rate': 0.01, 'final_estimator__gamma': 5, 'final_estimator__colsample_bytree': 1.0, 'RandomForest__n_estimators': 75, 'RandomForest__min_samples_split': 2, 'RandomForest__min_samples_leaf': 7, 'RandomForest__max_depth': 80}
0.561 +/- 0.00 {'final_estimator__subsample': 1.0, 'final_estimator__n_estimators': 100, 'final_estimator__min_child_weight': 10, 'final_estimator__max_depth': 3, 'final_estimator__learning_rate': 0.05, 'final_estimator__gamma': 0.5, 'final_estimator__colsample_bytree': 1.0, 'RandomForest__n_estimators': 75, 'RandomForest__min_samples_split': 6, 'RandomForest__min_samples_leaf': 4, 'RandomForest__max_depth': 30}
0.562 +/- 0.00 {'final_estimator__subsample': 0.6, 'final_estimator__n_estimators': 60, 'final_estimator__min_

In [24]:
# # Fit best model on train data / predict on test data
age_model_sc2 = search_result_age_sc2.best_estimator_
age_model_sc2_fit = age_model_sc2.fit(X_train, y_train)
age_preds_sc2 = age_model_sc2_fit.predict_proba(X_test)

# Get IDs and predictions
preddf = y_test.copy()

for cls in search_result_age_sc2.classes_:
    # Probabilities for classes (1,0)
    preddf['target_' + str(cls)] = [i[cls] for i in age_preds_sc2]

# Look at predictions
preddf.head()

Unnamed: 0,device_id,age_group,target_0,target_1,target_2,target_3
0,-191669847070955000,3,0.227483,0.288712,0.255148,0.228657
1,2403589567148540000,2,0.223313,0.304401,0.26465,0.207636
2,4604662545429270000,1,0.282612,0.303827,0.224053,0.189508
3,4019394794123470000,2,0.276634,0.301501,0.232793,0.189072
4,-1021613832219450000,2,0.244803,0.291603,0.258171,0.205422


## 4. Save the models for evaluation later

In [25]:
import pickle

In [26]:
os.makedirs('deploy', exist_ok=True)

with open('deploy/gender_model_sc1_fit.pickle', 'wb') as f:
  pickle.dump(gender_model_sc1_fit, f)

with open('deploy/gender_model_sc2_fit.pickle', 'wb') as f:
  pickle.dump(gender_model_sc2_fit, f)

with open('deploy/age_model_sc1_fit.pickle', 'wb') as f:
  pickle.dump(age_model_sc1_fit, f)

with open('deploy/age_model_sc2_fit.pickle', 'wb') as f:
  pickle.dump(age_model_sc2_fit, f)

In [27]:
os.listdir('deploy')

['gender_model_sc1_fit.pickle',
 'gender_model_sc2_fit.pickle',
 'age_model_sc1_fit.pickle',
 'age_model_sc2_fit.pickle',
 'input_processor.pickle',
 'age_label_encoder.pickle',
 'gender_label_encoder.pickle',
 'deploy_data.csv']