## LOAD DATA


In [10]:
import tensorflow as tf
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt 

import seaborn as sns
from matplotlib.pyplot import figure

from tensorflow import keras
from tensorflow.keras import layers
import math
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [11]:
data = pd.read_csv('sub_model_15c3d_t5.csv')

data2 = pd.read_csv('sub_model_15c3d_t6.csv')

In [12]:
print(data['15c3d_5'].value_counts())
print(data2['15c3d_6'].value_counts())

1    2021050
0     763392
Name: 15c3d_5, dtype: int64
1    2024694
0     776481
Name: 15c3d_6, dtype: int64


## DATA PREPROCESSING

### DATA CLEANING

#### DROP OUTLIERS

In [13]:
for i in range(2,6):
  data = data[data['RC_MONEY_' + str(i)] >= 0]
  data2 = data2[data2['RC_MONEY_' + str(i+1)] >= 0]

#### FILLING NULL DATA

In [14]:
data['OS'].fillna("Unknow", inplace = True)
data2['OS'].fillna("Unknow", inplace = True)

#### ENCODING DATA

In [15]:
data = pd.get_dummies(data)
data2 = pd.get_dummies(data2)

data.drop('OS_Unknow', axis = 1, inplace = True)
data2.drop('OS_Unknow', axis =1, inplace = True)
data.drop('PRODUCT_CODE', axis= 1, inplace= True)
data.drop('PROVINCE', axis= 1, inplace= True)
data.drop('TD_5', axis= 1, inplace= True)
data.drop('DATA_5', axis= 1, inplace= True)
data.drop('ONNET_IN_5', axis= 1, inplace= True)
data.drop('ONNET_OUT_5', axis= 1, inplace= True)
data.drop('OFFNET_IN_5', axis= 1, inplace= True)
data.drop('OFFNET_OUT_5', axis= 1, inplace= True)
data.drop('PACK_TIME_5', axis= 1, inplace= True)
data.drop('PACK_MONEY_5', axis= 1, inplace= True)
data.drop('RC_TIME_5', axis= 1, inplace= True)
data.drop('RC_MONEY_5', axis= 1, inplace= True)
data.drop('REG_ON_5', axis= 1, inplace= True)

data2.drop('PRODUCT_CODE', axis= 1, inplace= True)
data2.drop('PROVINCE', axis= 1, inplace= True)
data2.drop('TD_6', axis= 1, inplace= True)
data2.drop('DATA_6', axis= 1, inplace= True)
data2.drop('ONNET_IN_6', axis= 1, inplace= True)
data2.drop('ONNET_OUT_6', axis= 1, inplace= True)
data2.drop('OFFNET_IN_6', axis= 1, inplace= True)
data2.drop('OFFNET_OUT_6', axis= 1, inplace= True)
data2.drop('PACK_TIME_6', axis= 1, inplace= True)
data2.drop('PACK_MONEY_6', axis= 1, inplace= True)
data2.drop('RC_TIME_6', axis= 1, inplace= True)
data2.drop('RC_MONEY_6', axis= 1, inplace= True)
data2.drop('REG_ON_6', axis= 1, inplace= True)

### DATA TRANSFORMATION

In [16]:
from sklearn.preprocessing import MinMaxScaler

data2.drop('ISDN', axis = 1, inplace = True)
data.drop('ISDN', axis = 1, inplace = True)

scaler = MinMaxScaler(feature_range = (0,1))

tmp = scaler.fit_transform(data[data.drop(['15c3d_5','15c3d_2','15c3d_3','15c3d_4','OS_FF','OS_SM'], axis=1).columns])
data[data.drop(['15c3d_5','15c3d_2','15c3d_3','15c3d_4','OS_FF','OS_SM'], axis=1).columns] = tmp
print(data.head())

tmp = scaler.transform(data2[data2.drop(['15c3d_5','15c3d_6','15c3d_3','15c3d_4','OS_FF','OS_SM'], axis=1).columns])
data2[data2.drop(['15c3d_6','15c3d_5','15c3d_3','15c3d_4','OS_FF','OS_SM'], axis=1).columns] = tmp

print(data2.head())

        AGE      TD_2      TD_3      TD_4    DATA_2    DATA_3    DATA_4  \
0  0.054994  0.000000  0.009464  0.000000  0.000276  0.004625  0.000000   
1  0.057613  0.000715  0.001012  0.000798  0.000589  0.000013  0.000026   
2  0.092780  0.000149  0.004604  0.000000  0.000000  0.000156  0.000000   
3  0.049009  0.000000  0.007098  0.007718  0.000000  0.005340  0.012421   
4  0.548822  0.000000  0.004732  0.010291  0.001541  0.003066  0.008059   

   ONNET_IN_2  ONNET_IN_3  ONNET_IN_4  ...  RC_MONEY_4  REG_ON_2  REG_ON_3  \
0    0.000637    0.004552    0.000088  ...    0.000000  0.142857  0.935484   
1    0.001596    0.000125    0.000115  ...    0.000000  0.285714  0.419355   
2    0.000025    0.000007    0.000000  ...    0.000000  0.071429  0.258065   
3    0.000000    0.003432    0.004011  ...    0.020000  0.000000  0.645161   
4    0.000185    0.000366    0.000322  ...    0.026667  1.000000  1.000000   

   REG_ON_4  15c3d_2  15c3d_3  15c3d_4  15c3d_5  OS_FF  OS_SM  
0  0.000000     

Feature names unseen at fit time:
- DATA_5
- OFFNET_IN_5
- OFFNET_OUT_5
- ONNET_IN_5
- ONNET_OUT_5
- ...
Feature names seen at fit time, yet now missing:
- DATA_2
- OFFNET_IN_2
- OFFNET_OUT_2
- ONNET_IN_2
- ONNET_OUT_2
- ...



        AGE      TD_3      TD_4      TD_5    DATA_3    DATA_4    DATA_5  \
0  0.075196  0.005456  0.000473  0.002058  0.004787  0.000702  0.003118   
1  0.329966  0.003492  0.004751  0.002573  0.002193  0.002261  0.005235   
2  0.046016  0.000000  0.004732  0.012864  0.001291  0.002975  0.007417   
3  0.046016  0.000000  0.002366  0.000000  0.000000  0.000637  0.000000   
4  0.279835  0.006538  0.003365  0.005377  0.000000  0.000000  0.000000   

   ONNET_IN_3  ONNET_IN_4  ONNET_IN_5  ...  RC_MONEY_5  REG_ON_3  REG_ON_4  \
0    0.000515    0.000017    0.000099  ...    0.006667  1.107143  0.967742   
1    0.003647    0.001347    0.005638  ...    0.006667  1.071429  0.967742   
2    0.000100    0.001519    0.000207  ...    0.033333  0.428571  0.967742   
3    0.000031    0.001167    0.000000  ...    0.000000  0.214286  0.225806   
4    0.002530    0.002235    0.001284  ...    0.013333  1.071429  0.967742   

   REG_ON_5  15c3d_3  15c3d_4  15c3d_5  15c3d_6  OS_FF  OS_SM  
0  0.933333     

## BUILDING MODELS

In [17]:
for col in data2.columns:
  if(col[-1] == '3'):
    data2 = data2.rename({col : col[:-1] + '2'}, axis=1)
  if(col[-1] == '4'):
    data2 = data2.rename({col : col[:-1] + '3'}, axis=1) 
  if(col[-1] == '5'):
    data2 = data2.rename({col : col[:-1] + '4'}, axis=1)
  if(col[-1] == '6'):
    data2 = data2.rename({col : col[:-1] + '5'}, axis=1)         

In [18]:
# divide test set and training set
X_train = data.drop("15c3d_5", axis= 1)
y_train = data['15c3d_5']

X_test = data2.drop("15c3d_5", axis=1)
y_test = data2['15c3d_5']

## DECISION TREE

In [22]:
dt=DecisionTreeClassifier(random_state=42, max_depth = 10)
dt.fit(X_train,y_train)
score = dt.score(X_train, y_train)
score2 = dt.score(X_test, y_test)

print("Training set accuracy: ", '%.5f'%(score))
print("Test set accuracy: ", '%.5f'%(score2))

y_pred = dt.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))


Training set accuracy:  0.83971
Test set accuracy:  0.82609
[[ 428251  348138]
 [ 138971 1885559]]
              precision    recall  f1-score   support

           0       0.75      0.55      0.64    776389
           1       0.84      0.93      0.89   2024530

    accuracy                           0.83   2800919
   macro avg       0.80      0.74      0.76   2800919
weighted avg       0.82      0.83      0.82   2800919



### XGBOOSTING

In [11]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
score = xgb_model.score(X_train, y_train)
score2 = xgb_model.score(X_test, y_test)

print("Training set accuracy: ", '%.5f'%(score))
print("Test set accuracy: ", '%.5f'%(score2))

y_pred = xgb_model.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Training set accuracy:  0.84635
Test set accuracy:  0.83541


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[[ 459949  316440]
 [ 144554 1879976]]
              precision    recall  f1-score   support

           0       0.76      0.59      0.67    776389
           1       0.86      0.93      0.89   2024530

    accuracy                           0.84   2800919
   macro avg       0.81      0.76      0.78   2800919
weighted avg       0.83      0.84      0.83   2800919



### GRADIENT BOOSTING

In [12]:
gb = GradientBoostingClassifier(n_estimators=125,max_depth=5)
gb.fit(X_train, y_train) 
score = gb.score(X_train, y_train)
score2 = gb.score(X_test, y_test)

print("Training set accuracy: ", '%.5f'%(score))
print("Test set accuracy: ", '%.5f'%(score2))

y_pred = gb.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Training set accuracy:  0.84319
Test set accuracy:  0.83484
[[ 458159  318230]
 [ 144361 1880169]]
              precision    recall  f1-score   support

           0       0.76      0.59      0.66    776389
           1       0.86      0.93      0.89   2024530

    accuracy                           0.83   2800919
   macro avg       0.81      0.76      0.78   2800919
weighted avg       0.83      0.83      0.83   2800919



### RANDOM FOREST

In [11]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10,
                              random_state=42)
rf.fit(X_train, y_train) 
score = rf.score(X_train, y_train)
score2 = rf.score(X_test, y_test)

print("Training set accuracy: ", '%.5f'%(score))
print("Test set accuracy: ", '%.5f'%(score2))

y_pred = rf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

Training set accuracy:  0.83786
Test set accuracy:  0.82932
[[ 421138  355251]
 [ 122798 1901732]]
              precision    recall  f1-score   support

           0       0.77      0.54      0.64    776389
           1       0.84      0.94      0.89   2024530

    accuracy                           0.83   2800919
   macro avg       0.81      0.74      0.76   2800919
weighted avg       0.82      0.83      0.82   2800919



In [12]:
fi = pd.DataFrame({'feature': list(X_train.columns),
                   'importance': rf.feature_importances_}).\
                    sort_values('importance', ascending = False)

In [13]:
fi.head(40)

Unnamed: 0,feature,importance
33,REG_ON_4,0.170098
30,RC_MONEY_4,0.158706
27,RC_TIME_4,0.150047
18,OFFNET_OUT_4,0.082291
3,TD_4,0.077743
36,15c3d_4,0.051858
24,PACK_MONEY_4,0.046524
6,DATA_4,0.041377
9,ONNET_IN_4,0.037697
21,PACK_TIME_4,0.028699


### DEEP NEURAL DECISION FOREST

In [34]:
data_train = pd.concat([X_train, y_train], axis=1)
data_train['15c3d_5'] = data_train['15c3d_5'].astype(str)
data_train['15c3d_5'].replace(["0", "1"], ["y", "n"], inplace= True)

data_test = pd.concat([X_test, y_test], axis=1)
data_test['15c3d_5'] = data_test['15c3d_5'].astype(str)
data_test['15c3d_5'].replace(["0", "1"], ["y", "n"], inplace= True)

data_train.to_csv("train.csv", index=False, header=False)
data_test.to_csv("test.csv", index=False, header=False)


In [35]:
# A list of the numerical feature names.
NUMERIC_FEATURE_NAMES = data_train.drop("15c3d_5", axis=1).columns
# A dictionary of the categorical features and their vocabulary.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    
}
# A list of the columns to ignore from the dataset.
#IGNORE_COLUMN_NAMES = ["fnlwgt"]
# A list of the categorical feature names.
CATEGORICAL_FEATURE_NAMES = []
# A list of all the input features.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES
# A list of column default values for each feature.
COLUMN_DEFAULTS = [
    [0.0] if feature_name in NUMERIC_FEATURE_NAMES else ["NA"]
    for feature_name in data_train.columns
]
# The name of the target feature.
TARGET_FEATURE_NAME = "15c3d_5"
# A list of the labels of the target features.
TARGET_LABELS = ["y", "n"]

In [36]:
from tensorflow.keras.layers import StringLookup

target_label_lookup = StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=data_train.columns,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value=" ",
        shuffle=shuffle,
    ).map(lambda features, target: (features, target_label_lookup(target)))
    return dataset.cache()

  return bool(asarray(a1 == a2).all())


In [37]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs

In [38]:
def encode_inputs(inputs):
    encoded_features = []
    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # Create a lookup to convert a string values to an integer indices.
            # Since we are not using a mask token, nor expecting any out of vocabulary
            # (oov) token, we set mask_token to None and num_oov_indices to 0.
            lookup = StringLookup(
                vocabulary=vocabulary, mask_token=None, num_oov_indices=0
            )
            # Convert the string input values into integer indices.
            value_index = lookup(inputs[feature_name])
            embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
            # Create an embedding layer with the specified dimensions.
            embedding = layers.Embedding(
                input_dim=lookup.vocabulary_size(), output_dim=embedding_dims
            )
            # Convert the index values to embedding representations.
            encoded_feature = embedding(value_index)
        else:
            # Use the numerical features as-is.
            encoded_feature = inputs[feature_name]
            if inputs[feature_name].shape[-1] is None:
                encoded_feature = tf.expand_dims(encoded_feature, -1)

        encoded_features.append(encoded_feature)

    encoded_features = layers.concatenate(encoded_features)
    return encoded_features

In [39]:
class NeuralDecisionTree(keras.Model):
    def __init__(self, depth, num_features, used_features_rate, num_classes):
        super(NeuralDecisionTree, self).__init__()
        self.depth = depth
        self.num_leaves = 2 ** depth
        self.num_classes = num_classes

        # Create a mask for the randomly selected features.
        num_used_features = int(num_features * used_features_rate)
        one_hot = np.eye(num_features)
        sampled_feature_indicies = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        self.used_features_mask = one_hot[sampled_feature_indicies]

        # Initialize the weights of the classes in leaves.
        self.pi = tf.Variable(
            initial_value=tf.random_normal_initializer()(
                shape=[self.num_leaves, self.num_classes]
            ),
            dtype="float32",
            trainable=True,
        )

        # Initialize the stochastic routing layer.
        self.decision_fn = layers.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        batch_size = tf.shape(features)[0]

        # Apply the feature mask to the input features.
        features = tf.matmul(
            features, self.used_features_mask, transpose_b=True
        )  # [batch_size, num_used_features]
        # Compute the routing probabilities.
        decisions = tf.expand_dims(
            self.decision_fn(features), axis=2
        )  # [batch_size, num_leaves, 1]
        # Concatenate the routing probabilities with their complements.
        decisions = layers.concatenate(
            [decisions, 1 - decisions], axis=2
        )  # [batch_size, num_leaves, 2]

        mu = tf.ones([batch_size, 1, 1])

        begin_idx = 1
        end_idx = 2
        # Traverse the tree in breadth-first order.
        for level in range(self.depth):
            mu = tf.reshape(mu, [batch_size, -1, 1])  # [batch_size, 2 ** level, 1]
            mu = tf.tile(mu, (1, 1, 2))  # [batch_size, 2 ** level, 2]
            level_decisions = decisions[
                :, begin_idx:end_idx, :
            ]  # [batch_size, 2 ** level, 2]
            mu = mu * level_decisions  # [batch_size, 2**level, 2]
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (level + 1)

        mu = tf.reshape(mu, [batch_size, self.num_leaves])  # [batch_size, num_leaves]
        probabilities = keras.activations.softmax(self.pi)  # [num_leaves, num_classes]
        outputs = tf.matmul(mu, probabilities)  # [batch_size, num_classes]
        return outputs

In [40]:
class NeuralDecisionForest(keras.Model):
    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        super(NeuralDecisionForest, self).__init__()
        self.ensemble = []
        # Initialize the ensemble by adding NeuralDecisionTree instances.
        # Each tree will have its own randomly selected input features to use.
        for _ in range(num_trees):
            self.ensemble.append(
                NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            )

    def call(self, inputs):
        # Initialize the outputs: a [batch_size, num_classes] matrix of zeros.
        batch_size = tf.shape(inputs)[0]
        outputs = tf.zeros([batch_size, num_classes])

        # Aggregate the outputs of trees in the ensemble.
        for tree in self.ensemble:
            outputs += tree(inputs)
        # Divide the outputs by the ensemble size to get the average.
        outputs /= len(self.ensemble)
        return outputs

In [41]:
learning_rate = 0.01
batch_size = 265
num_epochs = 11
hidden_units = [64, 64]


def run_experiment(model):

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    print("Start training the model...")
    train_dataset = get_dataset_from_csv(
        "train.csv", shuffle=True, batch_size=batch_size
    )

    model.fit(train_dataset, epochs=num_epochs)
    print("Model training finished")

    print("Evaluating the model on the test data...")
    test_dataset = get_dataset_from_csv("test.csv", batch_size=batch_size)

    _, accuracy = model.evaluate(test_dataset)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

In [42]:
depth = 10
used_features_rate = 1.0
num_classes = len(TARGET_LABELS)


def create_tree_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)

    outputs = tree(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


tree_model = create_tree_model()
run_experiment(tree_model)

Start training the model...
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
Model training finished
Evaluating the model on the test data...
Test accuracy: 82.85%


In [43]:
test_dataset = get_dataset_from_csv("test.csv", batch_size=batch_size)
aa = tree_model.predict(test_dataset)
y_pred = []
i = 0
for a in aa:
  if a[0] > a[1]:
    y_pred.append(0)
  else:
    y_pred.append(1)
  i = i + 1
y_pred = np.array(y_pred)
print(y_pred)
y_test = data_test['15c3d_5']
y_test.replace(["y", "n"], ["0", "1"], inplace= True)
y_test = y_test.astype(np.int64)
print(classification_report(y_test,y_pred))

[1 1 1 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.77      0.54      0.64    776389
           1       0.84      0.94      0.89   2024530

    accuracy                           0.83   2800919
   macro avg       0.81      0.74      0.76   2800919
weighted avg       0.82      0.83      0.82   2800919



In [44]:
num_trees = 25
depth = 5
used_features_rate = 0.5


def create_forest_model():
    inputs = create_model_inputs()
    features = encode_inputs(inputs)
    features = layers.BatchNormalization()(features)
    num_features = features.shape[1]

    forest_model = NeuralDecisionForest(
        num_trees, depth, num_features, used_features_rate, num_classes
    )

    outputs = forest_model(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


forest_model = create_forest_model()

run_experiment(forest_model)

Start training the model...
Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
Model training finished
Evaluating the model on the test data...
Test accuracy: 82.36%


In [45]:
aa = forest_model.predict(test_dataset)
y_pred = []
i = 0
for a in aa:
  if a[0] > a[1]:
    y_pred.append(0)
  else:
    y_pred.append(1)
  i = i + 1
y_pred = np.array(y_pred)
print(y_pred)
y_test = data_test['15c3d_5']
y_test.replace(["y", "n"], ["0", "1"], inplace= True)
y_test = y_test.astype(np.int64)
print(classification_report(y_test,y_pred))

[1 1 1 ... 0 0 0]
              precision    recall  f1-score   support

           0       0.76      0.53      0.62    776389
           1       0.84      0.94      0.88   2024530

    accuracy                           0.82   2800919
   macro avg       0.80      0.73      0.75   2800919
weighted avg       0.82      0.82      0.81   2800919

