In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [11]:
# Read the usage dataset from the CSV that sits next to the notebook;
# the bare expression on the last line renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer='./users_behavior.csv')
df

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.90,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0
...,...,...,...,...,...
3209,122.0,910.98,20.0,35124.90,1
3210,25.0,190.36,0.0,3275.61,0
3211,97.0,634.44,70.0,13974.06,0
3212,64.0,462.32,90.0,31239.78,0


In [12]:
# Summarise the loaded frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
calls       3214 non-null float64
minutes     3214 non-null float64
messages    3214 non-null float64
mb_used     3214 non-null float64
is_ultra    3214 non-null int64
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [13]:
# Normalise dtypes in a single pass:
#   * minutes / mb_used are rounded to the nearest whole number first,
#   * every usage column is then stored as an integer,
#   * the is_ultra flag becomes a boolean.
df = (
    df
    .round({'minutes': 0, 'mb_used': 0})
    .astype({
        'calls': int,
        'minutes': int,
        'messages': int,
        'mb_used': int,
        'is_ultra': bool,
    })
)


Divide our sample into 3 datasets — training, validation, and test — in the proportions 60/20/20.

In [14]:
# Step 1: hold out 40% of the rows; the remaining 60% is the training set.
df_train, df_valid_test = train_test_split(
    df,
    random_state=12345,
    test_size=0.4,
)
# Step 2: cut the held-out part in half -> validation and test sets.
df_valid, df_test = train_test_split(
    df_valid_test,
    random_state=12345,
    test_size=0.5,
)


In [15]:
# Report (rows, columns) for the training, validation and test sets, in that order.
for subset in (df_train, df_valid, df_test):
    print(subset.shape)


(1928, 5)
(643, 5)
(643, 5)


Split the columns into the features, which we will use to train the model, and the target attribute — what we want to predict. We do this for both the training set and the validation set. We will train our models on the training set and compare different models on the validation set: DecisionTreeClassifier, RandomForestClassifier, and LogisticRegression.

In [16]:
# Everything except the is_ultra flag is a feature; is_ultra itself is the target.
features_train, target_train = df_train.drop(columns=['is_ultra']), df_train['is_ultra']

# Same separation for the validation set.
features_valid, target_valid = df_valid.drop(columns=['is_ultra']), df_valid['is_ultra']


Let's build our first model using logistic regression. We pass the random_state parameter, which seeds the pseudo-random number generator so that the model behaves the same way on every run, which in turn guarantees that the accuracy is reproducible. We will train on the training data, predict on the validation data, and compare the predictions with the known answers.

In [17]:
# Baseline: fit a logistic regression on the training set (fit() returns the estimator itself).
model = LogisticRegression(random_state=42).fit(features_train, target_train)

# Score it on the validation set.
predicted_valid = model.predict(features_valid)
print("Accuracy", accuracy_score(y_true=target_valid, y_pred=predicted_valid))


Accuracy 0.7573872472783826




The accuracy is not bad and we could be satisfied with it, but we will try a few more models for prediction. RandomForestClassifier takes two additional parameters as input, n_estimators and max_depth. max_depth is the maximum depth of each decision tree: the algorithm builds trees of if/else branches that test feature values and end in a prediction of the attribute we are looking for, and a larger depth allows more such branches. n_estimators is the number of trees in the forest.

In [18]:
# Best validation accuracy seen so far across the random forest grid.
forestResult = 0

# Walk the (n_estimators, max_depth) grid in the same order as a nested loop:
# n_estimators = 10, 15, ..., 95 on the outside, max_depth = 1..9 on the inside.
for estim, depth in ((e, d) for e in range(10, 100, 5) for d in range(1, 10)):
    model = RandomForestClassifier(
        n_estimators=estim,
        max_depth=depth,
        random_state=42,
    ).fit(features_train, target_train)

    predicted_valid = model.predict(features_valid)
    accuracy = accuracy_score(target_valid, predicted_valid)

    # Only strict improvements are recorded and reported.
    if accuracy <= forestResult:
        continue
    forestResult = accuracy
    print('Accuracy:', forestResult, 'Depth:', depth, 'Estim:', estim)



Accuracy: 0.7838258164852255 Depth: 1 Estim: 10
Accuracy: 0.7853810264385692 Depth: 2 Estim: 10
Accuracy: 0.7916018662519441 Depth: 3 Estim: 10
Accuracy: 0.7993779160186625 Depth: 6 Estim: 10
Accuracy: 0.80248833592535 Depth: 7 Estim: 20
Accuracy: 0.8040435458786936 Depth: 7 Estim: 25
Accuracy: 0.8087091757387247 Depth: 9 Estim: 25


We see that with a tree depth of 9 and 25 trees, our model gave the highest accuracy. Let's remember it and try the last classification model.

In [19]:
# Best validation accuracy seen so far for a single decision tree.
treeResult = 0

# Try every max_depth from 1 to 49 and report each strict improvement.
for depth in range(1, 50):
    model = DecisionTreeClassifier(max_depth=depth, random_state=12345).fit(
        features_train, target_train
    )

    predicted_valid = model.predict(features_valid)
    accuracy = accuracy_score(target_valid, predicted_valid)

    if accuracy <= treeResult:
        continue
    treeResult = accuracy
    print('Accuracy:', treeResult, 'Depth:', depth)

Accuracy: 0.7542768273716952 Depth: 1
Accuracy: 0.7822706065318819 Depth: 2
Accuracy: 0.7853810264385692 Depth: 3


Let's look at the table below with the accuracy of the trained models and select the model with maximum accuracy.

In [20]:
# Summary of the best validation result per model, one row per classifier.
# NaN marks a hyperparameter that does not apply to that model.
table_models = pd.DataFrame(
    {
        'Accuracy': [0.7573872472783826, 0.7853810264385692, 0.8087091757387247],
        'Depth': [np.nan, 3, 9],
        'Estim': [np.nan, np.nan, 25],
    },
    index=['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier'],
)

table_models


Unnamed: 0,Accuracy,Depth,Estim
LogisticRegression,0.757387,,
DecisionTreeClassifier,0.785381,3.0,
RandomForestClassifier,0.808709,9.0,25.0


Let's combine the training data and the validation data so that our model can train on a larger amount of data — data is gold in our century!

In [21]:
# Stack the training rows on top of the validation rows (original row labels are kept),
# then display the combined frame.
df_train_valid = pd.concat(objs=[df_train, df_valid], axis=0)
df_train_valid

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
3027,60,432,26,14751,False
434,33,265,59,17398,False
1226,52,342,68,15462,False
1054,42,226,21,13243,False
1842,30,198,0,8190,False
...,...,...,...,...,...
1999,56,398,4,23683,False
1023,76,601,0,17104,False
748,81,526,15,18879,False
1667,10,63,0,2568,True


Let's test our model.

In [22]:
# Final evaluation: refit the selected random forest on train + validation data
# and score it once on the held-out test set.
features = df_train_valid.drop(['is_ultra'], axis = 1)
target = df_train_valid['is_ultra']

features_test = df_test.drop(['is_ultra'], axis = 1)
target_test = df_test['is_ultra']

# n_estimators=25 / max_depth=9 are the best values reported by the grid search.
# random_state is fixed to the same seed (42) the grid search used: without it the
# forest is rebuilt with different randomness on every run, so the printed test
# accuracy would not be reproducible.
model = RandomForestClassifier(n_estimators=25, max_depth=9, random_state=42)

model.fit(features, target)

predicted_test = model.predict(features_test)

print("Accuracy", accuracy_score(target_test, predicted_test))


Accuracy 0.8055987558320373


Excellent accuracy on the test data; we can release our model to production. Also, the accuracy on the test data is only slightly lower than on the validation data, which suggests that our model has not overfitted.