# Import necessary libraries

In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
#from xgboost import XGBClassifier

In [None]:
selected_columns = ['main_properties_imei','main_transactionId','session_start_jst','unusual_events_behaviour','is_device_turnoff','event_usage_perc','unique_package_count','min_session_gap_counter','is_used_3hours_before','label']
tok_nontok_df = pd.read_csv(actual_path)
tok_nontok_df = tok_nontok_df[selected_columns]
tok_nontok_df.head()


In [6]:
X= tok_nontok_df.iloc[:,3:9].values
y= tok_nontok_df.iloc[:,-1].values

In [7]:
print(tok_nontok_df.shape)

(808, 10)


In [8]:
tok_nontok_df['label'].value_counts()
#NOTE : Imbalanced Data
# 0 - NonTOK , 1- TOK

0    663
1    145
Name: label, dtype: int64

In [107]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
len(X_train),len(X_test)

(646, 162)

In [108]:
n_pos = np.sum(y_test)
n_neg =len(y_test)- n_pos
n_pos,n_neg

(36, 126)

In [109]:
n_pos_ = np.sum(y_train)
n_neg_ =len(y_train)- n_pos
n_pos_,n_neg_

(109, 610)

# Feature Scaling

In [110]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [118]:
model_names= ['KNN','DecisionTree','RandomForest','SVM','KERNAL_SVM','NaiveBayes']
model_configs = [KNeighborsClassifier(n_neighbors=5,metric="minkowski",p=2),
                 DecisionTreeClassifier(criterion="entropy",random_state=0),
                 RandomForestClassifier(n_estimators=10,criterion='entropy', random_state=0),
                 SVC(kernel="linear",random_state=0,max_iter=1000),
                 SVC(kernel="rbf",random_state=0,max_iter=1000),
                 GaussianNB()]

for model_name,model in zip(model_names,model_configs):
    print('\n-------------{}--------------'.format(model_name))
    classifier= model
    
    #Fitting data to model
    classifier.fit(X_train,y_train)
    
    #Predicting the test results
    y_pred = classifier.predict(X_test)
    
    #confusion_matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    # cross validation
    f1_scores = cross_val_score(model, X, y, cv=5, scoring='f1')
    print('\nF1-score:', f1_scores)
    
    rec_scores = cross_val_score(model, X, y, cv=5, scoring='recall')
    print('\nRecall:', rec_scores)
    
    pre_scores = cross_val_score(model, X, y, cv=5, scoring='precision')
    print('\nPrecision:', pre_scores)
    
    acc_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print('\nAccuaracy:', acc_scores)


-------------KNN--------------
[[124   2]
 [ 33   3]]

F1-score: [0.         0.05128205 0.15789474 0.11428571 0.05263158]

Recall: [0.         0.03448276 0.10344828 0.06896552 0.03448276]

Precision: [0.         0.1        0.33333333 0.33333333 0.11111111]

Accuaracy: [0.80246914 0.77160494 0.80246914 0.80745342 0.77639752]

-------------DecisionTree--------------
[[101  25]
 [ 28   8]]

F1-score: [0.09836066 0.26865672 0.15625    0.16666667 0.38596491]

Recall: [0.10344828 0.31034483 0.17241379 0.17241379 0.37931034]

Precision: [0.09375    0.23684211 0.14285714 0.16129032 0.39285714]

Accuaracy: [0.66049383 0.69753086 0.66666667 0.68944099 0.7826087 ]

-------------RandomForest--------------
[[122   4]
 [ 33   3]]

F1-score: [0.         0.0952381  0.05555556 0.05882353 0.09756098]

Recall: [0.         0.06896552 0.03448276 0.03448276 0.06896552]

Precision: [0.         0.15384615 0.14285714 0.2        0.16666667]

Accuaracy: [0.78395062 0.7654321  0.79012346 0.80124224 0.77018634]



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# DecisionTree

In [62]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion="entropy",random_state=0)
classifier.fit(X_train,y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [91]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[56,  6],
       [18,  1]])

In [53]:
X_test[:,0:]

array([[-0.04453155, -0.64232968, -0.60196544, -0.63707035,  0.8084561 ,
        -0.77937607],
       [-0.30372256, -0.64232968, -0.33016561,  0.1070742 , -0.03189449,
         1.28307763],
       [ 0.69873643, -0.64232968, -0.53133483, -0.58827399, -0.59212823,
        -0.77937607],
       ...,
       [-0.09597859, -0.64232968,  2.37926684, -0.74686217, -1.43247882,
        -0.77937607],
       [-0.55994373, -0.64232968, -0.47324334,  0.13147238, -1.15236196,
         1.28307763],
       [-0.13836124, -0.64232968, -0.65134081,  0.00948147, -1.43247882,
        -0.77937607]])

# Visualising the Test set results


In [56]:
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0:].min() - 1, stop = X_set[:, 0:].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1:].min() - 1, stop = X_set[:, 1:].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Decision Tree Classifier (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()


ValueError: Number of features of the model must match the input. Model n_features is 6 and input n_features is 2 