In [48]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import roc_curve, auc
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

# Pré-processamento dos dados

In [9]:
train = pd.read_table('./orange_small_train.data/orange_small_train.data').replace('\\','/')

In [10]:
train.head(5)

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,,,,,,1526.0,7.0,,,,...,oslk,fXVEsaq,jySVZNlOJy,,,xb3V,RAYp,F2FyR07IdsN7I,,
1,,,,,,525.0,0.0,,,,...,oslk,2Kb5FSF,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,
2,,,,,,5236.0,7.0,,,,...,Al6ZaUT,NKv4yOc,jySVZNlOJy,,kG3k,Qu4f,02N6s8f,ib5G6X1eUxUn6,am7c,
3,,,,,,,0.0,,,,...,oslk,CE7uk3u,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,
4,,,,,,1029.0,7.0,,,,...,oslk,1J2cvxe,LM8l689qOp,,kG3k,FSa2,RAYp,F2FyR07IdsN7I,mj86,


In [11]:
train.shape

(50000, 230)

## Drop columns if all values is NaN

In [12]:
for column in train:
    if (train[column].isna().sum()/float(train.shape[0])) >= 0.6:
        train = train.drop(str(column),axis=1)
train.shape

(50000, 74)

## Replace values in columns with values == NaN

In [13]:
for column in train:
    if train[column].dtype == 'int':
        mean_of_column = train[column].mean()
        train[column] = train[column].replace(np.nan, np.int64(mean_of_column))
    elif train[column].dtype == 'float':
        mean_of_column = train[column].mean()
        train[column] = train[column].replace(np.nan, np.float64(mean_of_column))
    elif train[column].dtype == 'object':
        mod = train[column].mode()
        train[column] = train[column].replace(np.nan, str(mod))

## CATEGORIES TO INT

### Transforming objectives var to categories

In [14]:
train = pd.concat([
        train.select_dtypes([], ['object']),
        train.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
        ], axis=1).reindex(train.columns, axis=1)

In [15]:
cat_columns = train.select_dtypes(['category']).columns
train[cat_columns] = train[cat_columns].apply(lambda x: x.cat.codes)

### Selection of Best Categories

# Correlation between variables

In [16]:
correlation_between_var = train.corr()
correlation_between_var.shape

(74, 74)

In [17]:
variables = correlation_between_var.shape[0]
for column in correlation_between_var:
    if correlation_between_var[column].sum()/float(variables) > 0.8:
        train = train.drop(str(column),axis=1)
train.shape

(50000, 74)

In [18]:
churn = pd.read_csv('./orange_small_train.data/orange_small_train_churn.labels', header=None)

In [19]:
churn[0].value_counts()

-1    46328
 1     3672
Name: 0, dtype: int64

## Insert churn column

In [20]:
train['churn'] = churn

# TRAIN TEST

In [40]:
#X_train, X_test = train_test_split(train.select_dtypes(include=['float64','int64']), test_size=0.3, random_state=int(time.time()))
X_train, X_test = train_test_split(train, test_size=0.3, random_state=int(time.time()))


In [27]:
used_features =[]
for column in train:
    #if str(column) != 'churn' and train[column].dtype != 'object':
    used_features.append(str(column))

## Train the dataset with Gaussian Naive Bayse

In [28]:
gnb = GaussianNB()
gnb.fit(
    X_train[used_features].values,
    X_train['churn']
)
y_pred = gnb.predict(X_test[used_features])

## Evaluating

In [29]:
fpr, tpr, thresholds = roc_curve(X_test['churn'], y_pred, pos_label=1)
auc(fpr, tpr)

0.5526293679649732

# Another alternative -SVC

In [45]:
clf = svm.SVC(gamma=0.001)
clf.fit(X_train[used_features].values, X_train['churn'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [46]:
y_SVC = clf.predict(X_test[used_features])
fpr, tpr, thresholds = roc_curve(X_test['churn'], y_pred, pos_label=1)
auc(fpr, tpr)

0.5155045966353229

# Another alternative - RandomForest

In [56]:
crf = RandomForestClassifier(n_estimators=100, max_depth=10,random_state=10)
crf.fit(X_train[used_features], X_train['churn'])
y_RF = crf.predict(X_test[used_features])
fpr, tpr, thresholds = roc_curve(X_test['churn'], y_pred, pos_label=1)
auc(fpr, tpr)

0.5155045966353229

# Cross Validation

In [64]:
gnb = GaussianNB()
scores = cross_validate(gnb, X_train[used_features], X_train['churn'], cv=10,scoring=('roc_auc'),return_train_score=False)

In [65]:
print(scores)

{'fit_time': array([0.0578177 , 0.0455451 , 0.04529357, 0.04649401, 0.04476094,
       0.04555631, 0.04597664, 0.04508162, 0.04546666, 0.04549265]), 'score_time': array([0.006037  , 0.00654149, 0.00707102, 0.00616693, 0.00601053,
       0.0068059 , 0.00605869, 0.00608826, 0.00676227, 0.00604129]), 'test_score': array([0.575423  , 0.57597815, 0.60113364, 0.59357544, 0.55619661,
       0.56154555, 0.57612349, 0.56814892, 0.55168277, 0.56675987])}
