In [1]:
import pandas as pd
data = pd.read_csv('training_data.csv')
data.describe()
test_well = data[data['Well Name'] == 'SHANKLE']
data = data[data['Well Name'] != 'SHANKLE']
#test_well = data[data['Well Name'] == 'SHRIMPLIN']
#data = data[data['Well Name'] != 'SHRIMPLIN']

In [2]:
features = ['GR', 'ILD_log10', 'DeltaPHI','PHIND','PE','NM_M', 'RELPOS']
feature_vectors = data[features]
facies_labels = data['Facies']
facies_labels.describe()

count    2783.000000
mean        4.558390
std         2.515249
min         1.000000
25%         2.000000
50%         4.000000
75%         7.000000
max         9.000000
Name: Facies, dtype: float64

In [3]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(feature_vectors)
scaled_features = scaler.transform(feature_vectors) #ndarray now. 

In [4]:
from sklearn.cross_validation import train_test_split
X_train, X_cv, y_train, y_cv = train_test_split(scaled_features, facies_labels,test_size=0.05, random_state=42)
X_train



array([[-0.87705417,  0.33637903, -0.18928865, ...,  1.44783881,
         0.95329909, -1.63528358],
       [-0.19279901,  0.32811041, -0.25469982, ...,  0.61894043,
         0.95329909,  1.12000618],
       [ 0.64782298, -1.20352041,  0.12785645, ..., -0.88896106,
        -1.04898873, -1.26304115],
       ...,
       [ 0.2221964 ,  0.04857211,  0.98018391, ..., -0.74689616,
        -1.04898873,  0.94954002],
       [ 0.53236719, -0.39031519, -0.20911021, ..., -0.96726455,
        -1.04898873,  0.72341145],
       [ 0.34268302, -0.51115421,  1.13875646, ..., -0.78940377,
        -1.04898873, -1.33261918]])

In [5]:
from sklearn.metrics import classification_report,roc_auc_score
target_names = ['SS', 'CSiS', 'FSiS', 'SiSh','MS', 'WS', 'D','PS', 'BS']
y_test = test_well['Facies']
well_features = test_well.drop(['Facies','Formation','Well Name','Depth'],axis=1)
X_test = scaler.transform(well_features)
y_test.describe()

count    449.000000
mean       3.576837
std        2.260688
min        1.000000
25%        2.000000
50%        3.000000
75%        6.000000
max        8.000000
Name: Facies, dtype: float64

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=50,min_samples_leaf=10),n_estimators=500,learning_rate=0.5)
y_score = clf.fit(X_train, y_train).decision_function(X_test) 
#clf.fit(X_train, y_train)

numpy.ndarray

In [7]:
y_cv_pred = clf.predict(X_cv) 
print(classification_report(y_cv, y_cv_pred,target_names=target_names))

             precision    recall  f1-score   support

         SS       1.00      0.50      0.67        16
       CSiS       0.69      0.86      0.77        29
       FSiS       0.69      0.79      0.73        14
       SiSh       0.67      0.57      0.62         7
         MS       1.00      0.28      0.43        18
         WS       0.59      0.83      0.69        23
          D       1.00      0.67      0.80         3
         PS       0.75      0.91      0.82        23
         BS       1.00      1.00      1.00         7

avg / total       0.78      0.73      0.71       140



In [24]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred,target_names=target_names))
#y_score.shape
#roc_auc_score(y_test[:,1],y_score[:,1])

             precision    recall  f1-score   support

         SS       0.71      0.11      0.19        89
       CSiS       0.36      0.85      0.51        89
       FSiS       0.87      0.52      0.65       117
       SiSh       0.07      0.14      0.09         7
         MS       0.00      0.00      0.00        19
         WS       0.75      0.75      0.75        71
          D       0.90      0.53      0.67        17
         PS       0.49      0.72      0.59        40

avg / total       0.64      0.53      0.51       449



  .format(len(labels), len(target_names))


ValueError: Expected array-like (array or non-string sequence), got <bound method Series.ravel of 471    2
472    2
473    2
474    2
475    2
476    2
477    2
478    2
479    2
480    2
481    2
482    2
483    2
484    2
485    2
486    2
487    2
488    2
489    2
490    2
491    2
492    1
493    1
494    1
495    1
496    1
497    1
498    1
499    1
500    1
      ..
890    6
891    6
892    6
893    6
894    6
895    5
896    5
897    5
898    5
899    5
900    5
901    5
902    5
903    5
904    5
905    5
906    5
907    5
908    5
909    4
910    4
911    4
912    4
913    8
914    8
915    8
916    8
917    8
918    4
919    4
Name: Facies, Length: 449, dtype: int64>

In [9]:
n_classes = y_test.unique().shape[0]+1
print(n_classes)


9


In [10]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

ValueError: Can only tuple-index with a MultiIndex

In [None]:
import matplotlib.pyplot as plt
plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()