In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [10]:
import pandas as pd
from sklearn.externals import joblib

In [11]:
texts = []
with open("./gold/gold.txt") as f:
    texts = f.readlines()

In [12]:
df = pd.read_csv("./dataset2.csv")
df.head()

Unnamed: 0,_label,_pos,_text_num,dist_to_next,dist_to_prev,is_next_uppercase,is_prev_uppercase,len_of_next_word,len_of_prev_word,next_punctuation_kind,prev_punctuation_kind,punctuation_kind
0,-1,218,0,62,219,1,1,2,8,.,|,.
1,1,280,0,9,62,1,1,7,1,.,.,.
2,-1,289,0,104,9,1,1,6,7,.,.,.
3,-1,393,0,10,104,1,0,1,6,.,.,.
4,1,403,0,3,10,0,0,1,1,.,.,.


In [13]:
features = ['dist_to_next', 'dist_to_prev',
       'is_next_uppercase', 'is_prev_uppercase', 'len_of_next_word',
       'len_of_prev_word', 'next_punctuation_kind', 'prev_punctuation_kind',
       'punctuation_kind']

In [14]:
data_dict = df[features].T.to_dict().values()

### One-Hot-Encoding

In [15]:
from sklearn.feature_extraction import DictVectorizer

In [16]:
transformer = DictVectorizer(sparse=False)
X = transformer.fit_transform(data_dict)

In [17]:
joblib.dump(transformer, "./gold/models/transformer")

['./gold/models/transformer']

In [18]:
X.shape

(309, 15)

In [19]:
y = df["_label"].as_matrix()

### Train test split 

In [20]:
from sklearn.cross_validation import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=241)

In [22]:
y_train.shape, y_test.shape

((216,), (93,))

# Model Selection

In [23]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.grid_search import GridSearchCV

In [24]:
def search_params(estimator, params, X, y, scoring="roc_auc", cv=5):
    print(estimator, params)
    gsc = GridSearchCV(estimator, params, cv=cv, scoring=scoring)
    gsc.fit(X, y)
    print("Best parameters set found on development set:")
    print()
    print(gsc.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in gsc.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    print()
    return gsc.best_estimator_, gsc.best_params_ 

In [25]:
def evalute_scores(model_clf, _X_train, _X_test, _y_train, _y_test):
    model_clf.fit(_X_train, _y_train)
    _y_pred = model_clf.predict(_X_test)
    print(classification_report(_y_test, _y_pred))
    print(accuracy_score(_y_test, _y_pred))

In [26]:
def evaluate_model(model_object, params_search_grid, _X_train, _X_test, _y_train, _y_test):
    model_clf, model_params = search_params(estimator=model_object, params=params_search_grid,
                                            X=_X_train, y=_y_train,
                                            scoring="accuracy", cv=3)
    
    evalute_scores(model_clf, _X_train, _X_test, _y_train, _y_test)
    
    return model_clf

In [27]:
from sklearn.calibration import CalibratedClassifierCV

### Model 1. Logistic regression

In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
model_1_params_grid = {
    "penalty": ['l1', 'l2'],
    "C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
}

model_1 = evaluate_model(LogisticRegression(), model_1_params_grid,
              X_train, X_test, y_train, y_test)

(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0), {'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]})
Best parameters set found on development set:
()
{'penalty': 'l2', 'C': 1.0}
()
Grid scores on development set:
()
0.606 (+/-0.007) for {'penalty': 'l1', 'C': 0.001}
0.630 (+/-0.034) for {'penalty': 'l2', 'C': 0.001}
0.606 (+/-0.007) for {'penalty': 'l1', 'C': 0.01}
0.806 (+/-0.048) for {'penalty': 'l2', 'C': 0.01}
0.875 (+/-0.041) for {'penalty': 'l1', 'C': 0.1}
0.921 (+/-0.034) for {'penalty': 'l2', 'C': 0.1}
0.926 (+/-0.015) for {'penalty': 'l1', 'C': 1.0}
0.940 (+/-0.014) for {'penalty': 'l2', 'C': 1.0}
0.926 (+/-0.027) for {'penalty': 'l1', 'C': 10.0}
0.926 (+/-0.027) for {'penalty': 'l2', 'C': 10.0}
0.931 (+/-0.040) for {'penalty': 'l1', 'C': 100.0}
0.931 (+/-0.040) for {'pena

In [30]:
model_1_cl = CalibratedClassifierCV(model_1)
evalute_scores(model_1_cl, X_train, X_test, y_train, y_test)

             precision    recall  f1-score   support

         -1       0.98      0.93      0.96        60
          1       0.89      0.97      0.93        33

avg / total       0.95      0.95      0.95        93

0.94623655914


### Model 2. AdaBoost

In [18]:
from sklearn.ensemble import AdaBoostClassifier

In [19]:
model_2_params_grid = {
    "n_estimators": list(range(50, 201, 50)),
    "learning_rate": [0.1, 0.3, 0.6, 0.8, 1.0]
}

model_2 = evaluate_model(AdaBoostClassifier(), model_2_params_grid,
              X_train, X_test, y_train, y_test)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None) {'learning_rate': [0.1, 0.3, 0.6, 0.8, 1.0], 'n_estimators': [50, 100, 150, 200]}
Best parameters set found on development set:

{'n_estimators': 50, 'learning_rate': 0.1}

Grid scores on development set:

0.949 (+/-0.048) for {'n_estimators': 50, 'learning_rate': 0.1}
0.949 (+/-0.027) for {'n_estimators': 100, 'learning_rate': 0.1}
0.940 (+/-0.048) for {'n_estimators': 150, 'learning_rate': 0.1}
0.940 (+/-0.048) for {'n_estimators': 200, 'learning_rate': 0.1}
0.949 (+/-0.035) for {'n_estimators': 50, 'learning_rate': 0.3}
0.940 (+/-0.048) for {'n_estimators': 100, 'learning_rate': 0.3}
0.944 (+/-0.039) for {'n_estimators': 150, 'learning_rate': 0.3}
0.940 (+/-0.047) for {'n_estimators': 200, 'learning_rate': 0.3}
0.944 (+/-0.039) for {'n_estimators': 50, 'learning_rate': 0.6}
0.944 (+/-0.060) for {'n_estimators': 100, 'learning_rate': 0.6}
0.935 (+/-0.056) for 

In [38]:
model_2_cl = CalibratedClassifierCV(model_2)
evalute_scores(model_2_cl, X_train, X_test, y_train, y_test)

             precision    recall  f1-score   support

         -1       1.00      0.97      0.98        60
          1       0.94      1.00      0.97        33

avg / total       0.98      0.98      0.98        93

0.978494623656


### Model 3. Gradien Boosting

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
model_3_params_grid = {
    "n_estimators": list(range(50, 201, 50)),
    "max_depth": [1, 2, 3, 4, 5, 6],
    "learning_rate": [0.1, 0.4, 0.7, 1.0]
}

model_3 = evaluate_model(GradientBoostingClassifier(), model_3_params_grid,
              X_train, X_test, y_train, y_test)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False) {'learning_rate': [0.1, 0.4, 0.7, 1.0], 'n_estimators': [50, 100, 150, 200], 'max_depth': [1, 2, 3, 4, 5, 6]}
Best parameters set found on development set:

{'n_estimators': 50, 'learning_rate': 0.4, 'max_depth': 1}

Grid scores on development set:

0.944 (+/-0.024) for {'n_estimators': 50, 'learning_rate': 0.1, 'max_depth': 1}
0.940 (+/-0.036) for {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 1}
0.940 (+/-0.036) for {'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 1}
0.944 (+/-0.041) for {'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 1}
0.935 (+/-0.049) for {'n_estimators': 50, 'learning_rate': 0.1

In [37]:
model_3_cl = CalibratedClassifierCV(model_3)
evalute_scores(model_3_cl, X_train, X_test, y_train, y_test)

             precision    recall  f1-score   support

         -1       0.98      0.98      0.98        60
          1       0.97      0.97      0.97        33

avg / total       0.98      0.98      0.98        93

0.978494623656


### Model 4. SVM

In [22]:
from sklearn.svm import SVC

In [23]:
model_4_params_grid = {
    "kernel": ('linear', 'rbf'),
    "C": [1.0, 10.0, 100.0]
}

model_4 = evaluate_model(SVC(), model_4_params_grid,
              X_train, X_test, y_train, y_test)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) {'C': [1.0, 10.0, 100.0], 'kernel': ('linear', 'rbf')}
Best parameters set found on development set:

{'C': 10.0, 'kernel': 'linear'}

Grid scores on development set:

0.907 (+/-0.036) for {'C': 1.0, 'kernel': 'linear'}
0.694 (+/-0.072) for {'C': 1.0, 'kernel': 'rbf'}
0.921 (+/-0.013) for {'C': 10.0, 'kernel': 'linear'}
0.694 (+/-0.050) for {'C': 10.0, 'kernel': 'rbf'}
0.917 (+/-0.024) for {'C': 100.0, 'kernel': 'linear'}
0.694 (+/-0.050) for {'C': 100.0, 'kernel': 'rbf'}

             precision    recall  f1-score   support

         -1       1.00      0.93      0.97        60
          1       0.89      1.00      0.94        33

avg / total       0.96      0.96      0.96        93

0.956989247312


In [36]:
model_4_cl = CalibratedClassifierCV(model_4)
evalute_scores(model_4_cl, X_train, X_test, y_train, y_test)

             precision    recall  f1-score   support

         -1       1.00      0.97      0.98        60
          1       0.94      1.00      0.97        33

avg / total       0.98      0.98      0.98        93

0.978494623656


# Store Model

In [31]:
joblib.dump(model_1, "./gold/models/model_1")

['./gold/models/model_1',
 './gold/models/model_1_01.npy',
 './gold/models/model_1_02.npy',
 './gold/models/model_1_03.npy']