In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder
try:
    # Current location of the splitter in scikit-learn.
    from sklearn.model_selection import StratifiedShuffleSplit
except ImportError:
    # Legacy module, only present in old scikit-learn releases.
    from sklearn.cross_validation import StratifiedShuffleSplit

from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
'''
Ideas de 10 classifier showdown
'''

In [None]:
# Candidate estimators for the showdown, assembled family by family.
# The resulting order is the order in which they are fitted and reported.
classifiers = []

# Instance-based and kernel methods.
classifiers.append(KNeighborsClassifier(3))
classifiers.extend([
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
])

# Single tree and tree ensembles.
classifiers.extend([
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
])

# Generative / discriminant-analysis models.
classifiers.extend([
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
])

In [None]:
# Logging for Visual Comparison
#
# Fits every estimator in `classifiers` on (X_train, y_train), evaluates it on
# (X_test, y_test) and collects accuracy (in percent) and log loss in `log`.
#
# NOTE: X_train, y_train, X_test and y_test must already exist in the kernel;
# in this notebook they are created by the data-loading cell further down.
# `classifiers` must be the flat list of estimators (a later cell rebinds that
# name to a list of dicts, which this loop cannot consume).
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log_rows = []  # one [name, accuracy %, log loss] record per estimator

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    y_predict = clf.predict(X_test)
    acc = accuracy_score(y_test, y_predict)
    print("Accuracy: {:.4%}".format(acc))

    # Keep probabilities in their own variable instead of overwriting the
    # predicted labels, and tell log_loss which class each column refers to
    # (predict_proba columns follow clf.classes_).
    y_proba = clf.predict_proba(X_test)
    ll = log_loss(y_test, y_proba, labels=clf.classes_)
    print("Log Loss: {}".format(ll))

    log_rows.append([name, acc*100, ll])

print("="*30)

# Build the frame once: DataFrame.append was removed in pandas 2.0 and growing
# a frame row by row copies it on every iteration.
log = pd.DataFrame(log_rows, columns=log_cols)

In [None]:
'''
Fichero de Marc para sacar ideas.
Diccionario de clasificadores.
'''

In [None]:
import pandas as pd
# Load the Titanic train/test CSVs and build the feature matrices.
# The target predicted in this notebook is the passenger class ("Pclass").
DATA_DIR = "~/Downloads/titanic"  # single place to change the data location
feature_cols = ["Sex","Age","SibSp","Parch","Fare","Embarked"]
categorical_cols = ["Embarked","Sex"]

df_train = pd.read_csv("{}/train.csv".format(DATA_DIR))
df_test = pd.read_csv("{}/test.csv".format(DATA_DIR))

# One-hot encode the categorical columns (dropping the first level of each)
# and replace any remaining NaN with 0.
X_train = pd.get_dummies(df_train[feature_cols], columns=categorical_cols, drop_first=True).fillna(0)
y_train = df_train["Pclass"]

# Encode the test split with ALL levels and then project it onto the training
# columns. Encoding it independently with drop_first=True could drop a
# different level (or produce different columns) whenever the test split does
# not contain exactly the same categories as the training split.
X_test = (
    pd.get_dummies(df_test[feature_cols], columns=categorical_cols)
    .reindex(columns=X_train.columns, fill_value=0)
    .fillna(0)
)
y_test = df_test["Pclass"]


display(X_train.head(),y_train.head())
display(X_test.head(),y_test.head())


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# Models
# Fixed seed so the stochastic estimators give the same scores on every run.
SEED = 42
svmCLS = SVC(kernel="sigmoid",probability=True, gamma="auto", C=100, random_state=SEED)
lsrCLS = LogisticRegression(solver="lbfgs", max_iter=1000)
rforest = RandomForestClassifier(n_estimators=10, random_state=SEED)
# The variable name and the "RandomForest50" label below both announce 50
# trees, so the forest is built with 50 estimators to match what is reported.
rforest50 = RandomForestClassifier(n_estimators=50, random_state=SEED)

# Strategies
# Each entry pairs a display name with a base model wrapped in a multiclass
# strategy (one-vs-one or one-vs-rest).
classifiers = [
    {"name":"SVM-v1 1v1", "cls": OneVsOneClassifier(svmCLS) },
    {"name":"Logistic 1v1", "cls": OneVsOneClassifier(lsrCLS) },
    {"name":"RandomForest 1v1", "cls": OneVsOneClassifier(rforest) },
    {"name":"SVM-v1 1v-all", "cls": OneVsRestClassifier(svmCLS) },
    {"name":"Logistic 1v-all", "cls": OneVsRestClassifier(lsrCLS) },
    {"name":"RandomForest 1v-all", "cls": OneVsRestClassifier(rforest) },
    {"name":"RandomForest50 1v-all", "cls": OneVsRestClassifier(rforest50) }
]

# Train Phase
# Fit each strategy on the training split and print its score on the test
# split followed by its display name.
for entry in classifiers:
    classifier = entry["cls"]
    clsName = entry["name"]
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print("{}\t\t{}".format(score,clsName))

In [None]:
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels
import numpy as np

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; both are passed to `unique_labels` and
        `confusion_matrix`.
    classes : indexable
        Display names for the labels. It is indexed with the array of labels
        that occur in the data (`classes[labels]`), so it must support that
        lookup (for example a pandas Series keyed by label).
    normalize : bool, default False
        When True every row is divided by its row total, so cells show the
        fraction of each true label; otherwise raw counts are shown.
    title : str, optional
        Figure title. A default that depends on `normalize` is used when it
        is empty or None.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap handed to `imshow`.

    Returns
    -------
    The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Only use the labels that appear in the data, and hand the same label
    # order to confusion_matrix so rows/columns and tick labels always agree.
    labels = unique_labels(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    classes = classes[labels]
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A label that only occurs in y_pred has an all-zero row; keep that
        # row at 0 instead of dividing by zero and filling the plot with NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    # Cells darker than half the maximum get white text for contrast.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
from sklearn.metrics import confusion_matrix

# Plot the normalized confusion matrix of the last strategy in `classifiers`
# on the test split.
rf = classifiers[-1]["cls"]
y_predict = rf.predict(X_test)
# Map each label to its display name. The labels are sorted explicitly because
# a set has no guaranteed iteration order, which could attach the names to the
# wrong labels.
classes = pd.Series(index=sorted(set(y_test)),data=["clase1","clase2","clase3"])
print(y_train.value_counts())
plot_confusion_matrix(y_test,y_predict,classes, normalize=True)