In [1]:
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import itertools
%matplotlib inline

In [3]:
# Load the raw dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="./sample file")

In [None]:
# Shuffle the rows with a fixed seed. Without a seed the row order changes on
# every run, so the seeded train/test split below (and every score computed
# from it) would still differ from run to run.
from sklearn.utils import shuffle
data = shuffle(data, random_state=42)

# Features are the first 16 columns; the target is the last column.
# NOTE(review): if the file has exactly 16 columns, the target column is also
# included in the features -- confirm the column layout of the input file.
x = data.iloc[:, 0:16]
y = data.iloc[:, -1]

# Hold out 30% of the rows as a test set; the remaining 70% is used for training.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [None]:
def cross_validaion(model, x, y, model_path="decision_tree_3_vector_model.pkl"):
    """Cross-validate ``model``, plot diagnostics, then refit it and pickle it.

    Steps, in order:

    1. 5-fold ``cross_val_score`` on ``(x, y)``; the mean score and a
       ``+/- 2 * std`` spread are printed.
    2. 5-fold ``cross_val_predict`` produces out-of-fold predictions, from
       which a confusion matrix is computed and plotted.
    3. ``model`` is refit on all of ``(x, y)`` (the estimator passed in is
       modified in place) and pickled to ``model_path``.
    4. The refit model is passed to ``plot_classification_data``.

    Parameters
    ----------
    model : estimator
        Object accepted by ``cross_val_score`` / ``cross_val_predict`` that
        also provides ``fit``.
    x, y : features and target
        The confusion-matrix plot is labelled with classes ``[0, 1]``, so
        binary 0/1 targets are assumed -- TODO confirm against the dataset.
    model_path : str, optional
        Where the refit model is pickled. The default keeps the historical
        file name, which is shared by every model type: successive calls
        overwrite each other unless a distinct path is passed.

    Returns
    -------
    float
        Mean of the five cross-validation scores.
    """
    from sklearn.model_selection import cross_val_score, cross_val_predict
    from sklearn.metrics import confusion_matrix
    import pickle

    scores = cross_val_score(model, x, y, cv=5)
    mean_score = scores.mean()
    print("K FOLD ACCURACY")
    print(mean_score)
    print("**********")
    print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, scores.std() * 2))
    print("Save the Model")

    # Out-of-fold predictions: every row is predicted by a model that did not
    # see it during fitting.
    y_pred = cross_val_predict(model, x, y, cv=5)
    conf_mat = confusion_matrix(y, y_pred)
    print("K Fold Validation Confusion Matrix")
    plot_confusion_matrix(conf_mat, [0, 1])

    # The persisted model is trained on the whole dataset.
    model.fit(x, y)
    # Context manager guarantees the file is flushed and closed, even if
    # pickling fails part-way through.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    plot_classification_data(model, x, y)
    return mean_score


def plot_classification_data(model, x, y):
    """Show true and predicted labels on a 2-D PCA projection of ``x``.

    ``x`` is projected onto its first two principal components. The same
    points are drawn in two side-by-side panels: the left one coloured by the
    true labels ``y``, the right one by ``model.predict(x)``. Separate panels
    are used because both point sets share identical coordinates; drawing
    them on one axes would let the second set completely cover the first.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x : feature matrix accepted by ``model.predict`` and ``PCA``.
    y : true labels, one per row of ``x``; used as the colour values.

    Returns
    -------
    None
    """
    from sklearn.decomposition import PCA

    predicted_values = model.predict(x)
    proj = PCA(n_components=2).fit_transform(x)

    fig, (ax_true, ax_pred) = plt.subplots(1, 2, figsize=(12, 5),
                                           sharex=True, sharey=True)
    panels = (
        (ax_true, y, "True labels"),
        (ax_pred, predicted_values, "Predicted labels"),
    )
    for ax, colour_values, panel_title in panels:
        points = ax.scatter(proj[:, 0], proj[:, 1], c=colour_values, cmap="Paired")
        # One colorbar per panel, attached to the panel it describes.
        fig.colorbar(points, ax=ax)
        ax.set_title(panel_title)
        ax.set_xlabel("Principal component 1")
        ax.set_ylabel("Principal component 2")

    fig.tight_layout()
    plt.show()
    
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    cm        -- 2-D array of counts (rows: true label, columns: predicted label,
                 matching the axis labels set below).
    classes   -- tick labels, used for both axes.
    normalize -- when True, each row is divided by its row sum before
                 printing/plotting and cells are rendered with two decimals.
    title     -- figure title.
    cmap      -- matplotlib colormap used for the heat map.
    """
    plt.figure()

    banner = 'Confusion matrix, without normalization'
    if normalize:
        as_float = cm.astype('float')
        cm = as_float / cm.sum(axis=1)[:, np.newaxis]
        banner = "Normalized confusion matrix"
    print(banner)

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so they stay readable.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_colour = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

def get_confusion_matrix(y_true, y_pred):
    """Compute the confusion matrix of ``y_pred`` against ``y_true`` and plot it.

    The plot is always labelled with the classes ``[0, 1]``. Nothing is returned.
    """
    from sklearn import metrics
    plot_confusion_matrix(metrics.confusion_matrix(y_true, y_pred), [0, 1])
    
def print_decision_tree(model, filename):
    '''
    Export a fitted decision tree to ``filename`` in Graphviz ``.dot`` format.

    Rendering the ``.dot`` file as an image requires the Graphviz tools,
    which are installed separately (the original notes used
    ``brew install graphviz``). To convert the export to PNG:

        dot -Tpng filename.dot -o filename.png

    The original notes also record the following command, run while setting
    up Graphviz -- presumably to fix Homebrew directory permissions; confirm
    before relying on it:

        sudo chown -R $(whoami) /usr/local/share/info /usr/local/share/man/man3 /usr/local/share/man/man5
    '''
    from sklearn.tree import export_graphviz
    export_graphviz(model, out_file=filename)
    
#Logistic Regression
def logisic_regression(X_train, y_train, X_test, y_test, x, y):
    """Evaluate multinomial logistic regression (``saga`` solver).

    A model is fit on ``(X_train, y_train)`` and its accuracy on the training
    and test splits is printed. A fresh, unfitted model is then handed to
    ``cross_validaion`` together with the full dataset ``(x, y)``.

    Returns
    -------
    float
        The value returned by ``cross_validaion`` (mean cross-validation
        score), consistent with ``decision_tree``, ``svm`` and
        ``ada_boost_classifier``.
    """
    from sklearn.linear_model import LogisticRegression

    logreg = LogisticRegression(multi_class="multinomial", solver="saga")
    logreg.fit(X_train, y_train)
    print('Accuracy of Logistic regression classifier on training set: {:.2f}'
         .format(logreg.score(X_train, y_train)))
    print('Accuracy of Logistic regression classifier on test set: {:.2f}'
         .format(logreg.score(X_test, y_test)))

    # Cross-validation starts from an unfitted estimator with the same settings.
    logreg = LogisticRegression(multi_class="multinomial", solver="saga")
    return cross_validaion(logreg, x, y)
    
#Decision Tree
def decision_tree(X_train,y_train, X_test, y_test, x, y):
    '''
    Evaluate a depth-8 decision tree.

    Cross-validation is used only to estimate the error; the tree that is
    exported and used for the confusion matrix is trained on the entire
    dataset (x, y).

    Returns whatever cross_validaion returns for a fresh depth-8 tree.
    '''
    from sklearn.tree import DecisionTreeClassifier

    # Hold-out evaluation on the train/test split.
    holdout_tree = DecisionTreeClassifier(max_depth=8)
    holdout_tree.fit(X_train, y_train)
    train_accuracy = holdout_tree.score(X_train, y_train)
    print('Accuracy of Decision Tree classifier on training set: {:.2f}'
         .format(train_accuracy))
    test_accuracy = holdout_tree.score(X_test, y_test)
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'
         .format(test_accuracy))

    # Tree trained on every row: report importances, export it, and plot its
    # confusion matrix on the same data it was trained on.
    full_tree = DecisionTreeClassifier(max_depth=8)
    full_tree.fit(x, y)
    print("important features++++++++++++++")
    print(full_tree.feature_importances_)
    print_decision_tree(full_tree, "./decision_tree.dot")
    print("Train and Test Validation Confusion Matrix")
    full_predictions = full_tree.predict(x)
    get_confusion_matrix(y, full_predictions)

    # Cross-validation gets its own unfitted tree.
    return cross_validaion(DecisionTreeClassifier(max_depth=8), x, y)
    
#K nearest neighbours
def nearest_neighbours(X_train, y_train, X_test, y_test, x, y):
    """Evaluate a k-nearest-neighbours classifier with default settings.

    A model is fit on ``(X_train, y_train)`` and its accuracy on the training
    and test splits is printed. A fresh, unfitted model is then handed to
    ``cross_validaion`` together with the full dataset ``(x, y)``.

    Returns
    -------
    float
        The value returned by ``cross_validaion`` (mean cross-validation
        score), consistent with ``decision_tree``, ``svm`` and
        ``ada_boost_classifier``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    print('Accuracy of K-NN classifier on training set: {:.2f}'
         .format(knn.score(X_train, y_train)))
    print('Accuracy of K-NN classifier on test set: {:.2f}'
         .format(knn.score(X_test, y_test)))

    # Cross-validation starts from an unfitted estimator.
    knn = KNeighborsClassifier()
    return cross_validaion(knn, x, y)
    
def discriminant_analysis(X_train, y_train, X_test, y_test, x, y):
    """Evaluate linear discriminant analysis with default settings.

    A model is fit on ``(X_train, y_train)`` and its accuracy on the training
    and test splits is printed. A fresh, unfitted model is then handed to
    ``cross_validaion`` together with the full dataset ``(x, y)``.

    Returns
    -------
    float
        The value returned by ``cross_validaion`` (mean cross-validation
        score), consistent with ``decision_tree``, ``svm`` and
        ``ada_boost_classifier``.
    """
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)
    print('Accuracy of LDA classifier on training set: {:.2f}'
         .format(lda.score(X_train, y_train)))
    print('Accuracy of LDA classifier on test set: {:.2f}'
         .format(lda.score(X_test, y_test)))

    # Cross-validation starts from an unfitted estimator.
    lda = LinearDiscriminantAnalysis()
    return cross_validaion(lda, x, y)

def guasian_naive_bayes(X_train, y_train, X_test, y_test, x, y):
    """Evaluate a Gaussian naive Bayes classifier with default settings.

    A model is fit on ``(X_train, y_train)`` and its accuracy on the training
    and test splits is printed. A fresh, unfitted model is then handed to
    ``cross_validaion`` together with the full dataset ``(x, y)``.

    Returns
    -------
    float
        The value returned by ``cross_validaion`` (mean cross-validation
        score), consistent with ``decision_tree``, ``svm`` and
        ``ada_boost_classifier``.
    """
    from sklearn.naive_bayes import GaussianNB

    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    print('Accuracy of GNB classifier on training set: {:.2f}'
         .format(gnb.score(X_train, y_train)))
    print('Accuracy of GNB classifier on test set: {:.2f}'
         .format(gnb.score(X_test, y_test)))

    # Cross-validation starts from an unfitted estimator.
    gnb = GaussianNB()
    return cross_validaion(gnb, x, y)
    
def neural_net(X_train, y_train, X_test, y_test, x, y):
    """Evaluate a multi-layer perceptron (hidden layers 32-16-2, SGD solver).

    A model is fit on ``(X_train, y_train)`` and its accuracy on the training
    and test splits is printed. A fresh, unfitted model with the same settings
    (including ``random_state=1``) is then handed to ``cross_validaion``
    together with the full dataset ``(x, y)``.

    Returns
    -------
    float
        The value returned by ``cross_validaion`` (mean cross-validation
        score), consistent with ``decision_tree``, ``svm`` and
        ``ada_boost_classifier``.
    """
    clf = MLPClassifier(solver='sgd', alpha=0.001, hidden_layer_sizes=(32, 16, 2), random_state=1)
    clf.fit(X_train, y_train)
    predicted_values = clf.predict(X_test)
    actual_values = y_test
    print("Accuracy of Neural Network classifier on train set: {}".format(accuracy_score(y_train, clf.predict(X_train))))
    print("Accuracy of Neural Network classifier on test set: {}".format(accuracy_score(actual_values, predicted_values)))

    # Cross-validation starts from an unfitted estimator.
    clf = MLPClassifier(solver='sgd', alpha=0.001, hidden_layer_sizes=(32, 16, 2), random_state=1)
    return cross_validaion(clf, x, y)
    
def svm(X_train, y_train, X_test, y_test, x, y):
    """Evaluate a support vector classifier with default settings.

    Prints train/test accuracy for a model fit on the training split, plots
    the confusion matrix of a second model fit on (and predicting) the full
    dataset, and finally returns the result of ``cross_validaion`` for a
    fresh, unfitted model on ``(x, y)``.
    """
    from sklearn.svm import SVC

    # Hold-out evaluation on the train/test split.
    holdout_model = SVC()
    holdout_model.fit(X_train, y_train)
    train_accuracy = holdout_model.score(X_train, y_train)
    print('Accuracy of SVM classifier on training set: {:.2f}'
         .format(train_accuracy))
    test_accuracy = holdout_model.score(X_test, y_test)
    print('Accuracy of SVM classifier on test set: {:.2f}'
         .format(test_accuracy))

    # Model trained on every row, scored on the same rows.
    full_model = SVC()
    full_model.fit(x, y)
    print("Train And Test Confusion Matrix")
    full_predictions = full_model.predict(x)
    get_confusion_matrix(y, full_predictions)

    # Cross-validation gets its own unfitted estimator.
    return cross_validaion(SVC(), x, y)

def convert_labels_to_one_not_encode(labels):
    """One-hot encode binary labels into an ``(n, 2)`` integer array.

    A label equal to ``0`` becomes ``[1, 0]``; any other value becomes
    ``[0, 1]``. Rows of the result follow the positional order of ``labels``.

    Labels are read by position, not by index label. This matters for a
    pandas Series whose index is not ``0..n-1`` (for example after shuffling
    or a train/test split): ``labels[i]`` would look up the index *label*
    ``i``, returning rows in the wrong order or raising ``KeyError``.

    Parameters
    ----------
    labels : 1-D sequence, NumPy array or pandas Series of labels.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(labels), 2)``; shape ``(0, 2)`` for
        empty input.
    """
    values = np.asarray(labels)
    # Column index of the "hot" entry for each row: 0 for label 0, 1 otherwise.
    hot_column = (values != 0).astype(int)
    # Row k of the 2x2 identity is exactly the one-hot vector for column k.
    return np.eye(2, dtype=int)[hot_column]

def random_forest_classifier(X_train, y_train, X_test, y_test, x, y):
    """Evaluate a random forest (20 trees, ``max_depth=5``).

    A forest is fit on ``(X_train, y_train)`` and its accuracy on the training
    and test splits is printed. A fresh, unfitted forest with the same
    settings is then handed to ``cross_validaion`` together with the full
    dataset ``(x, y)``.

    Returns
    -------
    float
        The value returned by ``cross_validaion`` (mean cross-validation
        score), consistent with ``decision_tree``, ``svm`` and
        ``ada_boost_classifier``.
    """
    from sklearn.ensemble import RandomForestClassifier

    # n_estimators is the number of trees in the forest.
    random_forest = RandomForestClassifier(n_estimators=20, max_depth=5).fit(X_train, y_train)
    # The messages name the model actually being scored (they previously said
    # "Decision Tree", copied from another helper).
    print('Accuracy of Random Forest classifier on training set: {:.2f}'
         .format(random_forest.score(X_train, y_train)))
    print('Accuracy of Random Forest classifier on test set: {:.2f}'
         .format(random_forest.score(X_test, y_test)))

    # Cross-validation starts from an unfitted estimator.
    random_forest = RandomForestClassifier(n_estimators=20, max_depth=5)
    return cross_validaion(random_forest, x, y)
    
def bagging_classifier(X_train, y_train, X_test, y_test, x, y):
    """Evaluate a bagging ensemble of depth-5 decision trees.

    The ensemble uses 20 estimators, each trained on half of the samples
    (``max_samples=0.5``) and all features (``max_features=1.0``). It is fit
    on ``(X_train, y_train)`` and its accuracy on the training and test splits
    is printed. A fresh, unfitted ensemble with the same settings is then
    handed to ``cross_validaion`` together with the full dataset ``(x, y)``.

    Returns
    -------
    float
        The value returned by ``cross_validaion`` (mean cross-validation
        score), consistent with ``decision_tree``, ``svm`` and
        ``ada_boost_classifier``.
    """
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier

    def build_model():
        # Single place for the ensemble settings so both models stay identical.
        return BaggingClassifier(DecisionTreeClassifier(max_depth=5),
                                 max_samples=0.5, max_features=1.0, n_estimators=20)

    bag_classifier = build_model()
    bag_classifier.fit(X_train, y_train)
    # The messages name the model actually being scored (they previously said
    # "Decision Tree", copied from another helper).
    print('Accuracy of Bagging classifier on training set: {:.2f}'
         .format(bag_classifier.score(X_train, y_train)))
    print('Accuracy of Bagging classifier on test set: {:.2f}'
         .format(bag_classifier.score(X_test, y_test)))

    # Cross-validation starts from an unfitted estimator.
    return cross_validaion(build_model(), x, y)

def ada_boost_classifier(X_train, y_train, X_test, y_test, x, y):
    """Evaluate AdaBoost over shallow decision trees (20 estimators).

    Three models with identical settings are used:

    1. one fit on ``(X_train, y_train)``, whose train/test accuracy is printed;
    2. one fit on the full dataset ``(x, y)``, whose confusion matrix on that
       same data is plotted;
    3. an unfitted one handed to ``cross_validaion`` with ``(x, y)``.

    All three use base trees of ``max_depth=3``. Previously the
    cross-validated model used ``max_depth=7``, so the cross-validation score
    described a different model from the one whose hold-out accuracy and
    confusion matrix were reported.

    Returns
    -------
    float
        The value returned by ``cross_validaion`` (mean cross-validation score).
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier

    # Shallow base trees limit overfitting; depth 3 is the setting the
    # original notes recommend.
    base_tree_depth = 3

    def build_model():
        # Single place for the ensemble settings so all three models match.
        return AdaBoostClassifier(DecisionTreeClassifier(max_depth=base_tree_depth),
                                  n_estimators=20, learning_rate=1)

    ada_boost = build_model()
    ada_boost.fit(X_train, y_train)
    # The messages name the model actually being scored (they previously said
    # "Decision Tree", copied from another helper).
    print('Accuracy of AdaBoost classifier on training set: {:.2f}'
         .format(ada_boost.score(X_train, y_train)))
    print('Accuracy of AdaBoost classifier on test set: {:.2f}'
         .format(ada_boost.score(X_test, y_test)))

    ada_boost = build_model()
    ada_boost.fit(x, y)
    print("Train and Test Validation Confusion Matrix")
    get_confusion_matrix(y, ada_boost.predict(x))

    # Cross-validation starts from an unfitted estimator.
    return cross_validaion(build_model(), x, y)

In [None]:
# Example: every model helper defined above is invoked with these same six arguments.
random_forest_classifier(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    x=x,
    y=y,
)