## **Proyecto STI**

In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
import seaborn as sn
from IPython.display import Image  
from six import StringIO  
from sklearn import metrics 
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, normalize
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [None]:
# Load the project CSV into a DataFrame and preview its first rows.
data = pd.read_csv("data_titanic_proyecto.csv")
data.head()

In [None]:
# Summary of the columns, their dtypes and non-null counts.
data.info()

In [None]:
# Count the missing (NaN) values in each column.
data.isna().sum()

In [None]:
def remove_nan(df, column):
    """Impute the missing values of one column with that column's median.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is imputed.
    column : str
        Name of a numeric column in ``df``.
    """
    # Operate on the frame that was passed in, not on a notebook-level global,
    # so the function works for any DataFrame.
    missing = df[column].isna()
    # median() skips NaN by default, so it is computed from the observed values only.
    df.loc[missing, column] = df[column].median()

In [None]:
def count_nan_percentage(df,column):
    """Return the fraction (0..1) of entries in ``df[column]`` that are NaN.

    Despite the name, the value is a ratio, not a percentage.
    """
    missing_mask = df[column].isna()
    n_missing = missing_mask.sum()
    n_total = len(missing_mask)
    return n_missing / n_total

In [None]:
# Fraction of NaN values in the Age column.
print(count_nan_percentage(data,"Age"))
# The fraction is manageable, so the missing ages are imputed with the median.
remove_nan(data,"Age")
print("Porcentaje de NANs",count_nan_percentage(data,"Age"))

In [None]:
# Fraction of NaN values in the Cabin column.
print(data["Cabin"].isna().sum()/len(data["Cabin"]))
# Too many values are missing to impute, so the column is dropped from the data set.
data = data.drop(['Cabin'], axis=1)

In [None]:
# Embarked also has a couple of missing records; since it is a non-numeric
# column, those rows are removed instead of imputed.
cond = data["Embarked"].notna()
data = data.loc[cond]
# Re-check the per-column NaN counts after the clean-up.
data.isna().sum()


In [None]:
def graph_corr_matrix(data):
    """Print and plot the correlation matrix of the numeric columns of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse; non-numeric columns are left out of the matrix.

    Returns
    -------
    pandas.DataFrame
        The pairwise correlation matrix that was printed and plotted.
    """
    # Restrict to numeric/bool columns explicitly: DataFrame.corr() stopped
    # dropping string columns silently in pandas 2.0 and raises instead, and
    # this notebook calls the function before the categorical columns are encoded.
    numeric_data = data.select_dtypes(include=["number", "bool"])
    corrMatrix = numeric_data.corr()
    print(corrMatrix)
    sn.heatmap(corrMatrix, annot=True)
    plt.show()
    return corrMatrix

In [None]:
# Drop PassengerId, Name and Ticket: they are record-specific values
# rather than features shared across passengers.
data = data.drop(['PassengerId','Name','Ticket'], axis=1)

In [None]:
# Print and plot the correlation matrix of the cleaned data set.
corrMatrix = graph_corr_matrix(data)

In [None]:
def return_ordinal_encoding(vec_x):
    """Encode the values of a pandas Series as integer codes.

    A ``LabelEncoder`` is fitted on the distinct values of ``vec_x`` and then
    applied to every element, so equal inputs always map to the same code.

    Parameters
    ----------
    vec_x : pandas.Series
        Column of categorical values.

    Returns
    -------
    numpy.ndarray
        One integer code per element of ``vec_x``, in the original order.
    """
    distinct_values = np.asarray(vec_x.unique())
    encoder = preprocessing.LabelEncoder().fit(distinct_values)
    return encoder.transform(vec_x.values)

In [None]:
# Replace each categorical column with the integer codes produced by
# return_ordinal_encoding.
# NOTE(review): the codes presumably follow LabelEncoder's sorted label order,
# which may not match the natural ranking of passenger_class — confirm.
data['Embarked']           = return_ordinal_encoding(data['Embarked'])
data['passenger_class']    = return_ordinal_encoding(data['passenger_class'])
data['passenger_sex']      = return_ordinal_encoding(data['passenger_sex'])
data['passenger_survived'] = return_ordinal_encoding(data['passenger_survived'])

**Separamos la información de forma aleatoria en conjuntos train/val/test con scikit-learn**

In [None]:
# Features are every column except the last one; the last column is the target.
# NOTE(review): assumes passenger_survived is the last column of `data` — confirm.
x_vars = list(data.columns)[: -1]
y_var = list(data.columns)[-1]
# Hold out 20% of the rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(data[x_vars], data[y_var], test_size=0.20, random_state=42)
print("Compruebo sumas iguales",len(X_train)+len(X_test),data.shape[0])
# Split the remaining 80% again, keeping 20% of it (16% of all rows) for validation.
X_train, X_val, Y_train,  Y_val  = train_test_split(X_train, Y_train, test_size=0.20, random_state=42)
print("Compruebo sumas iguales",len(X_train)+len(X_val)+len(X_test))

In [None]:
# Distinct integer codes now stored in Embarked.
data['Embarked'].unique()

In [None]:
def training_decision_tree(x_train,y_train,x_test, max_depth=None,critery="entropy"):
    """Fit a decision tree and predict labels for ``x_test``.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test
        Features to predict on.
    max_depth : int or None
        Forwarded to ``DecisionTreeClassifier(max_depth=...)``.
    critery : str
        Forwarded to ``DecisionTreeClassifier(criterion=...)``.

    Returns
    -------
    tuple
        ``(fitted_classifier, predictions_for_x_test)``.
    """
    tree = DecisionTreeClassifier(criterion=critery, max_depth=max_depth).fit(x_train, y_train)
    predictions = tree.predict(x_test)
    return tree, predictions

In [None]:
def calculate_accuracy(y_test, y_pred):
    """Print the accuracy of ``y_pred`` against the true labels ``y_test``."""
    score = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy:", score)

In [None]:
# Experiment 1: decision tree limited to depth 3, scored on the validation split.
model, y_pred = training_decision_tree(X_train, Y_train, X_val, max_depth=3)
# True labels first, predictions second, matching calculate_accuracy(y_test, y_pred).
calculate_accuracy(Y_val, y_pred)

In [None]:
def visualize_decision_tree(clf, x_vars,file_name):
    """Render a fitted decision tree to a PNG file and return it for display.

    Parameters
    ----------
    clf
        Fitted tree accepted by ``export_graphviz``.
    x_vars : list of str
        Feature names shown on the split nodes.
    file_name : str
        Path of the PNG file to write.

    Returns
    -------
    IPython.display.Image
        The rendered tree. When the call is the last expression of a notebook
        cell the image is shown inline.
    """
    dot_data = StringIO()
    export_graphviz(clf, out_file=dot_data,
                    filled=True, rounded=True,
                    special_characters=True, feature_names=x_vars, class_names=['0', '1'])
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(file_name)
    # Previously the Image was constructed and thrown away, so the function
    # never displayed anything; returning it makes the rendering usable.
    return Image(graph.create_png())

In [None]:
# Render the fitted tree and save it as arbol.png.
visualize_decision_tree(model,x_vars,'arbol.png')

In [None]:
# Display the saved tree image inline.
Image('arbol.png')

## **Entrenamiento con SVM**

In [None]:
def train_svm(x_train, y_train, x_test, kernel ="linear", c = 10):
    """Fit a support-vector classifier and predict labels for ``x_test``.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test
        Features to predict on.
    kernel : str
        Forwarded to ``svm.SVC(kernel=...)``.
    c : float
        Forwarded to ``svm.SVC(C=...)``.

    Returns
    -------
    tuple
        ``(fitted_classifier, predictions_for_x_test)``.
    """
    classifier = svm.SVC(kernel=kernel, C=c)
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    return classifier, predictions

In [None]:
# SVM experiment: polynomial kernel with C=10000, scored on the validation split.
model, y_pred = train_svm(X_train, Y_train, X_val, kernel="poly", c=10000)
# True labels first, predictions second, matching calculate_accuracy(y_test, y_pred).
calculate_accuracy(Y_val, y_pred)

## **Entrenamiento para algoritmo NAIVE BAYES**

In [None]:
def train_naive_bayes(x_train,y_train, x_test):
    """Estimate per-class Gaussian statistics and class priors for labels 0/1.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features, one row per sample and one column per feature.
    y_train : pandas.Series
        Labels aligned with ``x_train``; rows are split on ``== 0`` and ``== 1``.
    x_test
        Not used; kept so existing calls keep working.

    Returns
    -------
    dict
        ``{'0': (stats, prior), '1': (stats, prior)}`` where ``stats`` is an
        array with one ``[mean, std]`` row per feature column and ``prior`` is
        the fraction of training rows that belong to the class.
    """
    def column_stats(features):
        # Reduce along the rows explicitly: with no axis, np.mean/np.std on a
        # DataFrame collapse to one scalar on recent pandas versions.
        # ddof=0 keeps the population standard deviation that np.std computes.
        column_means = features.mean(axis=0)
        column_stds = features.std(axis=0, ddof=0)
        return np.stack((column_means, column_stds)).T

    x_not_surv = x_train[y_train == 0]
    x_survided = x_train[y_train == 1]

    stadistic_not_surv = column_stats(x_not_surv)
    stadistic_survided = column_stats(x_survided)

    # Priors come from the labels passed in, not from a notebook-level global.
    prob_class_1 = np.mean(y_train)
    prob_class_0 = 1 - prob_class_1

    summaries = {
        '0': (stadistic_not_surv, prob_class_0),
        '1': (stadistic_survided, prob_class_1),
    }
    print(len(summaries))

    # Trace the [mean, std] pair of every feature for each class.
    for class_summaries, _prior in summaries.values():
        for feature_stats in class_summaries:
            print("I va por", feature_stats)

    return summaries


   

    
    

In [None]:
# Compute the per-class statistics on the training split.
train_naive_bayes(X_train, Y_train, X_val)

In [None]:
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density N(mean, stdev**2) at ``x``."""
    squared_deviation = (x - mean) ** 2
    gaussian_kernel = np.exp(-(squared_deviation / (2 * stdev**2)))
    normalizer = np.sqrt(2 * np.pi) * stdev
    return (1 / normalizer) * gaussian_kernel

# Number of training rows; calculate_class_probabilities divides the class counts by it.
total_rows = len(X_train)

def calculate_class_probabilities(summaries, row):
    """Score one sample against every class.

    Parameters
    ----------
    summaries : dict
        Maps a class label to ``(stats, class_count)``; ``stats[i]`` is the
        ``(mean, stdev)`` pair of feature ``i``.
    row
        Two-dimensional container holding a single sample; only ``row[0]`` is read.

    Returns
    -------
    dict
        For each class label, ``class_count / total_rows`` multiplied by the
        Gaussian density of every feature value.
    """
    sample = row[0]
    probabilities = {}
    for class_value, class_entry in summaries.items():
        feature_stats = class_entry[0]
        score = class_entry[1] / float(total_rows)
        for idx in range(len(feature_stats)):
            mean, stdev = feature_stats[idx]
            score *= calculate_probability(sample[idx], mean, stdev)
        probabilities[class_value] = score
    return probabilities
    

    
# Number of training rows in each class.
total_y0_rows = len(X_train[Y_train == 0])
total_y1_rows = len(X_train[Y_train == 1])

# Hard-coded [mean, std] rows, one per feature.
# NOTE(review): presumably copied from the output of train_naive_bayes; the
# values will not follow changes to the data or the split — confirm and
# consider computing them instead.
# NOTE(review): prob_1 is later used for class '0' and prob_2 for class '1';
# verify that each table really belongs to that class.
prob_1 =np.asarray([[28.04509091, 13.30357469],
                    [ 0.47272727,  0.72248424],
                    [ 0.40909091 , 0.72357012],
                    [48.60174318 ,68.99254371],
                    [ 1.36818182 , 0.87172054],
                    [ 1.02727273 , 0.86296625],
                    [ 0.32272727 , 0.46751939]]
                  )

prob_2 =np.array([[30.04454023, 12.6258105 ],
                    [ 0.62643678 , 1.32117675],
                    [ 0.37643678 , 0.88031927],
                    [23.22605259 ,33.08035429],
                    [ 1.63505747 , 0.70843096],
                    [ 0.44827586 , 0.73513812],
                    [ 0.85057471 , 0.35650718]]
                  )

def predict(summaries, x_train):
    """Return the class label with the highest score for one sample.

    ``x_train`` is forwarded unchanged to ``calculate_class_probabilities``.
    Ties keep the label that appears first in ``summaries``; ``None`` is
    returned when there are no classes.
    """
    probabilities = calculate_class_probabilities(summaries, x_train)
    # max() only replaces its candidate on a strictly greater score, so the
    # first label wins ties.
    return max(probabilities, key=probabilities.get, default=None)


# Per-class model: (feature statistics, number of training rows in the class).
summaries = dict()
summaries['0'] = (prob_1,total_y0_rows)
summaries['1'] = (prob_2,total_y1_rows)

# Copy of the training features and its first row as a one-row frame;
# neither is read again in this cell.
df = X_train.copy()
x_1 =df.iloc[[0]] 

# Predict the class of the first 15 training rows, one row at a time;
# .iloc[[i]].values yields the 2-D array that predict expects.
for i in range(15):
    print(predict(summaries,X_train.iloc[[i]].values))


## Algoritmo de Regresión Logística

In [None]:
import tensorflow as tf

In [None]:
def predict_logistic_reg(X_test, weights):
    """Predict 0/1 labels with trained logistic-regression weights.

    Builds a fresh TensorFlow graph that computes
    ``round(sigmoid(X_test @ weights.T))`` and evaluates it in a new session.

    Parameters
    ----------
    X_test
        Two-dimensional feature array fed to the ``tensor_x`` placeholder.
    weights
        Two-dimensional weight array fed to the ``tensor_w`` placeholder.

    Returns
    -------
    numpy.ndarray
        The rounded predictions, transposed.
    """
    tf.reset_default_graph()

    tensor_w = tf.placeholder(tf.float32, shape = [None, None], name = "tensor_w")
    tensor_x = tf.placeholder(tf.float32, shape = [None, None], name = "tensor_x")

    scores = tf.matmul(tensor_x, tf.transpose(tensor_w))
    prediction = tf.round(tf.nn.sigmoid(scores))

    with tf.train.MonitoredSession() as session:
        predictions = session.run(prediction, feed_dict={tensor_x: X_test, tensor_w: weights})

    return predictions.T

In [None]:
class logistic_regression:
    """Binary logistic regression built with the TensorFlow 1.x graph API.

    The model is a single weight row vector (no bias term) trained by plain
    gradient descent on the sigmoid cross-entropy, optionally regularized.
    """

    def __init__(self, x,  lambda_parameter, regularization):
        """Create the weight variable and store the regularization settings.

        Parameters
        ----------
        x
            Array-like whose ``shape[1]`` gives the number of features.
        lambda_parameter : float
            Strength of the regularization penalty.
        regularization : {'lasso', 'ridge', None}
            Penalty type; ``None`` trains without regularization.
        """
        # Weight vector for the logits, initialized to zeros
        self.w = tf.get_variable("weights", dtype = tf.float32, shape = [1,x.shape[1]], initializer = tf.zeros_initializer())
        self.lambda_parameter = tf.constant(lambda_parameter, dtype = tf.float32)
        self.regularization = regularization

    def logits(self, x):
        """Return the linear scores ``x @ w.T``."""
        with tf.name_scope("logits"):
            return tf.matmul(x, tf.transpose(self.w))

    def prediction(self, x):
        """Return hard 0/1 predictions: the sigmoid of the logits, rounded."""
        sigmoid = tf.nn.sigmoid(self.logits(x))
        predictions = tf.round(sigmoid)
        return predictions

    def accuracy(self, y_hat, y):
        """Return the fraction of entries of ``y_hat`` that equal ``y``."""
        with tf.name_scope("accuracy"):
            accuracy = tf.equal(y_hat, y)
            accuracy = tf.cast(accuracy, tf.int32)
            accuracy = tf.divide(tf.reduce_sum(accuracy), tf.shape(y)[0])
            return accuracy

    def loss(self, x, y):
        """Return the mean sigmoid cross-entropy plus the configured penalty.

        Raises
        ------
        ValueError
            If ``regularization`` is not 'lasso', 'ridge' or ``None``.
        """
        cross_entropy = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels = y, logits = self.logits(x)))
        if self.regularization is None:
            return cross_entropy
        if self.regularization == 'lasso':
            penalty = tf.reduce_sum(tf.abs(self.w))
        elif self.regularization == 'ridge':
            # Squared L2 norm. Taking the square root here makes the gradient
            # NaN at w == 0, which is exactly where the weights are initialized.
            penalty = tf.reduce_sum(tf.square(self.w))
        else:
            # Previously an unknown value fell through and failed later with an
            # unbound-variable error; fail early with a clear message instead.
            raise ValueError(
                "regularization must be 'lasso', 'ridge' or None, got %r" % (self.regularization,)
            )
        return tf.add(cross_entropy, tf.multiply(self.lambda_parameter, penalty))

    def update(self, x_train, y_train, x_test, y_test, learningrate):
        """Build one gradient-descent step together with its monitoring ops.

        Returns
        -------
        tuple
            ``(updated_w, train_error, test_error, train_accuracy, test_accuracy,
            train_error_summary, test_error_summary, train_accuracy_summary,
            test_accuracy_summary)``.
        """
        with tf.name_scope("error"):
            train_error = self.loss(x_train, y_train)
            train_error_summary = tf.summary.scalar("TrainError", train_error)
            test_error = self.loss(x_test, y_test)
            test_error_summary = tf.summary.scalar("ValError", test_error)
        with tf.name_scope("accuracy"):
            train_accuracy = self.accuracy(self.prediction(x_train), y_train)
            train_accuracy_summary = tf.summary.scalar("TrainAccuracy", train_accuracy)
            test_accuracy = self.accuracy(self.prediction(x_test), y_test)
            test_accuracy_summary = tf.summary.scalar("ValAccuracy", test_accuracy)
        # Update the parameters via gradient descent on the training loss
        gradient = tf.gradients(train_error, [self.w])
        updated_w = tf.assign(self.w, self.w - learningrate * gradient[0])

        return updated_w, train_error, test_error, train_accuracy, test_accuracy, train_error_summary, \
        test_error_summary, train_accuracy_summary, test_accuracy_summary

In [None]:
def train_LogisticRegressionClass(lr, epochs, frecprint, x_train, x_test, labels_train, labels_test, batch_size, lambda_parameter, regularization):
    """Train the logistic-regression model with mini-batch gradient descent.

    Progress is printed and written as TensorBoard summaries every
    ``frecprint`` steps, under a log directory named after the hyperparameters.

    Parameters
    ----------
    lr : float
        Learning rate.
    epochs : int
        Number of passes over the training data.
    frecprint : int
        Log every ``frecprint`` mini-batches; must be non-zero.
    x_train, x_test
        Two-dimensional feature arrays for training and validation.
    labels_train, labels_test
        Two-dimensional label arrays for training and validation.
    batch_size : int
        Number of rows per mini-batch.
    lambda_parameter : float
        Regularization strength.
    regularization : str
        Regularization type passed to the model.

    Returns
    -------
    numpy.ndarray
        The weights after the final training step.
    """
    # Experiment name, used as the TensorBoard log directory
    string = './experiments/LogRegression'+ datetime.datetime.now().strftime("%Y%m%d-%H%M%S") +"_lr="+str(lr)+ "_epochs="+str(epochs) + "_batchsize=" \
    + str(batch_size) + '_lambda_parameters=' + str(lambda_parameter) + "_regularization=" + str(regularization)

    n_train = labels_train.shape[0]
    total_steps = int((n_train / batch_size) * epochs)
    # Range of valid batch start offsets; it is <= 0 when a single batch covers
    # the whole training set, in which case every batch starts at row 0
    # (the original modulo raised ZeroDivisionError there).
    offset_range = n_train - batch_size

    g = tf.Graph()
    with g.as_default():
        # The model class defined in this notebook is `logistic_regression`
        modelo = logistic_regression(x_train, lambda_parameter, regularization)
        # Placeholders that feed data into the graph
        with tf.name_scope("train_tensors"):
            tensor_x_train = tf.placeholder(tf.float32, [None,x_train.shape[1]], "tensor_x_train")
            tensor_labels_train = tf.placeholder(tf.float32, [None,labels_train.shape[1]], "tensor_labels_train")
        with tf.name_scope("val_tensors"):
            tensor_x_test = tf.placeholder(tf.float32, [x_test.shape[0], x_test.shape[1]], "tensor_x_val")
            tensor_labels_test = tf.placeholder(tf.float32, [labels_test.shape[0], labels_test.shape[1]], "tensor_labels_val")
        # Training step and monitoring ops
        update_parameters = modelo.update(tensor_x_train, tensor_labels_train, tensor_x_test, tensor_labels_test, lr)

        # Writer for TensorBoard
        writer = tf.summary.FileWriter(string, g)
        weights = None
        try:
            with tf.train.MonitoredSession() as session:
                for i in range(total_steps + 1):
                    # Sliding mini-batch window over the training rows
                    offset = (i * batch_size) % offset_range if offset_range > 0 else 0
                    batch_data = x_train[offset:(offset + batch_size),]
                    batch_labels = labels_train[offset:(offset + batch_size),]
                    feed_dict = {tensor_x_train:batch_data, tensor_labels_train:batch_labels,tensor_x_test:x_test, tensor_labels_test:labels_test}
                    # One gradient-descent step
                    training = session.run(update_parameters, feed_dict = feed_dict)
                    # training[0] is the value just assigned to the weights, so the
                    # returned weights always come from the last step, not from the
                    # last logged step.
                    weights = training[0]
                    if (i)%frecprint == 0:
                        # Send the four scalar summaries to TensorBoard
                        writer.add_summary(training[5], i)
                        writer.add_summary(training[6], i)
                        writer.add_summary(training[7], i)
                        writer.add_summary(training[8], i)
                        print("Mini-batch:", i, "train error:", training[1], "train accuracy:", training[3])
                        print("Epoch:", int(i//(labels_train.shape[0] / batch_size)+1), "validation error:", training[2], "validation accuracy:", training[4])
                        print("-------------------------------------------------------------------------")
        finally:
            # Previously unreachable after `return`; always flush and close the event file
            writer.close()

        return weights