In [1]:

import pandas 
import os
import matplotlib.pyplot as mathPlot
import seaborn as sea
import difflib 
#define a class to help us with the process
class ModelFileHelper(object):
    """Load a CSV file into a DataFrame and help describe it.

    The constructor sets two attributes: ``csvFile`` is the DataFrame
    returned by ``pandas.read_csv`` and ``fileName`` is the path exactly as
    it was passed in.
    """

    def __init__(self, csvFile):
        """Read ``csvFile`` with pandas and remember the path it came from."""
        self.csvFile = pandas.read_csv(csvFile)
        self.fileName = csvFile

    def getDescription(self):
        """Return the result of ``DataFrame.describe()`` for the loaded data."""
        frame = self.csvFile
        return frame.describe()

    def getModelTypeDetail(self):
        """Return the dtypes of the loaded data relabelled in readable form."""
        column_types = self.csvFile.dtypes
        return self.__translateTypestoHumanReadable(column_types)

    def findDifferences(self, other):
        """Return a list comparing the columns and types of two loaded CSVs.

        The first entries are the ``difflib.Differ`` comparison of both type
        listings (this helper first, ``other`` second); they are followed by
        a heading and one row/column count line per file.
        """
        own_listing = self.getModelTypeDetail().to_string().splitlines(True)
        other_listing = other.getModelTypeDetail().to_string().splitlines(True)
        report = list(difflib.Differ().compare(own_listing, other_listing))

        report.append("Comparativa de tamaños: ")
        for helper in (self, other):
            # shape[0:1] / shape[1:2] are one-element tuples; the clean-up
            # helper turns them into the bare number as text.
            row_count = self.__tuplaCleanUp(helper.csvFile.shape[0:1])
            column_count = self.__tuplaCleanUp(helper.csvFile.shape[1:2])
            report.append(helper.fileName + " Filas:" + row_count + " Columnas:" + column_count)
        return report

    def __tuplaCleanUp(self, tupla):
        """Return ``str(tupla)`` with parentheses and commas removed."""
        cleaned = str(tupla)
        for unwanted in ('(', ')', ','):
            cleaned = cleaned.replace(unwanted, '')
        return cleaned

    def __translateTypestoHumanReadable(self, text):
        """Apply the dtype-name to label replacements to ``text``, in order."""
        labels = (
            ("int64", "Numero"),
            ("object", "Cadena de texto AlfaNumerica"),
            ("float64", "Numero (largo)"),
        )
        translated = text
        for type_name, label in labels:
            translated = translated.replace(type_name, label)
        return translated

# Load the train and test datasets and keep them in a dictionary keyed by role.
files = {
    "train": ModelFileHelper("dataInputs/train.csv"),
    "test": ModelFileHelper("dataInputs/test.csv"),
}

# Describe the column types of both files.
print ("*****************************************************************")
print ("Análisis comparativo de tipos")
print ("*****************************************************************")
for valor in files.values():
    print ("...............................................................")
    print ("File: " + valor.fileName)
    print (valor.getModelTypeDetail())
print ("----------------------------------------------------------------")
print ("Buscando diferencias entre tipos: - Significa eliminado, + significa añadido:")
# Iterate over the report lines directly: wrapping the list in enumerate()
# made print() emit "(index, 'line\n')" tuple reprs instead of the lines.
for listItem in files["train"].findDifferences(files["test"]):
    # Diff lines keep their trailing newline; strip it so print() does not
    # produce a blank line after each entry.
    print (listItem.rstrip("\n"))

# Check for null fields: info() reports non-null counts per column and
# describe() summarises the numeric columns.
print ("*****************************************************************")
print ("Buscando campos vacios:")
print ("*****************************************************************")
for valor in files.values():
    print ("...............................................................")
    print ("File: " + valor.fileName)
    valor.csvFile.info()
    print (valor.getDescription())

 
 

*****************************************************************
Análisis comparativo de tipos
*****************************************************************
...............................................................
File: dataInputs/train.csv
PassengerId                          Numero
Survived                             Numero
Pclass                               Numero
Name           Cadena de texto AlfaNumerica
Sex            Cadena de texto AlfaNumerica
Age                          Numero (largo)
SibSp                                Numero
Parch                                Numero
Ticket         Cadena de texto AlfaNumerica
Fare                         Numero (largo)
Cabin          Cadena de texto AlfaNumerica
Embarked       Cadena de texto AlfaNumerica
dtype: object
...............................................................
File: dataInputs/test.csv
PassengerId                          Numero
Pclass                               Numero
Name           Cadena de te

In [8]:
# Correlations between the numeric columns of the training data.
print ("*****************************************************************")
print ("Limpieza del modelo: análisis de las correlaciones ")
print ("*****************************************************************")
# Select the numeric columns explicitly before correlating. Since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on string columns such as Name or Sex; select_dtypes works on old and new
# pandas alike.
files["train"].csvFile.select_dtypes(include="number").corr()




*****************************************************************
Limpieza del modelo: análisis de las correlaciones 
*****************************************************************


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [12]:
# Remove the Cabin and Embarked columns from every loaded dataset.
print ("*****************************************************************")
print ("Limpieza del modelo: Eliminando campos Irrelevantes ")
print ("*****************************************************************")
for valor in files.values():
    print ("...............................................................")
    print ("File: " + valor.fileName)
    for columna in ("Cabin", "Embarked"):
        print ("Removing Column " + columna + ": ")
        # errors="ignore" keeps this cell re-runnable: a second execution
        # would otherwise raise KeyError because the column is already gone.
        valor.csvFile = valor.csvFile.drop(columns=[columna], errors="ignore")
    print (valor.getModelTypeDetail())


*****************************************************************
Limpieza del modelo: Eliminando campos Irrelevantes (Cabin):
*****************************************************************
...............................................................
File: dataInputs/train.csv
PassengerId                          Numero
Survived                             Numero
Pclass                               Numero
Name           Cadena de texto AlfaNumerica
Sex            Cadena de texto AlfaNumerica
Age                          Numero (largo)
SibSp                                Numero
Parch                                Numero
Ticket         Cadena de texto AlfaNumerica
Fare                         Numero (largo)
Embarked       Cadena de texto AlfaNumerica
dtype: object
...............................................................
File: dataInputs/test.csv
PassengerId                          Numero
Pclass                               Numero
Name           Cadena de texto AlfaNumer

In [None]:
# Infer missing Age data. Only the section banner is printed so far.
print ("*****************************************************************")
print ("Limpieza del modelo: Inferir campos de Edad vacios:")   
print ("*****************************************************************")
# NOTE(review): this bare expression only evaluates the "train" helper; no
# age values are filled in by this cell yet.
files.get("train")
# Extract the population groups. We are going to group by ... (TODO: the grouping key is not stated here)