# __Prueba - SQL para Data Science__

Desde __OkCupid__ -aplicación de citas- solicitan el desarrollo de una serie de modelos
predictivos.

Los datos a utilizar se registraron en base a una serie de perfiles públicos dentro de 25
millas de la ciudad de San Francisco activos durante el 2011.

__*Caveat*__: Los permisos para obtener estos datos provinieron del presidente y co-fundador de
OkCupid, Christian Rudder, con la condición que se mantuvieran públicos.


## Requerimientos

### Parte 1: Registro de los archivos en la base de datos.

- Generar una nueva base de datos con la siguiente nomenclatura: apellido_nombre.
- Importar en tablas los archivos `train_cupid.csv` y `test_cupid.csv` a un motor
Postgres, __implementando sólo la librería `psycopg2`__. Las tablas deben contener los
nombres de las columnas y el total de los registros presente en cada archivo.

In [31]:
import psycopg2
import pandas as pd
import numpy as np
import csv
from sqlalchemy import create_engine
import joblib
# Creamos la conexión a la base de datos
conn = psycopg2.connect("user=postgres password=******")
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

In [32]:
# Creamos la base de datos
cur = conn.cursor()
cur.execute("CREATE DATABASE meneses_benjamin")
conn.commit()
# Cerramos y nos conectamos a la base de datos creada
cur.close()
conn.close()
conn = psycopg2.connect("dbname=meneses_benjamin user=postgres password=******")

In [33]:
# Arreglo con nombre de las columnas
csvHeader = "age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,aquarius,cancer,sagittarius,asian,hispanic / latin,black,indian,pacific islander,native american,middle eastern,colorado,new york,oregon,arizona,hawaii,montana,wisconsin,virginia,spain,nevada,illinois,vietnam,ireland,louisiana,michigan,texas,united kingdom,massachusetts,north carolina,idaho,mississippi,new jersey,florida,minnesota,georgia,utah,washington,west virginia,connecticut,tennessee,rhode island,district of columbia,canada,missouri,germany,pennsylvania,netherlands,switzerland,mexico,ohio,agnosticism,atheism,catholicism,buddhism,judaism,hinduism,islam,pro_dogs,pro_cats,spanish,chinese,french,german,single,seeing_someone,available,employed,income_between_25_50,income_between_50_75,income_over_75,drugs_often,drugs_sometimes,drinks_not at all,drinks_often,drinks_rarely,drinks_socially,drinks_very often,orientation_gay,orientation_straight,sex_m,smokes_sometimes,smokes_trying to quit,smokes_when drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university"
tableCols = list(map(lambda x: x.replace(' ', '_').replace('/', ''), csvHeader.split(',')))
# Construimos las tablas
trainSql = "CREATE TABLE train (id SERIAL PRIMARY KEY"
testSql = "CREATE TABLE test (id SERIAL PRIMARY KEY"
for i, col in enumerate(tableCols):
    trainSql += ", "
    testSql += ", "
    if i > 1:
        trainSql += col + " BOOLEAN NOT NULL"
        testSql += col + " BOOLEAN NOT NULL"
    else:
        trainSql += col + " integer NOT NULL"
        testSql += col + " integer NOT NULL"
trainSql += ");"
testSql += ");"

In [34]:
# Creamos el cursor
cur = conn.cursor()
# Creamos la tabla train y conservamos los cambios
cur.execute(trainSql)
conn.commit()
# Creamos la tabla test y conservamos los cambios
cur.execute(testSql)
conn.commit()
# Cerramos el cursor
cur.close()

In [35]:
# Funcion que lee el csv y lo pobla en la tabla train
def csvToTable(fileName, columns, tableName):
    cur = conn.cursor()
    with open(fileName, 'r') as file:
        reader = csv.reader(file)
        next(reader)
        for row in reader:
            cur.execute(f"INSERT INTO {tableName} ({', '.join(columns)}) VALUES ({', '.join(['%s' for i in columns])})", [bool(int(float(val))) if i > 1 else int(float(val)) for i, val in enumerate(row)])
    conn.commit()
    cur.close()

In [36]:
try:
    csvToTable('train_cupid.csv', tableCols, 'train')
    csvToTable('test_cupid.csv', tableCols, 'test')
except (Exception, psycopg2.DatabaseError) as error:
    print("Error: unable to insert data")
    print(error)


### Parte 2: Entrenamiento de modelos (3.5 Puntos)
- Ingestar la tabla de training __mediante__ `psycopg2` para el posterior entrenamiento del
modelo.
- Entrenar los siguientes modelos (sin necesidad de ajustar por hiper parámetros):
   - `GradientBoostingClassifier`, `AdaBoostClassifer`,
    `RandomForestClassifier`, `SVC`, `DecisionTreeClassifier`,
    `LogisticRegression`, `BernoulliNB`.
   - Existen tres vectores objetivos a evaluar: single, seeing someone y available.
- Serializar el objeto y preservarlo por cada combinación de modelo entrenado y vector
objetivo.


In [37]:
# importamos los modelos
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB


In [38]:
# Conseguimos la data de entrenamiento
cur = conn.cursor()
trainData = []
try:
    cur.execute("SELECT * FROM train;")
    trainData = cur.fetchall()
    trainData = pd.DataFrame(trainData, columns=['id', *tableCols])
    trainData = pd.concat([trainData.loc[:, ['age', 'height']], pd.get_dummies(trainData.loc[:, 'virgo':]).astype(np.int8)], axis=1)
    cur.close()
except (Exception, psycopg2.DatabaseError) as error:
    print("Error: unable to fetch data")
    print(error)
    conn.rollback()
    cur.close()

In [39]:
trainData.head()

Unnamed: 0,age,height,virgo,taurus,scorpio,pisces,libra,leo,gemini,aries,...,orientation_straight,sex_m,smokes_sometimes,smokes_trying_to_quit,smokes_when_drinking,smokes_yes,body_type_overweight,body_type_regular,education_high_school,education_undergrad_university
0,35,70,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
1,38,68,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,0
2,23,71,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,1,0,1
3,29,66,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
4,29,67,0,1,0,0,0,0,0,0,...,1,1,0,0,0,0,0,1,0,1


In [40]:
# Entrenamos los modelos para cada vector objetivo
targets = ['single', 'seeing_someone', 'available']
modelsHash = {}
X_train = trainData.drop(columns=targets)
for col in targets:
    modelsHash[col] = {}
    y_train = trainData.loc[:, col]
    modelsHash[col]['DecisionTree'] = DecisionTreeClassifier(random_state=19137).fit(X_train, y_train)
    modelsHash[col]['RandomForest'] = RandomForestClassifier(random_state=19137).fit(X_train, y_train)
    modelsHash[col]['AdaBoost'] = AdaBoostClassifier(random_state=19137).fit(X_train, y_train)
    modelsHash[col]['GradientBoosting'] = GradientBoostingClassifier(random_state=19137).fit(X_train, y_train)
    modelsHash[col]['SVC'] = SVC(random_state=19137).fit(X_train, y_train)
    modelsHash[col]['LogisticRegression'] = LogisticRegression(random_state=19137, penalty='l2', solver='newton-cg', max_iter=500).fit(X_train, y_train)
    modelsHash[col]['BernoulliNB'] = BernoulliNB().fit(X_train, y_train)


### Parte 3: Exportación de predicciones (3.5 Puntos)
- Ingestar la tabla de testing __mediante__ psycopg2 para la posterior predicción del
modelo.
- __En base a los objetos serializados__, predecir y evaluar cuatro queries específicas:
   - __Query 1:__ 'atheism', 'asian', 'employed', 'pro_dogs', 'chinese'.
   - __Query 2:__ 'income_over_75', 'french', 'german','orientation_straight', 'new york'.
   - __Query 3:__ 'education_undergrad_university', 'body_type_regular', 'pro_dogs',
'employed'.
   - __Query 4:__ 'taurus', 'indian', 'washington', 'income_between_50_75', 'hinduism'.
- Cada una de estas queries específicas debe ser registrada en la base de datos.
- La base de datos creada debe contener las tablas:
   - 2 que representan a training y testing.
   - 84 que representan a cada una de las combinaciones entre modelo, vector y
query específica.
- A modo de referencia, la base de datos creada debe contener 86 tablas en total.


In [41]:
# Conseguimos la data de test
cur = conn.cursor()
testData = []
try:
    cur.execute("SELECT * FROM train;")
    testData = cur.fetchall()
    testData = pd.DataFrame(testData, columns=['id', *tableCols])
    testData = pd.concat([testData.loc[:, ['age', 'height']], pd.get_dummies(testData.loc[:, 'virgo':]).astype(np.int8)], axis=1)
    cur.close()
except (Exception, psycopg2.DatabaseError) as error:
    print("Error: unable to fetch data")
    print(error)
    conn.rollback()
    cur.close()
X_test = testData.drop(columns=targets)
# Definimos las utilidades para crear las tablas
engine = create_engine('postgresql://postgres:******@localhost:5432/meneses_benjamin')
def groupQuery(df, cols, target, tableName):
    df = df.groupby(cols)[target].mean()
    df.to_sql(tableName, con=engine, if_exists='replace')
    return df
# Iteramos sobre los modelos para cada vector objetivo
for col in targets:
    y_test = testData.loc[:, col]
    print(f"{col}")
    for model in modelsHash[col]:
        print(f"\t{model}")
        print(f"\t\t{modelsHash[col][model].score(X_test, y_test)}")
        model_name = f'{col}_{model}'
        testData[model_name] = modelsHash[col][model].predict(X_test)
        joblib.dump(modelsHash[col][model], f'models/{model_name}.pkl')
        groupQuery(testData, ['atheism', 'asian', 'employed', 'pro_dogs', 'chinese'], model_name, f'query1_{model_name}')
        groupQuery(testData, ['income_over_75', 'french', 'german', 'orientation_straight', 'new_york'], model_name, f'query2_{model_name}')
        groupQuery(testData, ['education_undergrad_university', 'body_type_regular', 'pro_dogs', 'employed'], model_name, f'query3_{model_name}')
        groupQuery(testData, ['taurus', 'indian', 'washington', 'income_between_50_75', 'hinduism'], model_name, f'query4_{model_name}')

single
	DecisionTree
		0.9992530252477466
	RandomForest
		0.9992530252477466
	AdaBoost
		0.9200239031920721
	GradientBoosting
		0.9218166425974802
	SVC
		0.9198247099248046
	LogisticRegression
		0.9194263233902694
	BernoulliNB
		0.9136995169563269
seeing_someone
	DecisionTree
		0.9996514117822818
	RandomForest
		0.9996514117822818
	AdaBoost
		0.9586175987251631
	GradientBoosting
		0.9588167919924306
	SVC
		0.9585678004083462
	LogisticRegression
		0.9585678004083462
	BernoulliNB
		0.9581196155569942
available
	DecisionTree
		0.9996514117822818
	RandomForest
		0.9996514117822818
	AdaBoost
		0.9586175987251631
	GradientBoosting
		0.9588167919924306
	SVC
		0.9585678004083462
	LogisticRegression
		0.9585678004083462
	BernoulliNB
		0.9581196155569942


In [42]:
conn.close()
engine.dispose()