## Carga de librerías

In [15]:
seed = 161
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regresion lineal
from sklearn.linear_model import LinearRegression

# Importar/ Exportar modelos
from joblib import dump, load

# Metricas
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error

# Librerías para pipeline
# Composicion de pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

## Carga de datos y creación dataframe

In [16]:
ruta = 'example_data/202220_Laboratorio_3_data_university_admission_train.csv'
df = pd.read_csv(ruta)
print("Número de filas: ", df.shape[0])
print("Número de columnas: ", df.shape[1])
df.sample(5)

Número de filas:  1569
Número de columnas:  10


Unnamed: 0.1,Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
142,142,228,312,110,2,3.5,3.55,8.53,0,64.0
313,313,486,311,101,2,2.5,3.5,8.34,1,70.0
733,733,417,307,96,3,2.47,2.53,7.66,0,45.53
870,870,189,296,101,5,4.5,3.5,9.36,1,93.0
111,111,384,300,100,3,3.0,3.5,8.26,0,62.0


## Pipeline

### Preprocesamiento

In [22]:
selected_columns = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA', 'Research']

pre_process = ColumnTransformer(remainder='passthrough',
                                transformers=[('drop_columns', 'drop', ['Unnamed: 0', 'Serial No.']),
                                # Se eliminan las filas con valores nulos de la columna 'Admission Points'
                                ('drop_rows', 'drop', df[df['Admission Points'].isnull()].index),
                                # En las columnas "GRE Score","University Rating","SOP","LOR" ,"CGPA","Research" se imputan los valores nulos con la media
                                ('imputer', SimpleImputer(strategy='mean'), ['GRE Score','University Rating','SOP','LOR' ,'CGPA','Research']),
                                # Se cambia la columna "Research" por una columna binaria
                                ('binarizer', 'passthrough', ['Research']),
                                # Se normalizan entre 0 y 1 las columnas "GRE Score","University Rating","SOP","LOR" ,"CGPA" y "TOEFL Score"
                                ('scaler', MinMaxScaler(), ['GRE Score','University Rating','SOP','LOR' ,'CGPA','TOEFL Score'])          
])

### Creación del modelo

In [23]:
# Creación del modelo
model = LinearRegression()

### Construcción del Pipeline

In [24]:
pipe = Pipeline(steps=[('pre_process', pre_process), ('model', model)])

## Prueba del Pipeline

In [25]:
# Creación de la variable objetivo y de las variables explicativas
y = df['Admission Points']
X = df.drop(['Admission Points'], axis=1)

pipe = pipe.fit(X, y)

ValueError: all features must be in [0, 8] or [-9, 0]

In [1]:
# Visualizamos la regresion lineal en cada dimension
f, axs = plt.subplots(1, len(selected_columns), sharey=True, figsize = (12,4))

for i, col in enumerate(selected_columns):
    pendiente = pipe['model'].coef_[i]
    intercepto = pipe['model'].intercept_

    sns.scatterplot(x=col, y='Admission Points', data=df, ax=axs[i])
    sns.lineplot(x=df[col], y=pendiente*df[col] + intercepto, color='red', ax=axs[i])
    axs[i].set_title(col)
    axs[i].set_xlabel('')
    axs[i].set_ylabel('Admission Points')

NameError: name 'plt' is not defined

## Exportar el pipeline

In [104]:
# Usamos la lbreria joblib
filename = './api_app/assets/pipeline_endp2.joblib'
# Se guarda
dump(model, filename) 

['regresion.joblib']