## Carga de librerías

In [23]:
seed = 161
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regresion lineal
from sklearn.linear_model import LinearRegression

# Importar/ Exportar modelos
from joblib import dump, load

# Metricas
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error

# Librerías para pipeline
# Composicion de pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

## Carga de datos y creación dataframe

In [24]:
ruta = 'example_data/202220_Laboratorio_3_data_university_admission_train.csv'
df = pd.read_csv(ruta)
print("Número de filas: ", df.shape[0])
print("Número de columnas: ", df.shape[1])
df.sample(5)

Número de filas:  1569
Número de columnas:  10


Unnamed: 0.1,Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
76,76,397,325,107,3,1.5,3.5,9.11,1,84.0
112,112,193,335,117,5,4.79,4.88,9.52,1,93.5
1360,1360,19,290,113,3,4.0,4.67,8.8,0,63.0
108,108,409,297,108,3,2.0,4.0,7.58,1,57.0
441,441,338,332,118,5,5.0,5.0,8.47,1,


In [25]:
# Rename column 'LOR ' to 'LOR'
df.rename(columns={'LOR ':'LOR'}, inplace=True)

## Pipeline

### Preprocesamiento

In [26]:
selected_columns = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']

pre_process = [
    ('initial',ColumnTransformer([
        ("selector", 'passthrough',selected_columns), 
        ('imputer', SimpleImputer(strategy='mean'), selected_columns),
        ('binarizer', 'passthrough', ['Research']),
        ('scaler', MinMaxScaler(), ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', ]),
    ])),
]

### Creación del modelo

In [27]:
# Creación del modelo
model = [('model', LinearRegression())]

### Construcción del Pipeline

In [28]:
# Create pipe with pre_process, model and export
pipe = Pipeline(pre_process + model)

## Prueba del Pipeline

### Creación de un nuevo modelo

In [29]:
df.dropna(subset=['Admission Points'], inplace=True)

# Drop rows with null values in Admission Points
df = df.dropna(subset=['Admission Points'])

# Creación de la variable objetivo y de las variables explicativas
y = df['Admission Points']
X = df.drop(['Admission Points'], axis=1)

# Prueba del pipeline con el dataframe df
pipe.fit(X,y)

In [30]:
y_pred = pipe.predict(X)

# Get the model metrics
print("R^2: ", pipe.score(X,y))
print("MSE: ", mse(y, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y, y_pred)*100, "%")

R^2:  0.708794068986631
MSE:  107.87219307027276
MAPE:  13.143131985198316 %


## Predicción con nuevos datos

In [35]:
ruta = 'example_data/202220_Laboratorio_3_data_university_admission_test.csv'
df_prueba = pd.read_csv(ruta)
print("Número de filas: ", df.shape[0])
print("Número de columnas: ", df.shape[1])

# Rename column 'LOR ' to 'LOR'
df_prueba.rename(columns={'LOR ':'LOR'}, inplace=True)

df_prueba.sample(5)

Número de filas:  1504
Número de columnas:  10


Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
99,6,330,115,5,4.5,3.0,9.34,1
36,443,331,116,4,4.5,4.5,9.44,1
97,125,301,106,4,2.5,3.0,8.47,0
85,123,310,106,4,1.5,2.5,8.36,0
118,176,320,111,4,4.5,3.5,8.87,1


In [36]:
y_pred = pipe.predict(df_prueba)
# Add the predictions to the dataframe
df_prueba['Admission Points'] = y_pred
df_prueba.sample(5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
67,48,339,119,5,4.5,4.0,9.7,0,88.755913
22,444,321,114,5,4.5,4.5,9.16,1,94.197468
111,376,302,99,2,2.08,2.48,7.65,0,50.880356
109,33,335,116,4,3.31,4.65,9.52,1,91.217863
96,500,327,113,4,4.5,4.5,9.04,0,77.844746


## Exportar el pipeline

In [37]:
# Usamos la lbreria joblib
filename = './api_app/assets/pipe.joblib'
# Se guarda
dump(pipe, filename) 

['./api_app/assets/pipe.joblib']