## Carga de librerías

In [1]:
seed = 161
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regresion lineal
from sklearn.linear_model import LinearRegression

# Importar/ Exportar modelos
from joblib import dump, load

# Metricas
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error

# Librerías para pipeline
# Composicion de pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

## Carga de datos y creación dataframe

In [2]:
ruta = 'example_data/202220_Laboratorio_3_data_university_admission_train.csv'
df = pd.read_csv(ruta)
print("Número de filas: ", df.shape[0])
print("Número de columnas: ", df.shape[1])
df.sample(5)

Número de filas:  1569
Número de columnas:  10


Unnamed: 0.1,Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
750,750,373,336,83,4,0.63,4.58,9.62,1,95.0
364,364,330,297,94,1,1.12,4.9,7.41,0,45.53
908,908,59,306,82,1,2.42,2.0,7.66,0,39.06
382,382,499,312,103,4,4.0,0.05,8.43,0,73.0
335,335,459,312,77,1,0.27,3.0,8.53,1,69.0


In [3]:
# Rename column 'LOR ' to 'LOR'
df.rename(columns={'LOR ':'LOR'}, inplace=True)

## Pipeline

### Preprocesamiento

In [4]:
selected_columns = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']

pre_process = [
    ('initial',ColumnTransformer([
        ("selector", 'passthrough',selected_columns), 
        ('imputer', SimpleImputer(strategy='mean'), ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA']),
        ('binarizer', 'passthrough', ['Research']),
        ('scaler', MinMaxScaler(), ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA']),
    ])),
]

### Creación del modelo

In [5]:
# Creación del modelo
model = [('model', LinearRegression())]

### Construcción del Pipeline

In [6]:
# Create pipe with pre_process, model and export
pipe = Pipeline(pre_process + model)

## Prueba del Pipeline

### Creación de un nuevo modelo

In [7]:
df.dropna(subset=['Admission Points'], inplace=True)

# Drop rows with null values in Admission Points
df = df.dropna(subset=['Admission Points'])

# Creación de la variable objetivo y de las variables explicativas
y = df['Admission Points']
X = df.drop(['Admission Points'], axis=1)

# Prueba del pipeline con el dataframe df
pipe.fit(X,y)

In [8]:
y_pred = pipe.predict(X)

# Get the model metrics
print("R^2: ", pipe.score(X,y))
print("MSE: ", mse(y, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y, y_pred)*100, "%")

R^2:  0.7057974212287683
MSE:  108.98224932625243
MAPE:  13.225994783424674 %


## Predicción con nuevos datos

In [9]:
ruta = 'example_data/202220_Laboratorio_3_data_university_admission_test.csv'
df_prueba = pd.read_csv(ruta)
print("Número de filas: ", df.shape[0])
print("Número de columnas: ", df.shape[1])

# Rename column 'LOR ' to 'LOR'
df_prueba.rename(columns={'LOR ':'LOR'}, inplace=True)

df_prueba.sample(5)

Número de filas:  1504
Número de columnas:  10


Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
148,93,300,97,2,3.46,3.11,7.83,0
33,340,295,96,2,1.8,1.85,7.5,0
101,277,329,113,5,5.0,4.5,9.45,1
66,460,329,113,4,4.0,3.5,9.36,1
1,49,321,110,3,3.5,5.0,8.85,1


In [10]:
y_pred = pipe.predict(df_prueba)
# Add the predictions to the dataframe
df_prueba['Admission Points'] = y_pred
df_prueba.sample(5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
169,409,297,101,3,2.0,4.0,7.67,1,68.159601
75,110,304,103,5,5.0,4.0,8.64,0,78.071815
117,28,295,93,1,2.0,1.98,7.23,0,40.981327
147,256,307,110,4,4.0,4.5,8.37,0,71.260857
64,380,311,105,2,2.0,2.93,7.84,0,53.772818


## Exportar el pipeline

In [11]:
# Usamos la lbreria joblib
filename = './college_application_api_app/assets/pipe.joblib'
# Se guarda
dump(pipe, filename) 

['./college_application_api_app/assets/pipe.joblib']