## Carga de librerías

In [1]:
seed = 161
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regresion lineal
from sklearn.linear_model import LinearRegression

# Importar/ Exportar modelos
from joblib import dump, load

# Metricas
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error

# Librerías para pipeline
# Composicion de pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

## Carga de datos y creación dataframe

In [2]:
ruta = 'example_data/202220_Laboratorio_3_data_university_admission_train.csv'
df = pd.read_csv(ruta)
print("Número de filas: ", df.shape[0])
print("Número de columnas: ", df.shape[1])
df.sample(5)

Número de filas:  1569
Número de columnas:  10


Unnamed: 0.1,Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
273,273,122,290,103,3,1.95,2.45,7.5,0,48.52
705,705,66,274,112,4,1.98,0.14,9.51,0,55.0
647,647,160,297,100,1,1.62,2.0,7.9,0,52.0
729,729,218,324,107,4,3.16,3.24,8.99,1,
1087,1087,273,311,88,1,3.08,1.5,8.33,0,50.5


In [3]:
# Rename column 'LOR ' to 'LOR'
df.rename(columns={'LOR ':'LOR'}, inplace=True)

## Pipeline

### Preprocesamiento

In [4]:
selected_columns = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']

pre_process = [
    ('initial',ColumnTransformer([
        ("selector", 'passthrough',selected_columns), 
        ('imputer', SimpleImputer(strategy='mean'), selected_columns),
        ('binarizer', 'passthrough', ['Research']),
        ('scaler', MinMaxScaler(), ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', ]),
    ])),
]

### Creación del modelo

In [5]:
# Creación del modelo
model = [('model', LinearRegression())]

### Construcción del Pipeline

In [6]:
# Create pipe with pre_process, model and export
pipe = Pipeline(pre_process + model)

## Prueba del Pipeline

### Creación de un nuevo modelo

In [7]:
df.dropna(subset=['Admission Points'], inplace=True)

# Drop rows with null values in Admission Points
df = df.dropna(subset=['Admission Points'])

# Creación de la variable objetivo y de las variables explicativas
y = df['Admission Points']
X = df.drop(['Admission Points'], axis=1)

# Prueba del pipeline con el dataframe df
pipe.fit(X,y)

In [8]:
y_pred = pipe.predict(X)

# Get the model metrics
print("R^2: ", pipe.score(X,y))
print("MSE: ", mse(y, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y, y_pred)*100, "%")

R^2:  0.7087978349898327
MSE:  107.87079801961885
MAPE:  13.143109404590863 %


## Predicción con nuevos datos

In [9]:
ruta = 'example_data/202220_Laboratorio_3_data_university_admission_test.csv'
df_prueba = pd.read_csv(ruta)
print("Número de filas: ", df.shape[0])
print("Número de columnas: ", df.shape[1])

# Rename column 'LOR ' to 'LOR'
df_prueba.rename(columns={'LOR ':'LOR'}, inplace=True)

df_prueba.sample(5)

Número de filas:  1504
Número de columnas:  10


Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
4,327,299,100,3,2.0,2.0,8.02,0
127,489,322,112,3,3.0,4.0,8.62,1
134,68,316,107,2,3.5,3.5,8.64,1
164,118,290,103,3,2.07,2.57,7.45,0
152,419,306,101,3,2.66,2.56,7.53,0


In [10]:
y_pred = pipe.predict(df_prueba)
# Add the predictions to the dataframe
df_prueba['Admission Points'] = y_pred
df_prueba.sample(5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
4,327,299,100,3,2.0,2.0,8.02,0,58.057804
135,435,306,103,3,3.5,3.0,8.21,0,62.168001
54,252,316,99,2,2.5,3.0,9.0,0,60.305738
74,35,331,112,5,4.0,5.0,9.8,1,98.554686
81,183,299,100,2,3.0,3.5,7.88,0,53.415155


## Exportar el pipeline

In [11]:
# Usamos la lbreria joblib
filename = './college_application_api_app/assets/pipe.joblib'
# Se guarda
dump(pipe, filename) 

['./college_application_api_app/assets/pipe.joblib']