## Carga de librerías

In [98]:
seed = 161
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regresion lineal
from sklearn.linear_model import LinearRegression

# Importar/ Exportar modelos
from joblib import dump, load

# Metricas
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_percentage_error

# q-q plots
import scipy.stats as stats

## Carga de datos y creación dataframe

In [99]:
ruta = 'data/processed_data.csv'
df = pd.read_csv(ruta)
print("Número de filas: ", df.shape[0])
print("Número de columnas: ", df.shape[1])
df.sample(5)

Número de filas:  1492
Número de columnas:  8


Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
227,287,100,3,2.5,3.0,7.45,0,52.0
427,323,116,3,4.81,4.51,8.25,1,86.43
51,299,115,2,1.01,2.35,7.59,0,43.62
960,320,110,4,4.35,3.96,8.95,1,85.5
973,295,119,2,2.97,3.47,9.6,0,46.0


### Estandarización de datos

In [100]:
df = (df-df.min())/(df.max()-df.min())
df.describe()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
count,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0,1492.0
mean,0.610081,0.637846,0.465985,0.592972,0.602602,0.586552,0.465818,0.511277
std,0.214492,0.244508,0.308972,0.247917,0.241887,0.211527,0.498997,0.291625
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.493976,0.529412,0.25,0.39759,0.409,0.433901,0.0,0.222222
50%,0.626506,0.666667,0.5,0.598394,0.6,0.58377,0.0,0.52381
75%,0.774096,0.823529,0.75,0.799197,0.8,0.769634,1.0,0.793651
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Creación del modelo

In [1]:
# Divición variable objetivo y variables predictoras
X = df.drop('Admission Points', axis=1)
Y = df['Admission Points']

# División datos de entrenamiento y prueba
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Creación del modelo
model = LinearRegression()
model.fit(X_train, Y_train)

NameError: name 'df' is not defined

In [102]:
# Se imprime el intercepto de la regresión
print("Intercepto: ", model.intercept_)
# Se imprimen los coeficientes de la regresión con su respectiva variable
for idx, col_name in enumerate(X_train.columns):
    print("El coeficiente para {} es {}".format(col_name, model.coef_[idx]))


Intercepto:  -0.1183142894816871
El coeficiente para GRE Score es 0.14236603022072775
El coeficiente para TOEFL Score es 0.07166836432596357
El coeficiente para University Rating es 0.36864030407185977
El coeficiente para SOP es 0.06557008012763495
El coeficiente para LOR  es 0.051195737619014615
El coeficiente para CGPA es 0.31249092932501377
El coeficiente para Research es 0.16043731376997478


## Evaluación del modelo

In [103]:
Y_pred = model.predict(X_test)

# Se imprime el coeficiente de determinación R^2
print("R^2: ", model.score(X_test, Y_test))

# Se imprimen las métricas de error
print("MSE: ", mse(Y_test, Y_pred))

R^2:  0.7490740337295853
MSE:  0.020755498994224165
MAPE:  590483688802508.2 %


## Exportar el modelo

In [104]:
# Usamos la lbreria joblib
filename = 'regresion.joblib'
# Se guarda
dump(model, filename) 

['regresion.joblib']