In [25]:
import imp
import pandas as pd

from joblib import dump, load

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
#from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, mean_absolute_error
from sklearn.model_selection import train_test_split

import scipy.stats as stats

In [26]:
# Notebook-wide pandas display settings: show every column, cap rendered rows at 50.
DISPLAY_OPTIONS = {
    'display.max_columns': None,
    'display.max_rows': 50,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

## 1. Carga de los datos

In [27]:
# Read the training CSV (path is relative to the notebook's working directory).
train_csv_path = './data/university_admission_train.csv'
df = pd.read_csv(train_csv_path)

In [28]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(1569, 10)

In [29]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0.1,Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
0,0,479,327,113,4,4.0,2.77,8.88,1,84.47
1,1,446,301,92,1,1.85,1.5,7.71,0,45.08
2,2,336,297,100,1,2.41,1.59,7.89,0,47.42
3,3,20,303,98,3,3.5,3.0,8.5,0,62.0
4,4,432,320,94,2,1.38,3.5,8.78,1,73.0


### Eliminamos la columna Unnamed: 0

In [30]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier to_csv — TODO confirm).
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Eliminamos todas las filas que tengan algún valor nulo

In [31]:
# Discard every row that has a missing value in ANY column, not only in the target
# (Admission Points). The pipeline built below has no imputation step, so both the
# predictors and the target must be free of nulls.
df = df.dropna(axis=0, how='any')

# Dividimos los datos en train y en test

In [32]:
# Split into 80% train / 20% test with a fixed seed for reproducibility.
# The target column is removed from the feature frame explicitly, so the label cannot
# reach the model as a predictor even if the downstream column selection changes.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(columns=["Admission Points"]),
    df["Admission Points"],
    test_size=0.2,
    random_state=1,
)

## Entrenamiento de un primer modelo de regresión

## Creamos el ColumnTransformer
Aquí normalizamos entre 0 y 1 las columnas que vamos a utilizar y eliminamos las columnas que no necesitamos.

In [33]:
# Predictors rescaled to the [0, 1] range before being handed to the model.
scaled_features = ["GRE Score", "University Rating", "SOP", "CGPA", "Research"]
# Columns explicitly discarded. NOTE: "LOR " is written with a trailing space;
# presumably that matches the CSV header exactly — keep it as is.
discarded_features = ['Serial No.', "TOEFL Score", "LOR "]

ct = ColumnTransformer(
    transformers=[
        ("gre_preprocess", MinMaxScaler(), scaled_features),
        ("drop_columns", "drop", discarded_features),
    ]
)

### Creamos el PipeLine 
Creamos el Pipeline con el ColumnTransformer que definimos arriba y con un modelo de regresión lineal.

In [34]:
# Two-stage pipeline: column selection/scaling first, then an ordinary
# least-squares linear regression on the transformed features.
pipeline = Pipeline(steps=[
    ('feature_selection', ct),
    ('model', LinearRegression()),
])

In [35]:
# Fit the scaler and the regression on the training split only
# (fit() returns the pipeline itself, so the name is simply rebound).
pipeline = pipeline.fit(X_train, y=Y_train)

## 4. Analisis de coeficientes

In [36]:
# Predictions of the fitted pipeline on both splits; reused by the MAE cells below.
preds_train, preds_test = (
    pipeline.predict(split) for split in (X_train, X_test)
)

In [37]:
# R^2 of the pipeline on the training split (score() of the final regression step).
pipeline.score(X_train, y=Y_train)

0.70333362037035

In [38]:
# R^2 of the pipeline on the held-out test split.
pipeline.score(X_test, y=Y_test)

0.7017157637189441

In [39]:
# Mean absolute error on the training split, in the same units as Admission Points.
mae_error_train = mean_absolute_error(y_true=Y_train, y_pred=preds_train)
print("MAE ERRROR TRAIN: ", mae_error_train)

MAE ERRROR TRAIN:  7.9474396268892535


In [40]:
# Mean absolute error on the held-out test split, in the same units as Admission Points.
mae_error_test = mean_absolute_error(y_true=Y_test, y_pred=preds_test)
print("MAE ERRROR TEST: ", mae_error_test)

MAE ERRROR TEST:  7.3845448676606535


## 5. Persistencia del modelo

In [41]:
# Location of the serialized pipeline (relative to the notebook's working directory).
filename = "./assets/modelo.joblib"

In [42]:
# Serialize the fitted pipeline to disk; dump() returns the list of files it wrote,
# which is what the cell displays.
dump(value=pipeline, filename=filename)

['./assets/modelo.joblib']

In [43]:
# Reload the persisted pipeline to check the round trip.
# NOTE: joblib files are pickle-based — only load files from a trusted source.
pipeline2 = load(filename=filename)

In [44]:
# Predict on the whole training frame with the reloaded pipeline.
# The predictions are stored in a separate column of a NEW frame so that the ground-truth
# 'Admission Points' in df is not overwritten; overwriting it would make the notebook
# non-idempotent (re-running the split/fit cells would train on the model's own output).
df_with_predictions = df.assign(
    **{'Predicted Admission Points': pipeline2.predict(df)}
)

In [45]:
# R^2 of the reloaded pipeline on the held-out split; it should match the value
# obtained from the in-memory pipeline before persisting.
pipeline2.score(X_test, y=Y_test)

0.7017157637189441

In [46]:
# Read the separate test CSV on which the reloaded pipeline is asked to predict.
test_csv_path = './data/university_admission_test.csv'
df2 = pd.read_csv(test_csv_path)

In [47]:
# Predicted Admission Points for every row of the test file.
p = pipeline2.predict(X=df2)

In [48]:
# Print the array of predictions computed for the test file.
print(p)

[63.33358268 78.04644914 64.02273941 89.5704423  58.87201605 52.2810251
 87.77233001 87.24044766 60.87427691 90.20514343 51.72726575 52.65564341
 77.5251663  57.59500509 49.15911994 88.35632471 96.51865018 65.73870943
 85.31281881 53.29419037 80.21146843 91.73279637 93.3450307  61.79144281
 57.92763186 45.29338367 96.10263273 52.89155572 74.49138755 66.07403256
 56.30660175 45.15712428 90.2543173  49.19230923 93.2405874  69.31681756
 89.40062192 47.76290199 59.45926836 51.87191442 92.32303455 49.5812899
 53.84729599 71.76713439 56.50177534 53.6969431  69.24011942 85.86672226
 44.18831826 53.82538189 50.72431304 60.67929595 65.89766352 97.28319601
 59.56585077 78.20674779 55.12908854 92.32303455 77.94886234 55.23428611
 75.75181483 88.90572651 50.79845965 54.75494894 52.92288663 61.0122617
 88.38700835 86.80127919 76.11115772 59.41639778 52.0120328  81.6396009
 61.57009752 44.70846197 97.28319601 77.81659493 44.21786278 77.00634322
 65.64632673 81.79536367 89.40062192 52.4463874  99.111