In [15]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, Normalizer, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from joblib import dump


In [16]:
data = pd.read_csv('../assets/university_admission_train.csv', index_col=0)

In [17]:
data.sample(5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
58,288,328,107,4,4.0,4.11,9.23,0,89.36
1493,292,300,87,2,1.5,2.0,7.87,0,56.0
984,251,320,93,3,3.0,2.75,6.87,1,74.0
465,470,320,116,4,4.0,3.98,9.26,1,86.5
921,29,295,93,1,2.0,2.0,7.2,0,46.0


In [18]:
data.isnull().sum()

Serial No.            0
GRE Score             0
TOEFL Score           0
University Rating     0
SOP                   0
LOR                   0
CGPA                  0
Research              0
Admission Points     65
dtype: int64

In [19]:
data.dropna(subset=['Admission Points'], inplace=True)

In [20]:
data.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Admission Points     0
dtype: int64

In [21]:
features = ['GRE Score','TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA', 'Research']

In [22]:
pipeline = Pipeline(
    [
        ('feature_selection', ColumnTransformer(
            [
                ('selector', 'passthrough', features),
                ('poly', PolynomialFeatures(degree = 2, include_bias = False), features)
            ]
        )),
        ('normalizer', Normalizer()),
        ('model', LinearRegression())
    ]
)

In [23]:
x_train, x_test, y_train, y_test = train_test_split(data.drop('Admission Points', axis=1), data['Admission Points'], test_size=0.3, random_state=1)

In [24]:
x_train.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
192,416,310,96,3,2.08,2.34,7.76,0
1556,463,307,105,4,3.0,3.0,8.57,0
1282,152,299,88,5,5.0,5.0,6.49,1
496,350,313,105,3,2.5,3.0,6.61,0
247,9,306,93,1,2.05,1.81,8.12,0


In [25]:
x_test.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
92,205,298,105,3,3.5,4.0,6.68,0
76,397,325,107,3,1.5,3.5,9.11,1
1265,70,328,91,4,4.5,4.68,9.16,1
338,345,295,119,2,1.5,2.0,7.34,0
1410,276,322,110,3,3.42,3.88,8.47,1


In [26]:
y_train.head()

192     45.93
1556    62.00
1282    94.00
496     62.00
247     47.51
Name: Admission Points, dtype: float64

In [27]:
y_test.head()

92      69.0
76      84.0
1265    78.0
338     47.0
1410    78.0
Name: Admission Points, dtype: float64

In [28]:
pipeline.fit(x_train, y_train)

In [29]:
print(pipeline.score(x_test, y_test))

0.7335793878823741


In [30]:
print(pipeline.score(x_train, y_train))

0.7353116185379025


In [31]:
np.sqrt(mean_squared_error(y_test, pipeline.predict(x_test)))

9.333006219428704

In [32]:
y_test.describe()

count    452.000000
mean      66.293850
std       18.101668
min       34.000000
25%       48.880000
50%       66.000000
75%       82.000000
max      145.500000
Name: Admission Points, dtype: float64

In [33]:
dump(pipeline, '../assets/modelo.joblib')

['../assets/modelo.joblib']