In [15]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, Normalizer, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from joblib import dump


In [16]:
data = pd.read_csv('../assets/university_admission_train.csv', index_col=0)

In [17]:
data.sample(5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Admission Points
771,42,313,120,2,1.02,2.03,8.48,1,51.01
308,372,295,108,1,1.83,2.0,7.77,0,49.01
337,245,314,116,2,2.5,4.0,8.56,0,63.0
634,375,287,101,2,2.0,2.5,7.66,0,38.25
86,149,299,112,1,1.5,2.91,7.53,0,47.54


In [18]:
data.isnull().sum()

Serial No.            0
GRE Score             0
TOEFL Score           0
University Rating     0
SOP                   0
LOR                   0
CGPA                  0
Research              0
Admission Points     65
dtype: int64

In [19]:
data.dropna(subset=['Admission Points'], inplace=True)

In [20]:
features = ['GRE Score','TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA', 'Research']

pipeline5 = Pipeline(
    [
        ('feature_selection', ColumnTransformer(
            [
                ('selector', 'passthrough', features),
                ('poly', PolynomialFeatures(degree = 2, include_bias = False), ['SOP', 'TOEFL Score', 'GRE Score', 'University Rating','LOR ','CGPA', 'Research']),
            ]
        )),
        ('normalization', Normalizer()),
        ('model', LinearRegression())
    ]
)

In [21]:
x_train, y_train, x_test, y_test = train_test_split(data.drop('Admission Points', axis=1), data['Admission Points'], test_size=0.3, random_state=1)

In [22]:
x_train.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
192,416,310,96,3,2.08,2.34,7.76,0
1556,463,307,105,4,3.0,3.0,8.57,0
1282,152,299,88,5,5.0,5.0,6.49,1
496,350,313,105,3,2.5,3.0,6.61,0
247,9,306,93,1,2.05,1.81,8.12,0


In [23]:
x_test.head()

192     45.93
1556    62.00
1282    94.00
496     62.00
247     47.51
Name: Admission Points, dtype: float64

In [24]:
y_train.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
92,205,298,105,3,3.5,4.0,6.68,0
76,397,325,107,3,1.5,3.5,9.11,1
1265,70,328,91,4,4.5,4.68,9.16,1
338,345,295,119,2,1.5,2.0,7.34,0
1410,276,322,110,3,3.42,3.88,8.47,1


In [25]:
y_test.head()

92      69.0
76      84.0
1265    78.0
338     47.0
1410    78.0
Name: Admission Points, dtype: float64

In [26]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(1052, 8) (452, 8) (1052,) (452,)


In [27]:
x_train.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research
192,416,310,96,3,2.08,2.34,7.76,0
1556,463,307,105,4,3.0,3.0,8.57,0
1282,152,299,88,5,5.0,5.0,6.49,1
496,350,313,105,3,2.5,3.0,6.61,0
247,9,306,93,1,2.05,1.81,8.12,0


In [28]:
pipeline5.fit(x_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [1052, 452]

In [None]:
print(pipeline5.score(x_test, y_test))

In [None]:
dump(pipeline5, 'assets/modelo.joblib')