In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import math
from joblib import dump, load

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import classification_report, plot_confusion_matrix  
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
dataRead = pd.read_csv('aprobados_reprobados.csv')
data = dataRead.drop(['ID','Nota primera oportunidad','Nota segunda oportunidad','intento'],axis=1)
x=data[["Session 2","Session 3","Session 4","Session 5","Session 6"]]
y=data[["notaMax"]]
data.head()


Unnamed: 0,Session 2,Session 3,Session 4,Session 5,Session 6,notaMax,aprobar
0,5.0,0.0,4.5,4.0,2.25,94.5,1
1,4.0,3.5,4.5,4.0,1.0,44.0,0
2,3.5,3.5,4.5,4.0,0.0,85.0,1
3,6.0,4.0,5.0,3.5,2.75,30.0,0
4,5.0,4.0,5.0,4.0,2.75,38.5,0


In [3]:
data.describe()

Unnamed: 0,Session 2,Session 3,Session 4,Session 5,Session 6,notaMax,aprobar
count,115.0,115.0,115.0,115.0,115.0,115.0,115.0
mean,2.886957,2.134783,3.943478,3.030435,1.695652,47.465217,0.426087
std,2.093286,1.545114,1.53149,1.485732,1.219713,33.511828,0.496671
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.5,4.0,3.0,0.125,18.0,0.0
50%,3.5,2.5,4.5,3.5,2.0,49.0,0.0
75%,4.5,3.5,5.0,4.0,2.75,78.0,1.0
max,6.0,4.0,5.0,4.0,4.0,98.0,1.0


## Normalización de los datos

In [4]:
scaler = MinMaxScaler(feature_range=(0, 1))
xNormalizado = scaler.fit_transform(x)
yNormalizado = scaler.fit_transform(y)

## Division del set de entrenamiento y prueba

In [5]:
X_train, X_test, y_train, y_test = train_test_split(xNormalizado, yNormalizado, test_size = 0.10)

In [6]:
def getParametros(X_train,Y_train,param_grid):
    grid = GridSearchCV(SVR(), param_grid,refit=True,cv=5,n_jobs=-1,verbose=2)  # verbose se usa solo para efectos de demostración
    grid.fit(X_train, Y_train)
    return {"C": grid.best_estimator_.C,"Gamma":grid.best_estimator_.gamma,"kernel:":grid.best_estimator_.kernel}

## Tuneando parámetros

In [7]:
param_grid = {
    'C': [0.1,1, 5,10,20,50,75,100], 
    'gamma': [1,0.1,0.01,10,50,100],
    'kernel': ['rbf', 'linear','sigmoid'],
}
getParametros(X_train,y_train,param_grid)


Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:    2.9s finished
  return f(**kwargs)


{'C': 20, 'Gamma': 0.1, 'kernel:': 'rbf'}

## Realizando el entrenamiento

In [8]:
svr=SVR(C=5, gamma=1, kernel='rbf', max_iter=-1)
#svr=SVR(kernel='rbf')
svr.fit(X_train,y_train)

  return f(**kwargs)


SVR(C=5, gamma=1)

## Obteniendo metricas

In [9]:
prediccion=svr.predict(X_test)
prediccionEsc = np.resize(prediccion,(12,1))
prediccionEscalada = scaler.inverse_transform(prediccionEsc)
y_testEscOriginal = scaler.inverse_transform(y_test)

accuracy = svr.score(X_train, y_train)
print("Accuracy SVR: {:.3f}".format(accuracy))
testScore = math.sqrt(mean_squared_error(y_testEscOriginal,prediccionEscalada))
print('SVR: %.2f RMSE' % (testScore))

testScore = r2_score(y_test, prediccion)
print('SVR: %.2f R^2' % (testScore))

Accuracy SVR: 0.739
SVR: 17.34 RMSE
SVR: 0.73 R^2


array([[ 0. ],
       [ 0. ],
       [43. ],
       [92.5],
       [78. ],
       [40.5],
       [59. ],
       [79.5],
       [95. ],
       [69. ],
       [ 0. ],
       [46. ]])

In [11]:
prediccionEscalada

array([[17.19149886],
       [14.38863499],
       [29.34835437],
       [74.74969769],
       [69.91196414],
       [60.09293478],
       [85.10730064],
       [77.54776359],
       [67.82424306],
       [44.49588042],
       [ 0.39250351],
       [34.51986146]])

In [15]:
dump(svr,'svr.joblib')

['svr.joblib']