In [12]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [13]:
dataset_varianza = pd.read_csv('../CSVs/Dataset_varianza.csv')
dataset_varianza

Unnamed: 0,Temporada,Dif PTOS,Dif POS,Dif FORMA,Dif PTS L/V,Dif G AF,Dif G EC,Target
0,2009-10,-0.600,7,-2.0,0.133,0.000,-1.800,E
1,2009-10,-0.900,7,-8.0,-0.267,-0.800,-0.200,V
2,2009-10,0.500,-4,1.0,0.733,1.600,-1.200,L
3,2009-10,0.100,-3,0.0,0.333,0.000,-0.800,V
4,2009-10,-1.000,12,-6.0,-0.066,-0.400,0.000,E
...,...,...,...,...,...,...,...,...
2335,2017-18,-0.543,9,-9.0,-0.059,-0.353,-0.589,L
2336,2017-18,0.486,-7,-3.0,0.353,0.882,-1.177,E
2337,2017-18,-0.371,2,2.0,0.078,0.235,-0.059,L
2338,2017-18,0.143,-3,-7.0,0.313,0.588,-0.353,L


In [14]:
y = dataset_varianza['Target']
X = dataset_varianza.iloc[:,1:-1]

In [15]:
validation_size = 0.20
seed = 7
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X,
                                                                    y,
                                                                    test_size=validation_size,
                                                                    random_state=seed)

In [16]:
model = linear_model.LogisticRegression(max_iter = 1000)

In [17]:
name='Logistic Regression'
kfold = model_selection.StratifiedKFold(n_splits=10) #Parte los datos en 10 trozos para usar validación cruzada / cross validation
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')

msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(cv_results)
print(msg)

[0.55319149 0.56382979 0.55080214 0.50802139 0.58823529 0.52941176
 0.54010695 0.5828877  0.54545455 0.55614973]
Logistic Regression: 0.551809 (0.022484)


In [18]:
model.fit(X_train, Y_train)

LogisticRegression(max_iter=1000)

In [19]:
model.score(X_test, Y_test)

0.5470085470085471

In [None]:
import pickle

with open('my_model_va', 'wb') as archivo_salida:
    pickle.dump(model, archivo_salida)

In [20]:
predicions_proba = model.predict_proba(X_test)
predicions_proba[-6]

array([0.27278067, 0.20754242, 0.51967691])

In [13]:
predicions = model.predict(X_test)
predicions

array(['L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
       'L', 'L', 'L', 'V', 'L', 'V', 'L', 'L', 'L', 'L', 'V', 'L', 'V',
       'L', 'L', 'L', 'L', 'L', 'V', 'L', 'L', 'L', 'V', 'L', 'L', 'L',
       'V', 'L', 'L', 'V', 'V', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
       'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'V', 'L', 'L', 'V', 'L',
       'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'V',
       'L', 'V', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'V', 'L',
       'L', 'L', 'L', 'L', 'V', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
       'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'V', 'L', 'L',
       'L', 'L', 'L', 'L', 'L', 'V', 'L', 'L', 'L', 'L', 'L', 'L', 'V',
       'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'V', 'L', 'L', 'L', 'L',
       'L', 'V', 'L', 'L', 'L', 'L', 'L', 'V', 'V', 'L', 'L', 'L', 'L',
       'L', 'L', 'L', 'V', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'L', 'V',
       'L', 'V', 'L', 'V', 'L', 'L', 'L', 'L', 'L', 'L', 'V', 'V

In [14]:
y_test_list = list(Y_test)
y_test_list

['L',
 'L',
 'E',
 'L',
 'E',
 'E',
 'E',
 'E',
 'E',
 'L',
 'L',
 'E',
 'E',
 'L',
 'V',
 'V',
 'V',
 'E',
 'V',
 'E',
 'L',
 'V',
 'V',
 'L',
 'V',
 'V',
 'L',
 'L',
 'L',
 'V',
 'E',
 'V',
 'L',
 'L',
 'E',
 'V',
 'L',
 'V',
 'L',
 'V',
 'L',
 'L',
 'V',
 'V',
 'V',
 'V',
 'L',
 'E',
 'V',
 'L',
 'L',
 'L',
 'L',
 'E',
 'E',
 'L',
 'L',
 'L',
 'V',
 'L',
 'V',
 'E',
 'V',
 'E',
 'L',
 'V',
 'E',
 'L',
 'V',
 'V',
 'L',
 'L',
 'L',
 'L',
 'L',
 'E',
 'L',
 'V',
 'L',
 'E',
 'V',
 'L',
 'L',
 'V',
 'L',
 'E',
 'L',
 'L',
 'V',
 'E',
 'L',
 'L',
 'E',
 'L',
 'L',
 'V',
 'V',
 'E',
 'L',
 'L',
 'V',
 'L',
 'L',
 'L',
 'E',
 'L',
 'E',
 'L',
 'L',
 'V',
 'L',
 'L',
 'L',
 'V',
 'L',
 'L',
 'L',
 'E',
 'L',
 'E',
 'V',
 'V',
 'V',
 'L',
 'V',
 'L',
 'L',
 'L',
 'L',
 'L',
 'E',
 'L',
 'V',
 'L',
 'L',
 'E',
 'L',
 'L',
 'V',
 'V',
 'L',
 'L',
 'L',
 'E',
 'V',
 'L',
 'V',
 'L',
 'V',
 'E',
 'V',
 'E',
 'L',
 'L',
 'L',
 'L',
 'E',
 'L',
 'L',
 'E',
 'L',
 'L',
 'L',
 'V',
 'V',
 'V',
 'E'

In [15]:
resultados_Y = pd.DataFrame((predicions_proba),columns=[model.classes_])
resultados_Y['Real']=y_test_list
resultados_Y['Predictions'] = predicions
resultados_Y

Unnamed: 0,E,L,V,Real,Predictions
0,0.169459,0.702575,0.127966,L,L
1,0.189555,0.704072,0.106373,L,L
2,0.282266,0.503624,0.214110,E,L
3,0.185633,0.633835,0.180532,L,L
4,0.243063,0.527174,0.229763,E,L
...,...,...,...,...,...
463,0.157287,0.725295,0.117418,L,L
464,0.180577,0.103065,0.716357,V,V
465,0.210067,0.556395,0.233538,V,L
466,0.238089,0.593994,0.167917,L,L


In [16]:
resultados_Y.iloc[0,0]

0.16945923338413

In [17]:
resultados_Y.columns

MultiIndex([(          'E',),
            (          'L',),
            (          'V',),
            (       'Real',),
            ('Predictions',)],
           )

In [18]:
# 1ª Opción Empate
pronóstico_lista_x = []
for i in range(len(resultados_Y)):
    E = resultados_Y.iloc[i,0]
    L = resultados_Y.iloc[i,1]
    V = resultados_Y.iloc[i,2]
    lista = [E,L,V]
    maximo = (max(lista))
    if maximo < 0.6: 
        if lista.index(maximo)==1:
            pronóstico = 'LE'
            pronóstico_lista_x.append(pronóstico)
        elif lista.index(maximo)==2:
            pronóstico = 'EV'
            pronóstico_lista_x.append(pronóstico)
    else: 
        if lista.index(maximo) ==1: 
            pronóstico = 'L'
            pronóstico_lista_x.append(pronóstico)

        elif lista.index(maximo) ==2: 
            pronóstico = 'V'
            pronóstico_lista_x.append(pronóstico)

        elif lista.index(maximo) ==0: 
            pronóstico = 'E'
            pronóstico_lista_x.append(pronóstico)
    

In [19]:
#pronóstico_lista_2aopción
pronóstico_dosopc = []
for i in range(len(resultados_Y)):
    E = resultados_Y.iloc[i,0]
    L = resultados_Y.iloc[i,1]
    V = resultados_Y.iloc[i,2]
    lista = [E,L,V]
    maximo = (max(lista))
    maximo2 = sorted(lista)[1]

    if maximo < 0.6: 
        if lista.index(maximo)==1 and lista.index(maximo2)==2:
            pronóstico = 'LV'
            pronóstico_dosopc.append(pronóstico)
        elif lista.index(maximo)==1 and lista.index(maximo2)==0:
            pronóstico = 'LE'
            pronóstico_dosopc.append(pronóstico)
        elif lista.index(maximo)==2 and lista.index(maximo2)==1:
            pronóstico = 'VL'
            pronóstico_dosopc.append(pronóstico)
        elif lista.index(maximo)==2 and lista.index(maximo2)==0:
            pronóstico = 'VE'
            pronóstico_dosopc.append(pronóstico)
    else: 
        if lista.index(maximo) ==1: 
            pronóstico = 'L'
            pronóstico_dosopc.append(pronóstico)

        elif lista.index(maximo) ==2: 
            pronóstico = 'V'
            pronóstico_dosopc.append(pronóstico)

        elif lista.index(maximo) ==0: 
            pronóstico = 'E'
            pronóstico_dosopc.append(pronóstico)

In [20]:
resultados_Y['Doble Opcion1'] = pronóstico_lista_x
resultados_Y['Doble Opcion2'] = pronóstico_dosopc

In [21]:
resultados_Y['Doble Opcion1'].value_counts()

(Doble Opcion1,)
LE                  242
L                   141
EV                   59
V                    26
dtype: int64

In [22]:
resultados_Y['Doble Opcion2'].value_counts()

(Doble Opcion2,)
L                   141
LV                  137
LE                  105
VL                   31
VE                   28
V                    26
dtype: int64

In [109]:
resultados_Y

Unnamed: 0,E,L,V,Real,Predictions,Doble Opcion1,Doble Opcion2
0,0.169459,0.702575,0.127966,L,L,L,L
1,0.189555,0.704072,0.106373,L,L,L,L
2,0.282266,0.503624,0.214110,E,L,LE,LE
3,0.185633,0.633835,0.180532,L,L,L,L
4,0.243063,0.527174,0.229763,E,L,LE,LE
...,...,...,...,...,...,...,...
463,0.157287,0.725295,0.117418,L,L,L,L
464,0.180577,0.103065,0.716357,V,V,V,V
465,0.210067,0.556395,0.233538,V,L,LE,LV
466,0.238089,0.593994,0.167917,L,L,LE,LE


In [120]:
lista_acierto_dobles = []

for i in range(len(resultados_Y)): 
    doble_opcion = resultados_Y.iloc[i,6]
    if resultados_Y.iloc[i,3] in doble_opcion: 
        lista_acierto_dobles.append(1)
    else: 
        lista_acierto_dobles.append(0)



In [121]:
resultados_Y['Aciertos'] = lista_acierto_dobles
resultados_Y['Aciertos'].value_counts()

(Aciertos,)
1              346
0              122
dtype: int64

In [122]:
resultados_Y

Unnamed: 0,E,L,V,Real,Predictions,Doble Opcion1,Doble Opcion2,Aciertos,Apuntes Real
0,0.169459,0.702575,0.127966,L,L,L,L,1,L
1,0.189555,0.704072,0.106373,L,L,L,L,1,L
2,0.282266,0.503624,0.214110,E,L,LE,LE,1,LE
3,0.185633,0.633835,0.180532,L,L,L,L,1,L
4,0.243063,0.527174,0.229763,E,L,LE,LE,1,LE
...,...,...,...,...,...,...,...,...,...
463,0.157287,0.725295,0.117418,L,L,L,L,1,L
464,0.180577,0.103065,0.716357,V,V,V,V,1,V
465,0.210067,0.556395,0.233538,V,L,LE,LV,1,VA
466,0.238089,0.593994,0.167917,L,L,LE,LE,1,LE


In [127]:
lista_apunte_real = []
for i in range(len(resultados_Y)):
    if resultados_Y.iloc[i,7]==0:
        apuntes_real = 'VA'
        lista_apunte_real.append(apuntes_real)
    elif resultados_Y.iloc[i,7]==1:
        apuntes_real = resultados_Y.iloc[i,6]
        lista_apunte_real.append(apuntes_real)

resultados_Y['Apuntes Real'] = lista_apunte_real
resultados_Y

Unnamed: 0,E,L,V,Real,Predictions,Doble Opcion1,Doble Opcion2,Aciertos,Apuntes Real
0,0.169459,0.702575,0.127966,L,L,L,L,1,L
1,0.189555,0.704072,0.106373,L,L,L,L,1,L
2,0.282266,0.503624,0.214110,E,L,LE,LE,1,LE
3,0.185633,0.633835,0.180532,L,L,L,L,1,L
4,0.243063,0.527174,0.229763,E,L,LE,LE,1,LE
...,...,...,...,...,...,...,...,...,...
463,0.157287,0.725295,0.117418,L,L,L,L,1,L
464,0.180577,0.103065,0.716357,V,V,V,V,1,V
465,0.210067,0.556395,0.233538,V,L,LE,LV,1,LV
466,0.238089,0.593994,0.167917,L,L,LE,LE,1,LE


In [128]:
resultados_Y['Apuntes Real'].value_counts()

(Apuntes Real,)
VA                 122
LV                 101
L                   99
LE                  80
VL                  23
V                   22
VE                  21
dtype: int64

In [129]:
1-122/468

0.7393162393162394

In [147]:
resultados_Y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468 entries, 0 to 467
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   (E,)              468 non-null    float64
 1   (L,)              468 non-null    float64
 2   (V,)              468 non-null    float64
 3   (Real,)           468 non-null    object 
 4   (Predictions,)    468 non-null    object 
 5   (Doble Opcion1,)  468 non-null    object 
 6   (Doble Opcion2,)  468 non-null    object 
 7   (Aciertos,)       468 non-null    int64  
 8   (Apuntes Real,)   468 non-null    object 
dtypes: float64(3), int64(1), object(5)
memory usage: 33.0+ KB


In [141]:
# Errores
VA_tabla = resultados_Y.copy()
VA_tabla.to_csv('VA_tabla.csv',index=False)

In [152]:
resultados_Y['Doble Opcion2'].value_counts()

(Doble Opcion2,)
L                   141
LV                  137
LE                  105
VL                   31
VE                   28
V                    26
dtype: int64

In [164]:
Va_tabla1 = pd.read_csv('VA_tabla.csv')
Va_tabla1 = Va_tabla1[(Va_tabla1['Apuntes Real'] == 'VA')]
Va_tabla1

Unnamed: 0,E,L,V,Real,Predictions,Doble Opcion1,Doble Opcion2,Aciertos,Apuntes Real
5,0.219310,0.543440,0.237250,E,L,LE,LV,0,VA
8,0.149325,0.783481,0.067193,E,L,L,L,0,VA
12,0.229751,0.441899,0.328350,E,L,LE,LV,0,VA
17,0.212441,0.487868,0.299691,E,L,LE,LV,0,VA
19,0.239739,0.470172,0.290089,E,L,LE,LV,0,VA
...,...,...,...,...,...,...,...,...,...
451,0.189255,0.658644,0.152101,V,L,L,L,0,VA
454,0.235533,0.515997,0.248470,E,L,LE,LV,0,VA
460,0.262078,0.521896,0.216025,V,L,LE,LE,0,VA
461,0.222707,0.522267,0.255026,E,L,LE,LV,0,VA


In [160]:
Va_tabla1['Doble Opcion2'].value_counts()

L     42
LV    36
LE    25
VL     8
VE     7
V      4
Name: Doble Opcion2, dtype: int64

In [159]:
Reparto_errores = pd.DataFrame(list(zip(resultados_Y['Doble Opcion2'].value_counts(),Va_tabla1['Doble Opcion2'].value_counts())),columns=['Reparto Pred','Nª Errores'])
Reparto_errores['% Errores'] = round(Reparto_errores['Nª Errores']/Reparto_errores['Reparto Pred'],2)
Reparto_errores

Unnamed: 0,Reparto Pred,Nª Errores,% Errores
0,141,42,0.3
1,137,36,0.26
2,105,25,0.24
3,31,8,0.26
4,28,7,0.25
5,26,4,0.15


In [155]:
36*4

144

In [130]:
from sklearn.metrics import accuracy_score

acierto = accuracy_score(resultados_Y['Apuntes Real'], resultados_Y['Doble Opcion2'])
acierto

0.7393162393162394

# QUE ME QUEDA, HACER UNA CRIBA... DE INFERIOR A 60% COGER X Y VER % ACIERTO

In [21]:
model.classes_

array(['E', 'L', 'V'], dtype=object)