In [19]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
path = '/content/drive/MyDrive/TFG/textos_procesados_bien.csv'
df_procesado = pd.read_csv(path, usecols=['ODS', 'Texto', 'Tokens'])

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle 

In [22]:
# Variables X e Y para entrenar y probar el algoritmo SVM 
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df_procesado['Tokens'],df_procesado['ODS'],test_size=0.3, shuffle = True, random_state=0)

# TF-IDF
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(df_procesado['Tokens'])

Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [23]:
# Algoritmo SVM 
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', random_state=0, probability=True)
# Entrenamiento del algoritmo 
SVM.fit(Train_X_Tfidf,Train_Y)

# Evaluacion del modelo 
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print(predictions_SVM)

print(confusion_matrix(Test_Y, predictions_SVM))
print(classification_report(Test_Y, predictions_SVM))

# Se guarda el modelo 
filename = 'final_model.sav'
pickle.dump(SVM, open(filename, 'wb'))

filename2 = 'tfyidf_model.sav'
pickle.dump(Tfidf_vect, open(filename2, 'wb'))

SVM Accuracy Score ->  54.67023769668564
[13  6  3 ...  9 12 16]
[[ 23  17  15   2   2   2   6   6   0  10   0   1   0   0   0   2  16]
 [  8  63   8   1   1   0   1   1   0   0   0   3   1   0   0   2   4]
 [  6  10 347   4   2   0   3  15   1  12   1   2   4   0   0   1  21]
 [  3   1  12  90   4   1   0  10   6   5   0   5   2   0   0   4  14]
 [  1   0   9   1 110   1   1   2   1   4   0   0   0   0   0   2   3]
 [  0   0   2   0   0  18   0   0   2   0   0   1   3   1   0   0   0]
 [  2   0   2   0   0   0  58   1   1   0   3   6  21   0   0   0   2]
 [  2   1  35  14  10   0   3 202  12  26   1  11   4   0   0  28  35]
 [  0   0   9   5   0   1   4  16  42   3   4  10   1   0   0   3  15]
 [  7   1  17   8   8   0   3  21   3 146   1   2   1   0   0   1  32]
 [  1   0   7   1   2   0   6   8   7   3  25  14  17   0   0   5  17]
 [  1   1  15   0   1   1   9  15   2   1   7 106  32   0   0  18  12]
 [  0   0   6   3   2   1  14   6   0   0   8  30 124   2   1   4   9]
 [  0   0   

In [24]:
# Algoritmo SVM utilizando cross-validation
from sklearn.model_selection import cross_val_score

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', random_state=0)
scores = cross_val_score(SVM, Test_X_Tfidf, Test_Y, cv=10)

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
print(scores)

0.52 accuracy with a standard deviation of 0.03
[0.5083612  0.55518395 0.49498328 0.53177258 0.52842809 0.50501672
 0.47491639 0.5738255  0.46979866 0.52684564]


In [25]:
# Prueba del algoritmo SVM utilizando las variables de train para evaluar el modelo 
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', random_state=0)
SVM.fit(Train_X_Tfidf,Train_Y)

predictions_SVM = SVM.predict(Train_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Train_Y)*100)

print(confusion_matrix(Train_Y, predictions_SVM))
print(classification_report(Train_Y, predictions_SVM))

SVM Accuracy Score ->  83.56537964690685
[[ 171    8   15    2    0    1    1    4    0    6    0    0    1    0
     0    4    9]
 [   3  209    8    0    0    1    0    0    0    0    1    0    0    0
     0    0    4]
 [   2    4 1015    7    7    1    1    5    0    7    0    3    2    0
     0    2    5]
 [   3    2   12  294    1    1    1    8    1    8    0    3    0    0
     0    2    6]
 [   0    0    1    0  278    0    1    3    0    3    0    0    1    0
     0    1    1]
 [   0    0    1    0    0   79    0    0    0    0    0    2    1    0
     0    1    1]
 [   0    1    3    2    0    0  204    1    2    0    5    3   17    0
     0    1    0]
 [   2    0   22    7   10    0    4  701    5   37    1    5    7    0
     0   34   25]
 [   0    0    8    6    1    0    0   20  211    1    3   11    9    0
     0   14   11]
 [   7    0   15    7   15    0    1   19    0  462    0    2    0    0
     0    4   15]
 [   2    1    4    4    0    1   10   14    5    8  195   