In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.tree import export_graphviz,plot_tree
from six import StringIO
from IPython.display import Image  
import pydotplus
import collections

In [3]:
df_final = pd.read_csv('./data/datosprocesados2.csv', delimiter=',', decimal='.', encoding='UTF-8')
df_final.head()

Unnamed: 0,IDDIST,cluster,res_platform,score,platform,polarity,num_score
0,14.0,4.0,2,5.0,2,5,0.8002
1,14.0,4.0,2,5.0,2,5,0.8888
2,14.0,4.0,2,1.0,2,1,0.98
3,13.0,5.0,1,4.0,1,3,0.5998
4,13.0,5.0,1,5.0,1,5,0.5325


In [4]:
valores_score = df_final['score'].value_counts()
print(valores_score)

score
5.0    389655
4.0    210019
3.0     90142
1.0     37463
2.0     27787
Name: count, dtype: int64


In [5]:
valores_polarity = df_final['polarity'].value_counts()
print(valores_polarity)

polarity
5    406391
4    186836
3     70597
1     48633
2     42609
Name: count, dtype: int64


In [6]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 755066 entries, 0 to 755065
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   IDDIST        701056 non-null  float64
 1   cluster       752017 non-null  float64
 2   res_platform  755066 non-null  int64  
 3   score         755066 non-null  float64
 4   platform      755066 non-null  int64  
 5   polarity      755066 non-null  int64  
 6   num_score     755066 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 40.3 MB


In [7]:
df_final = df_final.dropna(subset=['IDDIST', 'cluster'])

In [8]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 698143 entries, 0 to 755065
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   IDDIST        698143 non-null  float64
 1   cluster       698143 non-null  float64
 2   res_platform  698143 non-null  int64  
 3   score         698143 non-null  float64
 4   platform      698143 non-null  int64  
 5   polarity      698143 non-null  int64  
 6   num_score     698143 non-null  float64
dtypes: float64(4), int64(3)
memory usage: 42.6 MB


In [9]:
X = df_final.drop('score', axis=1)  
y = df_final['score']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame

column_names = X_train.columns.tolist()

# Escalar
scaler = StandardScaler()

X_train_esc = scaler.fit_transform(X_train)
X_train = DataFrame(X_train_esc, columns=column_names)

X_test_esc = scaler.fit_transform(X_test)
X_test = DataFrame(X_test_esc, columns=column_names)

print ("scaler")

scaler


In [10]:
tree = DecisionTreeClassifier(criterion="entropy", random_state=0)
tree = tree.fit(X_train,y_train)

print('Accuracy para datos de entrenamiento: {:.3f}'.format(tree.score(X_train, y_train)))
print('Accuracy para datos de test: {:.3f}'.format(tree.score(X_test, y_test)))

Accuracy para datos de entrenamiento: 0.880
Accuracy para datos de test: 0.503


In [11]:
importances = tree.feature_importances_
feature_names = X_train.columns.tolist()

for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance}")

IDDIST: 0.1956542074008384
cluster: 0.08841693023676364
res_platform: 0.0028613918514786975
platform: 0.002691237999974575
polarity: 0.25661802269732753
num_score: 0.45375820981361714


In [12]:
# numero de árboles: 5, 10, 20, 50 y 100
from sklearn.ensemble import RandomForestClassifier
#   []
n_arboles = {5, 10, 20, 50, 100}
particion = 'gini'
max_depth = 10
min_samples_split = 10 
min_samples_leaf = 2
max_feature = 'auto'

randomForest = [RandomForestClassifier(n_estimators = i, criterion= particion,
                                       max_depth = max_depth, min_samples_split = min_samples_split,
                                      min_samples_leaf = min_samples_leaf, max_features =None) for i in n_arboles]

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Entrenar el modelo
dicc_forest = {}
dicc_reporte = {}

for i, model in enumerate(randomForest):
    # Entrenar el modelo con datos de entrenamiento
    model.fit(X_train, y_train)

    # Realizar predicciones
    y_pred = model.predict(X_test)

    # Calcular y mostrar la precisión del modelo
    precision = accuracy_score(y_test, y_pred)
    matriz_conf = confusion_matrix(y_test, y_pred)
    reporte = classification_report(y_test, y_pred)
    
    dicc_forest[model.n_estimators] = matriz_conf
    dicc_reporte[model.n_estimators] = reporte
    print(i)

0
1
2
3
4


In [14]:
print(dicc_forest[20])

[[ 4089   330  1465    65   808]
 [ 1260   461  2424   139   821]
 [  744   357  7191  2324  6111]
 [  280    45  4529  8919 25361]
 [  418    31  1529  6197 63731]]


In [15]:
print(dicc_forest[100])

[[ 4093   339  1450    60   815]
 [ 1262   495  2387   128   833]
 [  743   387  7142  2207  6248]
 [  287    49  4497  8714 25587]
 [  427    30  1500  6000 63949]]


In [16]:
for j in n_arboles:
    precision = np.diagonal(dicc_forest[j]).sum() / dicc_forest[j].sum()
    print("Cantidad de arboles:",j, "-> Precision:", precision)

Cantidad de arboles: 50 -> Precision: 0.6042584276905227
Cantidad de arboles: 100 -> Precision: 0.6044088262466966
Cantidad de arboles: 5 -> Precision: 0.6041223528063654
Cantidad de arboles: 20 -> Precision: 0.6043945025746801
Cantidad de arboles: 10 -> Precision: 0.6039074977261171


In [17]:
print(dicc_reporte[100])

              precision    recall  f1-score   support

         1.0       0.60      0.61      0.60      6757
         2.0       0.38      0.10      0.15      5105
         3.0       0.42      0.43      0.42     16727
         4.0       0.51      0.22      0.31     39134
         5.0       0.66      0.89      0.76     71906

    accuracy                           0.60    139629
   macro avg       0.51      0.45      0.45    139629
weighted avg       0.57      0.60      0.56    139629



In [18]:
import numpy as np
from sklearn.metrics import confusion_matrix
import pandas as pd

def analyze_confusion_matrix(cm, labels):
    totals = cm.sum(axis=1)
    errors = []
    
    # Iterar sobre la matriz de confusión
    for true_class in range(len(labels)):
        for pred_class in range(len(labels)):
            if true_class != pred_class:
                count = cm[true_class, pred_class]
                percentage = (count / totals[true_class]) * 100
                errors.append({
                    'true_class': labels[true_class],
                    'predicted_class': labels[pred_class],
                    'count': count,
                    'percentage': percentage
                })
    
    # Convertir la lista de errores a un DataFrame y ordenarlo
    df_errors = pd.DataFrame(errors)
    df_errors_sorted = df_errors.sort_values('percentage', ascending=False)
    
    # Mostrar los resultados
    print("Predicciones incorrectas más frecuentes:")
    for _, row in df_errors_sorted.iterrows():
        print(f"Clase real: {row['true_class']}, "
              f"Predicción: {row['predicted_class']}, "
              f"Frecuencia: {row['count']}, "
              f"Porcentaje: {row['percentage']:.2f}%")

In [19]:
class_names = [str(c) for c in sorted(y_train.unique())]
analyze_confusion_matrix(dicc_forest[100], class_names)

Predicciones incorrectas más frecuentes:
Clase real: 4.0, Predicción: 5.0, Frecuencia: 25587, Porcentaje: 65.38%
Clase real: 2.0, Predicción: 3.0, Frecuencia: 2387, Porcentaje: 46.76%
Clase real: 3.0, Predicción: 5.0, Frecuencia: 6248, Porcentaje: 37.35%
Clase real: 2.0, Predicción: 1.0, Frecuencia: 1262, Porcentaje: 24.72%
Clase real: 1.0, Predicción: 3.0, Frecuencia: 1450, Porcentaje: 21.46%
Clase real: 2.0, Predicción: 5.0, Frecuencia: 833, Porcentaje: 16.32%
Clase real: 3.0, Predicción: 4.0, Frecuencia: 2207, Porcentaje: 13.19%
Clase real: 1.0, Predicción: 5.0, Frecuencia: 815, Porcentaje: 12.06%
Clase real: 4.0, Predicción: 3.0, Frecuencia: 4497, Porcentaje: 11.49%
Clase real: 5.0, Predicción: 4.0, Frecuencia: 6000, Porcentaje: 8.34%
Clase real: 1.0, Predicción: 2.0, Frecuencia: 339, Porcentaje: 5.02%
Clase real: 3.0, Predicción: 1.0, Frecuencia: 743, Porcentaje: 4.44%
Clase real: 2.0, Predicción: 4.0, Frecuencia: 128, Porcentaje: 2.51%
Clase real: 3.0, Predicción: 2.0, Frecuencia

In [16]:
df = df_final.head(100000)

In [17]:
from sklearn.model_selection import train_test_split

X, y = df.drop('score',axis=1),df['score']

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [18]:
from sklearn import svm

#Cargamos también las librerías para hacer la matriz de confusión y la cross validation:

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold


In [19]:
from sklearn.svm import SVC

C = {0.1, 1, 10, 100, 1000}
kernel = 'rbf'
gamma = 'scale'

svc = [SVC(C=i, kernel=kernel, gamma=gamma) for i in C]
svc

[SVC(C=0.1), SVC(C=1), SVC(C=100), SVC(C=1000), SVC(C=10)]

In [20]:
dicc_svc = {}
dicc_reporte_svc = {}

for i, model in enumerate(svc):
    # Entrenar el modelo con datos de entrenamiento
    model.fit(X_train, y_train)

    # Realizar predicciones
    y_pred = model.predict(X_test)

    # Calcular y mostrar la precisión del modelo
    precision = accuracy_score(y_test, y_pred)
    matriz_conf = confusion_matrix(y_test, y_pred)
    reporte = classification_report(y_test, y_pred)
    
    dicc_svc[model.C] = matriz_conf
    dicc_reporte_svc[model.C] = reporte
    print(i)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


2
3
4


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [21]:
for k in C:
    print("Modelo C = ", k)
    cnt_correcto = np.diagonal(dicc_svc[k]).sum()
    print(dicc_svc[k])
    print("Cantidad clasificados correctamente:", cnt_correcto)

Modelo C =  0.1
[[ 674    0  172   28   86]
 [ 312    0  208   95  146]
 [ 313    0  323  470 1388]
 [ 137    0  112  392 4975]
 [ 211    0   61  200 9697]]
Cantidad clasificados correctamente: 11086
Modelo C =  1
[[ 617    0  229   52   62]
 [ 217    0  303  144   97]
 [ 175    0  461  828 1030]
 [  99    0  150  979 4388]
 [ 191    0   81  566 9331]]
Cantidad clasificados correctamente: 11388
Modelo C =  100
[[ 617    0  229   74   40]
 [ 212    0  308  199   42]
 [ 173    0  463 1444  414]
 [  98    0  151 2805 2562]
 [ 191    0   81 2258 7639]]
Cantidad clasificados correctamente: 11524
Modelo C =  1000
[[ 616    0  230   73   41]
 [ 212    3  307  195   44]
 [ 170    0  528 1378  418]
 [  97    0  179 2777 2563]
 [ 183    0   89 2250 7647]]
Cantidad clasificados correctamente: 11571
Modelo C =  10
[[ 617    0  229   74   40]
 [ 214    0  306  199   42]
 [ 173    0  463 1442  416]
 [  98    0  151 2804 2563]
 [ 191    0   81 2254 7643]]
Cantidad clasificados correctamente: 11527


In [22]:
for k in C:
    precision = np.diagonal(dicc_svc[k]).sum() / dicc_svc[k].sum()
    print("Modelo C:",k , "-> Precision:", precision)
    print(dicc_reporte_svc[k])

Modelo C: 0.1 -> Precision: 0.5543
              precision    recall  f1-score   support

         1.0       0.41      0.70      0.52       960
         2.0       0.00      0.00      0.00       761
         3.0       0.37      0.13      0.19      2494
         4.0       0.33      0.07      0.12      5616
         5.0       0.60      0.95      0.73     10169

    accuracy                           0.55     20000
   macro avg       0.34      0.37      0.31     20000
weighted avg       0.46      0.55      0.45     20000

Modelo C: 1 -> Precision: 0.5694
              precision    recall  f1-score   support

         1.0       0.47      0.64      0.55       960
         2.0       0.00      0.00      0.00       761
         3.0       0.38      0.18      0.25      2494
         4.0       0.38      0.17      0.24      5616
         5.0       0.63      0.92      0.74     10169

    accuracy                           0.57     20000
   macro avg       0.37      0.38      0.36     20000
weighted 