# Se procede a limpiar los datos y probar distintos modelos

In [17]:
import pandas as pd
import re, string

In [18]:
data = pd.read_csv('data_files/Datos.csv')
data = data.drop(['i'],axis=1)
# Se aleatoriza la muestra
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,Texto,class
0,14 Jun 2022\n¿Qué está pasando con el precio d...,0
1,Marcel Pechman\n27 Jun 2022\nLas perspectivas ...,0
2,"09 Jun 2022\nBitcoin: ¿""Refugio seguro"" o ""act...",0
3,Marcel Pechman\n13 Jun 2022\nDatos de los deri...,0
4,William Suberg\n24 May 2022\nEl precio de bitc...,1


In [19]:
# Normalización del texto
def wordopt(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

data['Texto'] = data['Texto'].apply(wordopt)
data.head()

Unnamed: 0,Texto,class
0,jun qué está pasando con el precio de bitco...,0
1,marcel pechman jun las perspectivas del prec...,0
2,jun bitcoin refugio seguro o activo de ...,0
3,marcel pechman jun datos de los derivados de...,0
4,william suberg may el precio de bitcoin vuel...,1


In [21]:
# Creación de datos de prueba y entrenamiento
from sklearn.model_selection import train_test_split

x = data["Texto"]
y = data["class"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [22]:
# Conversión de texto a vectores
from sklearn.feature_extraction.text import TfidfVectorizer

vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [60]:
data = list(xv_test.toarray())

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0.        , 0.15749031, 0.        , ..., 0.        , 0.        ,
        0.        ]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([

# Implementación de modelos

In [23]:
from sklearn.metrics import classification_report

## Modelo 1

In [89]:
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier(random_state = 0, n_estimators=500, loss='log_loss',max_depth=3,learning_rate=0.05,min_weight_fraction_leaf=0.5)
GBC.fit(xv_train, y_train)

In [90]:
pred_gbc = GBC.predict(xv_test)
GBC.score(xv_test, y_test)

0.4444444444444444

In [44]:
print(classification_report(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       0.38      0.55      0.44        11
           1       0.55      0.38      0.44        16

    accuracy                           0.44        27
   macro avg       0.46      0.46      0.44        27
weighted avg       0.48      0.44      0.44        27



## Modelo 2

In [41]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state=0,penalty = 'l2')
LR.fit(xv_train, y_train)

In [42]:
pred_lr = LR.predict(xv_test)
LR.score(xv_test, y_test)

0.4074074074074074

In [106]:
print(classification_report(y_test, pred_lr,zero_division=1))

              precision    recall  f1-score   support

           0       0.41      1.00      0.58        11
           1       1.00      0.00      0.00        16

    accuracy                           0.41        27
   macro avg       0.70      0.50      0.29        27
weighted avg       0.76      0.41      0.24        27



## Modelo 3

In [82]:
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier(criterion='entropy',min_weight_fraction_leaf=0.5)
DTC.fit(xv_train, y_train)

In [83]:
pred_dtc = DTC.predict(xv_test)
DTC.score(xv_test, y_test)

0.5555555555555556

In [84]:
print(classification_report(y_test, pred_dtc))

              precision    recall  f1-score   support

           0       0.44      0.36      0.40        11
           1       0.61      0.69      0.65        16

    accuracy                           0.56        27
   macro avg       0.53      0.53      0.52        27
weighted avg       0.54      0.56      0.55        27



## Modelo 4

In [109]:
from sklearn.ensemble import RandomForestClassifier
RFC = RandomForestClassifier(n_estimators=500,criterion='entropy',min_weight_fraction_leaf=0.5,min_samples_leaf=2)
RFC.fit(xv_train,y_train)

In [110]:
pred_rfc= RFC.predict(xv_test)
RFC.score(xv_test,y_test)

0.4074074074074074

In [105]:
print(classification_report(y_test, pred_rfc,zero_division=1))

              precision    recall  f1-score   support

           0       0.41      1.00      0.58        11
           1       1.00      0.00      0.00        16

    accuracy                           0.41        27
   macro avg       0.70      0.50      0.29        27
weighted avg       0.76      0.41      0.24        27



# Se exporta el mejor modelo

In [111]:
from joblib import dump
dump(DTC,'modelos/DTClassifier.joblib')

['modelos/DTClassifier.joblib']