In [2]:
# @title Dependencias e imports
!pip install contractions gensim

import pandas as pd
import numpy as np
import tensorflow as tf
import random
import os
import re
import string

from bs4 import BeautifulSoup
import contractions
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from gensim.parsing.preprocessing import remove_stopwords

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyasc

In [3]:
# @title Descargo los datos y pongo una semiila

# SEMILLA para que siempre de el mismo resultado
SEMILLA = 42
random.seed(SEMILLA)
np.random.seed(SEMILLA)
tf.random.set_seed(SEMILLA)

URL_BASE = '/kaggle/input/nlp-getting-started/'
df_train = pd.read_csv(URL_BASE + "train.csv")
df_test = pd.read_csv(URL_BASE + "test.csv")

In [4]:
# @title Limpieza y features

def remove_tweet_username(text):
    return re.sub(r'@[^\s]+', '', text)

def make_lower(text):
    return text.lower()

def cont_exp(text):
    return contractions.fix(text)

def make_string(text):
    return str(text)

def remove_url(text):
    return re.sub(
        r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))'
        r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
        '',
        text
    )

def remove_email(text):
    return re.sub(
        r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',
        '',
        text,
        flags=re.IGNORECASE
    )

def remove_rt(text):
    text = str(text)
    return re.sub(r'\brt\b', '', text, flags=re.IGNORECASE).strip()

def remove_html(text):
    return BeautifulSoup(text, 'lxml').get_text().strip()

def remove_special_chars(text):
    text = re.sub(r'[^\w]+', ' ', text)
    text = ' '.join(text.split())
    return text

def resub(text):
    return re.sub(r"(.)\1{2,}", r"\1", text)

def get_clean_data(text):
    text = make_string(text)
    text = remove_url(text)
    text = remove_email(text)
    text = remove_html(text)
    text = make_lower(text)
    text = cont_exp(text)
    text = remove_rt(text)
    text = resub(text)
    text = remove_tweet_username(text)
    text = remove_special_chars(text)
    return text


df_train['text'] = df_train['text'].apply(get_clean_data)
df_test['text'] = df_test['text'].apply(get_clean_data)

#Sacamos las StopWords
df_train['text'] = df_train['text'].apply(remove_stopwords)
df_test['text'] = df_test['text'].apply(remove_stopwords)

# Sacamos los digitos
df_train['text'] = df_train['text'].str.replace(r'\d+', '', regex=True)
df_test['text'] = df_test['text'].str.replace(r'\d+', '', regex=True)


In [19]:
# @title Split + Definicion del modelo y embedding
tfidf = TfidfVectorizer(
    min_df=2,
    max_df=0.95,
    ngram_range=(1,2),
    sublinear_tf=True
)

X = tfidf.fit_transform(df_train['text'])
y = df_train['target']


X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.25,
    random_state=SEMILLA,
    stratify=y
)

svc = LinearSVC(
    C=1.0,
    max_iter=5000
)

svc.fit(X_train, y_train)


In [20]:

from sklearn.metrics import (
    make_scorer,
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report
)
pred = svc.predict(X_valid)

# F1
from sklearn.metrics import f1_score
f1 = f1_score(y_valid, pred, average='weighted')
print(f'F1: {f1}')

# La accuracy es una metrica que se mide en:
#Cantidad de aciertos / Totales
# Usando 0.5 de umbral
accuracy = accuracy_score(y_valid, pred)

# La precision es una metrica que se mide en:
# Cuantos Desastres predichos son de verdad desastres
# (Cantidad de 1s correctos / Cantidad de 1s) o lo mismo pero con los 0s
precision_1 = precision_score(y_valid, pred, pos_label=1)
precision_0 = precision_score(y_valid, pred, pos_label=0)

# Es el Analogo de la Precision:
# Recall: Cuantos de los que son de una clase predije como de esa clase?
# 1s Recuperados / Cantidad total de 1s

recall_1 = recall_score(y_valid, pred, pos_label=1)
recall_0 = recall_score(y_valid, pred, pos_label=0)

print("Accuracy:", accuracy)
print("Precision_0:", precision_0)
print("Precision_1:", precision_1)
print("Recall_0:", recall_0)
print("Recall_1:", recall_1)

F1: 0.7916277772491794
Accuracy: 0.7925420168067226
Precision_0: 0.8044052863436123
Precision_1: 0.7750325097529259
Recall_0: 0.8406998158379374
Recall_1: 0.7286063569682152


In [21]:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    'C': uniform(0.1, 10),
    'loss': ['hinge', 'squared_hinge'],
    'tol': [1e-3, 1e-4, 1e-5]
}

random_search = RandomizedSearchCV(
    svc,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=5,
    verbose=2,
    random_state=SEMILLA,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_svc = random_search.best_estimator_

pred = best_svc.predict(X_valid)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [22]:

from sklearn.metrics import (
    make_scorer,
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
    classification_report
)

# F1
from sklearn.metrics import f1_score
f1 = f1_score(y_valid, pred, average='weighted')
print(f'F1: {f1}')

# La accuracy es una metrica que se mide en:
#Cantidad de aciertos / Totales
# Usando 0.5 de umbral
accuracy = accuracy_score(y_valid, pred)

# La precision es una metrica que se mide en:
# Cuantos Desastres predichos son de verdad desastres
# (Cantidad de 1s correctos / Cantidad de 1s) o lo mismo pero con los 0s
precision_1 = precision_score(y_valid, pred, pos_label=1)
precision_0 = precision_score(y_valid, pred, pos_label=0)

# Es el Analogo de la Precision:
# Recall: Cuantos de los que son de una clase predije como de esa clase?
# 1s Recuperados / Cantidad total de 1s

recall_1 = recall_score(y_valid, pred, pos_label=1)
recall_0 = recall_score(y_valid, pred, pos_label=0)

print("Accuracy:", accuracy)
print("Precision_0:", precision_0)
print("Precision_1:", precision_1)
print("Recall_0:", recall_0)
print("Recall_1:", recall_1)

F1: 0.8047889863015913
Accuracy: 0.8067226890756303
Precision_0: 0.8037225042301185
Precision_1: 0.8116343490304709
Recall_0: 0.8747697974217311
Recall_1: 0.7163814180929096


In [None]:
X_test_final = tfidf.transform(df_test['text'])
df_submission = pd.DataFrame({
    'id': df_test['id'],
    'target': best_svc.predict(X_test_final)
})
df_submission.to_csv("submission_svc_randomized.csv", index=False)
