# One-Hot-encoding

## Importación de librerias

In [18]:
import pandas as pd
import numpy as np
import re as re
import string
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

## Carga de datos

In [19]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Limpieza de datos

### Definicion de funciones auxiliares

In [20]:
#Eliminar numeros de un texto
def eliminar_numeros(text):
    return re.sub("\d+", "",text)

#Eliminar puntuacion
def eliminar_puntuacion(text):
    return re.sub(r'[^\w\s]','',text)

#Pasar letras a minusculas
def minusculas(text):
    return text.lower()

#Eliminar caracteres especiales
def eliminar_caracteres(text):
    return re.sub('[^a-zA-Z0-9 \n\.]', '',text)

#Eliminar urls
def eliminar_url(text):
    url_reg = re.compile(r'https?://\S+|www\.\S+')
    return url_reg.sub(r'',text)

### Limpieza

In [22]:
for data in [test,train]:
    data['text'] = data['text'].apply(lambda x: eliminar_puntuacion(x))
    data['text'] = data['text'].apply(lambda x: minusculas(x))
    data['text'] = data['text'].apply(lambda x: eliminar_numeros(x))
    data['text'] = data['text'].apply(lambda x: eliminar_caracteres(x))
    data['text'] = data['text'].apply(lambda x: remove_stopwords(x))
    data['text'] = data['text'].apply(lambda x: eliminar_url(x))

## Particionando los datos

In [23]:
X_train,X_test,y_train,y_test= train_test_split(train.text, train.target, test_size=0.32,random_state=20)

## Función de tokenización

In [24]:
def convertir_en_tokens(texto):
    tokens = []
    for palabra in texto.split():
        if not palabra in tokens:
            tokens.append(palabra)
    return tokens

## One-Hot-encoding

In [25]:
vectorizer = CountVectorizer( tokenizer = convertir_en_tokens, binary = True)
X_train = vectorizer.fit_transform(X_train)

In [26]:
X_test = vectorizer.transform(X_test)

In [27]:
clasificador = LinearSVC()
clasificador.fit(X_train,y_train)

LinearSVC()

## Pruebas

### Predicciones sobre el set de training

In [28]:
predicciones = clasificador.predict(X_train)

In [29]:
accuracy = accuracy_score (y_train,predicciones)
print (f"Accuracy: {accuracy:.4%}")

Accuracy: 99.5750%


### Predicciones sobre el set de test

In [30]:
predicciones = clasificador.predict(X_test)

In [31]:
accuracy = accuracy_score (y_test,predicciones)
print (f"Accuracy: {accuracy:.4%}")

Accuracy: 78.1699%


## Preparando el submit de Kaggle

### Realizando Predicciones

In [34]:
submit = pd.DataFrame(test['id'],columns=['id'])
texts = vectorizer.transform(test['text'])
predicciones = clasificador.predict(texts)
submit['target'] = predicciones
#submit.to_csv('SUBMITS/submission-onehot.csv',index=False)