## Importar las librerias necesarias

In [None]:
import pandas as pd
import os, re, html, csv
import numpy as np

In [None]:
!pip install --upgrade xlrd 



## Importar los datos

In [None]:
df =pd.read_excel('/content/delitos_odio.xls')  

## Creamos una columna que aune los titulos, sub titulos y noticias

In [None]:
df['all_data'] =df['titulos'] + " " + df['sub_titulos'] + " " + df['noticias']

In [None]:
df['all_data'] =df['all_data'].str.lower()

In [None]:
df.head()

Unnamed: 0,titulos,sub_titulos,noticias,delito_odio,all_data
0,12 años de cárcel: el castigo que piden por la...,La acusación le atribuye la autoría de un deli...,"Bilal M., detenido por agredir y lesionar a un...",1,12 años de cárcel: el castigo que piden por la...
1,Anelka celebró un gol con un gesto considerado...,"Realizó la ""quenelle"", una expresión del humor...",El futbolista francés Nicolas Anelka avivó hoy...,1,anelka celebró un gol con un gesto considerado...
2,Beatriz de Vicente explica por qué las denunci...,La broma de Juan del Val sobre cómo algunos pa...,Juan del Val ha sido denunciado ante el Defens...,1,beatriz de vicente explica por qué las denunci...
3,La Justicia confirma el archivo de la querella...,La Audiencia no ve delito de injurias y entien...,La Audiencia Provincial de Madrid ha rechazado...,1,la justicia confirma el archivo de la querella...
4,La Fiscalía denuncia a la falangista Isabel Pe...,La joven ya estaba siendo investigada por sus ...,La Fiscalía de Madrid ha interpuesto denuncia ...,1,la fiscalía denuncia a la falangista isabel pe...


## Creamos la variable objetivo

In [None]:
df =df.replace({1:'odio',0:'no_odio'})
df['delito_odio'] = df['delito_odio'].astype("category")
df_y = df['delito_odio'].cat.codes

In [None]:
labels = df.delito_odio.cat.categories.tolist()
labels_codes = {}
for i, label in enumerate(labels):
    labels_codes[label] = i

## Creamos una funcion para evaluar los modelos aplicando cross validation

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_validate

precision_neg = metrics.make_scorer(
    metrics.precision_score,
    average='micro',
    labels=[labels_codes['no_odio']])
precision_pos = metrics.make_scorer(
    metrics.precision_score,
    average='micro',
    labels=[labels_codes['odio']])
recall_neg = metrics.make_scorer(
    metrics.recall_score,
    average='micro',
    labels=[labels_codes['no_odio']])
recall_pos = metrics.make_scorer(
    metrics.recall_score,
    average='micro',
    labels=[labels_codes['odio']])


def evaluate_model(model, features, labels, cv=10, fit_params=None):
    scores = cross_validate(
        model,
        features,
        labels,
        cv=cv,
        fit_params=fit_params,
        scoring={
            'recall_macro': 'recall_macro',
            'accuracy': 'accuracy',
            'recall_neg': recall_neg,
            'recall_pos': recall_pos,
            'precision_neg': precision_neg,
            'precision_pos':precision_pos
        },
        n_jobs=-1,
    )

    results = pd.DataFrame(scores).drop(['fit_time', 'score_time'], axis=1)
    results.columns = pd.MultiIndex.from_tuples([c.split('_', maxsplit=1) for c in results.columns])    
    summary = results.describe()
    results = results.append(summary.loc[['mean', 'std']])
    
    def custom_style(row):
        color = 'white'
        if row.name == 'mean':
            color = 'yellow'
        return ['background-color: %s' % color]*len(row.values)
    results = results[sorted(results.columns, key=lambda x: x[0], reverse=True)]
    results = results.style.apply(custom_style, axis=1)
        
    return results

## Aplicamos tecnicas de limpieza de datos (NPL)

In [None]:
!python3 -m spacy download es

Collecting es_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.2.5/es_core_news_sm-2.2.5.tar.gz (16.2 MB)
[K     |████████████████████████████████| 16.2 MB 4.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/es_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/es
You can now load the model via spacy.load('es')


In [None]:
import spacy
import nltk
import re
nltk.download('stopwords')
nlp = spacy.load('es')


def delete_tildes(s):
    replacements = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
         ("ñ", "n"),
    )
    for a, b in replacements:
        s = s.replace(a, b).replace(a.upper(), b.upper())
    return s.strip()
def remove_stopwords(text):
    text = [word.strip() for word in text if word not in stopword_es]
    return text

sentences = df['all_data'].apply(lambda sen: delete_tildes(sen))
sentences = sentences.apply(lambda L: re.sub('[^a-zA-Z]+', ' ', L).strip())
stopword_es = nltk.corpus.stopwords.words('spanish')
noticia_tokenizada =  sentences.apply(lambda msg: [token.text.strip() for token in nlp(msg)])
noticia_tokenizada = noticia_tokenizada.apply(lambda x: remove_stopwords(x))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
np.save('noticias_token', noticia_tokenizada)

## Creamos la clase BagOfWords que nos permitira aplicar modelos a nuestro conjunto de datos de manera eficiente

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator
import collections
from collections import Counter
from sklearn.base import TransformerMixin

class BagOfWords(BaseEstimator, TransformerMixin):
    def __init__(self, min_frequency=1, clip_counts=False, use_tfidf=False):
        self.min_frequency = min_frequency 
        self.clip_counts = clip_counts 
        self.use_tfidf = use_tfidf
        
        
    def fit(self, X, y=None):
        
        self.keep_columns = None
        self.vectorizer = DictVectorizer(dtype=np.int)
        
        self.tfidf_transformer = None
        if self.use_tfidf:
            self.tfidf_transformer = TfidfTransformer()
        
        if self.clip_counts:
            bags_of_words = X.apply(lambda tokens: Counter(set(tokens)))
        else:
            bags_of_words = X.apply(lambda tokens: Counter(tokens))
        
        X_vectors = self.vectorizer.fit_transform(bags_of_words)
        
        self.keep_columns = np.array(X_vectors.sum(axis=0) >= self.min_frequency).squeeze()

        if self.use_tfidf:
            self.tfidf_transformer.fit(X_vectors[:, self.keep_columns])
        
        return self
    
    def transform(self, X):
        
        if self.clip_counts:
            bags_of_words = X.apply(lambda tokens: Counter(set(tokens)))
        else:
            bags_of_words = X.apply(lambda tokens: Counter(tokens))
        
        X_vectors = self.vectorizer.transform(bags_of_words)
        X_vectors = X_vectors[:, self.keep_columns]
        if self.use_tfidf:
            X_vectors = self.tfidf_transformer.transform(X_vectors)
        
        return X_vectors


print("Examples of bags of words without normalization (raw counts per document):")
print(BagOfWords(min_frequency=2, use_tfidf=False).fit_transform(noticia_tokenizada[:5]).toarray(), end='\n\n\n')

print("The same examples as above but with counts clipped to 1:")
print(BagOfWords(min_frequency=2, clip_counts=True).fit_transform(noticia_tokenizada[:5]).toarray(), end='\n\n\n')

print("The same examples as above but with TF-IDF normalization:")
print(np.around(BagOfWords(min_frequency=2, use_tfidf=True).fit_transform(noticia_tokenizada[:5]).toarray(), decimals=1))

Examples of bags of words without normalization (raw counts per document):
[[ 0  2  2 ...  0  0  0]
 [ 0  0  0 ...  0  0  4]
 [ 0  2  0 ...  0  0  0]
 [ 2  0  0 ...  2 14  0]
 [ 0  0  0 ...  0  0  0]]


The same examples as above but with counts clipped to 1:
[[1 1 1 1 0 1 0 1 0 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 0 1 1 1 0 0 1 1 1 1
  1 0 1 1 1 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0
  0 0 1 1 1 1 1 1 0 1 1 1 0 0 1 0 1 1 1 0 0 1 1 1 0]
 [0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 1 1 0 0 1 1 1 0 0 1 1 0 0 0 0 0
  1 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0]
 [1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 0
  0 0 1 0 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0]
 [0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 1 0 0 1 0 1 0
  1 1 1 1 1 1 1 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1 1 1 0 1 1 1 1 1

## Cross validation de un LinearSVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
evaluate_model(
    LinearSVC(random_state=0, tol=1e-5),
    BagOfWords(min_frequency=2).fit_transform(noticia_tokenizada),
    df_y)

Unnamed: 0_level_0,test,test,test,test,test,test
Unnamed: 0_level_1,recall_macro,accuracy,recall_neg,recall_pos,precision_neg,precision_pos
0,0.98936,0.989362,0.989474,0.989247,0.989474,0.989247
1,0.968195,0.968085,0.957895,0.978495,0.978495,0.957895
2,0.96248,0.962567,0.978723,0.946237,0.948454,0.977778
3,0.994624,0.994652,1.0,0.989247,0.989474,1.0
4,0.978552,0.97861,0.989362,0.967742,0.96875,0.989011
5,0.962594,0.962567,0.957447,0.967742,0.967742,0.957447
6,0.973404,0.973262,0.946809,1.0,1.0,0.94898
7,0.925189,0.925134,0.914894,0.935484,0.934783,0.915789
8,0.941203,0.941176,0.93617,0.946237,0.946237,0.93617
9,0.951259,0.951872,0.989474,0.913043,0.921569,0.988235


## Cross validation de una regresión logistica

In [None]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight='balanced')

evaluate_model(
    log_reg,
    BagOfWords(min_frequency=2).fit_transform(noticia_tokenizada),
    df_y)

Unnamed: 0_level_0,test,test,test,test,test,test
Unnamed: 0_level_1,recall_macro,accuracy,recall_neg,recall_pos,precision_neg,precision_pos
0,0.983984,0.984043,0.989474,0.978495,0.979167,0.98913
1,0.973458,0.973404,0.968421,0.978495,0.978723,0.968085
2,0.978609,0.97861,0.978723,0.978495,0.978723,0.978495
3,0.989247,0.989305,1.0,0.978495,0.979167,1.0
4,0.983871,0.983957,1.0,0.967742,0.969072,1.0
5,0.962594,0.962567,0.957447,0.967742,0.967742,0.957447
6,0.968085,0.967914,0.93617,1.0,1.0,0.939394
7,0.946465,0.946524,0.957447,0.935484,0.9375,0.956044
8,0.935827,0.935829,0.93617,0.935484,0.93617,0.935484
9,0.951259,0.951872,0.989474,0.913043,0.921569,0.988235


## Cross validation de un Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

evaluate_model(
    RandomForestClassifier(max_depth=10, random_state=0),
    BagOfWords(min_frequency=2).fit_transform(noticia_tokenizada),
    df_y)

Unnamed: 0_level_0,test,test,test,test,test,test
Unnamed: 0_level_1,recall_macro,accuracy,recall_neg,recall_pos,precision_neg,precision_pos
0,0.962705,0.962766,0.968421,0.956989,0.958333,0.967391
1,0.984211,0.984043,0.968421,1.0,1.0,0.96875
2,0.973118,0.973262,1.0,0.946237,0.949495,1.0
3,0.983871,0.983957,1.0,0.967742,0.969072,1.0
4,0.946237,0.946524,1.0,0.892473,0.903846,1.0
5,0.957104,0.957219,0.978723,0.935484,0.938776,0.977528
6,0.909117,0.909091,0.904255,0.913978,0.913978,0.904255
7,0.866221,0.86631,0.882979,0.849462,0.85567,0.877778
8,0.941203,0.941176,0.93617,0.946237,0.946237,0.93617
9,0.978432,0.97861,0.989474,0.967391,0.969072,0.988889


## Cross Validation de una red neuronal

In [None]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(activation='logistic', alpha=1e-5, hidden_layer_sizes=(50, 50), random_state=1)
evaluate_model(
    clf,
    BagOfWords(min_frequency=2).fit_transform(noticia_tokenizada),
    df_y)

Unnamed: 0_level_0,test,test,test,test,test,test
Unnamed: 0_level_1,recall_macro,accuracy,recall_neg,recall_pos,precision_neg,precision_pos
0,0.994624,0.994681,1.0,0.989247,0.989583,1.0
1,0.963045,0.962766,0.936842,0.989247,0.988889,0.938776
2,0.967914,0.967914,0.968085,0.967742,0.968085,0.967742
3,0.994624,0.994652,1.0,0.989247,0.989474,1.0
4,0.989305,0.989305,0.989362,0.989247,0.989362,0.989247
5,0.962594,0.962567,0.957447,0.967742,0.967742,0.957447
6,0.941375,0.941176,0.904255,0.978495,0.977011,0.91
7,0.941261,0.941176,0.925532,0.956989,0.956044,0.927083
8,0.935884,0.935829,0.925532,0.946237,0.945652,0.926316
9,0.973169,0.973262,0.978947,0.967391,0.96875,0.978022


## Creamos el modelo para importarlo de la red neuronal

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

final_model = Pipeline([
    ('cls',MLPClassifier(activation='logistic', alpha=1e-5, hidden_layer_sizes=(50, 50), random_state=1))
])

final_model.fit(BagOfWords(min_frequency=2).fit_transform(noticia_tokenizada).toarray(), df_y)

Pipeline(steps=[('cls',
                 MLPClassifier(activation='logistic', alpha=1e-05,
                               hidden_layer_sizes=(50, 50), random_state=1))])

In [None]:
import joblib
joblib.dump(final_model, 'modelo_odio.pkl', compress=9)

['modelo_odio.pkl']

In [None]:
from collections import Counter
known_words = [element for list_ in noticia_tokenizada.values for element in list_]
known_dup = []
for key,value in Counter(known_words).items():
  if value >1:
    known_dup.append(key)


In [None]:
import pickle
open_file = open("word_list.pkl", "wb")
pickle.dump(known_dup, open_file)
open_file.close()
