In [None]:
import spacy
import nltk
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns

from collections import Counter
from xgboost import XGBClassifier
from nltk.tokenize.casual import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/nonbreaking_prefixes.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Collecting es_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.2.5/es_core_news_sm-2.2.5.tar.gz (16.2 MB)
[K     |████████████████████████████████| 16.2 MB 2.1 MB/s 
Building wheels for co

In [None]:
nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

!python -m spacy download es

Reading the pickled train and test DataFrames (already preprocessed) and dropping the training rows labelled 'unknown'

In [None]:
# Preprocessed training split; rows whose label is 'unknown' are discarded
# because the classifier only distinguishes 'racist' from 'non-racist'.
df_train = pd.read_pickle('./data/df_train.pickle')
df_train = df_train.loc[df_train['final_label'].ne('unknown')]

# Preprocessed test split (loaded as-is, no label filtering).
df_test = pd.read_pickle('./data/df_test.pickle')

Function that fits the TF-IDF vectorizer, trains the XGBoost classifier and predicts on the held-out data

In [None]:
def xgb_tune(
    data,
    data_test,
    fraction,
    column,
    n_features,
    learning_rate,
    n_estimators,
    max_depth,
    min_child_weight,
    subsample,
    colsample_bytree,
    max_df,
    min_df,
    gamma,
    la,
):
  """Fit a TF-IDF vectorizer plus an XGBoost classifier and predict on held-out rows.

  Args:
    data: DataFrame with the text column `column` and a 'final_label' column.
    data_test: DataFrame with the same columns, always part of the held-out set.
    fraction: share of `data` used for training. Above 0.99 all of `data` is
      used for training and only `data_test` is held out; otherwise a seeded
      sample of `data` is used and the remaining rows are added to `data_test`.
    column: name of the text column fed to the TF-IDF vectorizer.
    n_features: `max_features` of the TF-IDF vectorizer.
    learning_rate, n_estimators, max_depth, min_child_weight, subsample,
      colsample_bytree, gamma: forwarded unchanged to XGBClassifier.
    max_df, min_df: document-frequency bounds of the TF-IDF vectorizer.
    la: forwarded to XGBClassifier as `reg_lambda`.

  Returns:
    Tuple `(fitted model, predictions for the held-out rows, encoded held-out
    labels, fitted TF-IDF vectorizer)`.
  """
  # 'non-racist' is encoded as 1 and 'racist' as 0.
  label_codes = {'non-racist': 1, 'racist': 0}

  # Decide which rows are used for fitting and which are held out.
  if fraction > 0.99:
    training_data, testing_data = data, data_test
  else:
    training_data = data.sample(frac=fraction, random_state=25)
    held_out = data.drop(training_data.index)
    testing_data = pd.concat([held_out, data_test])

  # The vocabulary and idf weights are learned from the training rows only.
  tfidf_vectorizer = TfidfVectorizer(
      max_df=max_df,
      min_df=min_df,
      max_features=n_features,
      use_idf=True,
  )
  tfidf_vectorizer.fit(training_data[column])

  # Dense feature matrices and integer-encoded targets.
  X_train = tfidf_vectorizer.transform(training_data[column]).toarray()
  X_test = tfidf_vectorizer.transform(testing_data[column]).toarray()
  y_train = training_data['final_label'].map(label_codes).to_numpy()
  y_test = testing_data['final_label'].map(label_codes).to_numpy()

  model = XGBClassifier(
      random_state=42,
      learning_rate=learning_rate,
      n_estimators=n_estimators,
      max_depth=max_depth,
      min_child_weight=min_child_weight,
      subsample=subsample,
      colsample_bytree=colsample_bytree,
      gamma=gamma,
      reg_lambda=la,
  )
  results = model.fit(X_train, y_train)
  preds = results.predict(X_test)
  return results, preds, y_test, tfidf_vectorizer

Training the XGBoost on the df_train data and reporting the F1-score on both df_test and df_train

In [None]:
# Fit on the whole of df_train (fraction=1.0, so df_test alone is held out).
results, preds, y_test, tfidf_vectorizer = xgb_tune(
    data=df_train,
    data_test=df_test,
    fraction=1.0,
    column='stemm_str',
    n_features=5000,
    learning_rate=0.2,
    n_estimators=100,
    max_depth=10,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    max_df=0.9,
    min_df=2,
    gamma=0,
    la=1,
)

# Same label encoding as used inside xgb_tune.
label_codes = {'non-racist': 1, 'racist': 0}

# NOTE(review): f1_score is called with its default positive label (1), which
# in this encoding is 'non-racist' - confirm that is the class of interest.

# Score on the test split.
X_test = tfidf_vectorizer.transform(df_test['stemm_str']).toarray()
y_test = df_test['final_label'].map(label_codes).to_numpy()
preds_test = results.predict(X_test)
test_f1 = metrics.f1_score(y_test, preds_test)
print(f'f1-score of the test: {test_f1}')

# Score on the training split, to compare against the test score.
X_train = tfidf_vectorizer.transform(df_train['stemm_str']).toarray()
y_train = df_train['final_label'].map(label_codes).to_numpy()
preds_sample = results.predict(X_train)
train_f1 = metrics.f1_score(y_train, preds_sample)
print(f'f1-score of the train: {train_f1}')

f1-score of the test: 0.8999999999999999
f1-score of the train: 0.9355563881603597


Reading the data from the evaluation_final_original.csv file and applying the preprocessing steps needed before it can be fed to the XGBoost

In [None]:
# Load the '|'-separated evaluation file and keep only the tweet text and the
# label column.
df_evaluation = (
    pd.read_csv('./data/evaluation_final_original.csv', sep='|', header=0)
    .loc[:, ['message', 'label']]
)

def normalize(text, nlp):
  """Return the lowercase lemmas of the content words in `text`.

  Args:
    text: raw string to process.
    nlp: callable that turns `text` into an iterable of tokens exposing
      `lemma_`, `is_punct` and `is_stop` (here, a loaded spaCy pipeline).

  Returns:
    List of lowercased lemmas, in document order, for tokens that are neither
    punctuation nor stop words and whose lemma is purely alphabetic and longer
    than three characters.
  """
  lexical_tokens = []
  for token in nlp(text):
    # Skip punctuation and stop words outright.
    if token.is_punct or token.is_stop:
      continue
    lemma = token.lemma_
    # Keep only alphabetic lemmas with more than three characters.
    if len(lemma) > 3 and lemma.isalpha():
      lexical_tokens.append(lemma.lower())
  return lexical_tokens

def get_wordnet_pos(tag):
  """Map a POS tag string to the matching WordNet part-of-speech constant.

  Only the first character of `tag` is inspected: 'J' -> adjective,
  'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.

  Args:
    tag: POS tag string (in this notebook, the tags returned by
      nltk.tag.pos_tag).

  Returns:
    One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV.
  """
  first_letter_to_pos = {
      'J': wordnet.ADJ,
      'V': wordnet.VERB,
      'N': wordnet.NOUN,
      'R': wordnet.ADV,
  }
  # Unknown or empty tags default to NOUN.
  return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

# --- Tokenise and lowercase ---------------------------------------------------
t = TweetTokenizer()
df_evaluation['tokenized'] = df_evaluation['message'].apply(t.tokenize)
df_evaluation['lower'] = df_evaluation['tokenized'].apply(
    lambda tokens: [tok.lower() for tok in tokens]
)

# --- Drop punctuation and Spanish stop words ----------------------------------
# `punc` is a plain string, so `tok not in punc` is a substring test: any token
# that appears as a contiguous piece of this string (e.g. '...') is removed.
punc = string.punctuation+'...¿¡..“'
df_evaluation['no_punc'] = df_evaluation['lower'].apply(
    lambda tokens: [tok for tok in tokens if tok not in punc]
)
stop_words = set(stopwords.words('spanish'))
df_evaluation['stopwords_removed'] = df_evaluation['no_punc'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)

# --- POS tagging and WordNet lemmatisation ------------------------------------
# NOTE(review): pos_tag is called with its default language although the text is
# treated as Spanish elsewhere in this cell - confirm this mirrors how the
# training data was prepared before changing it.
df_evaluation['pos_tags'] = df_evaluation['stopwords_removed'].apply(nltk.tag.pos_tag)
df_evaluation['wordnet_pos'] = df_evaluation['pos_tags'].apply(
    lambda tagged: [(tok, get_wordnet_pos(pos)) for (tok, pos) in tagged]
)
wnl = WordNetLemmatizer()
df_evaluation['lemmatized'] = df_evaluation['wordnet_pos'].apply(
    lambda tagged: [wnl.lemmatize(tok, pos) for tok, pos in tagged]
)
df_evaluation['lemma_str'] = [
    ' '.join(str(tok) for tok in tokens) for tokens in df_evaluation['lemmatized']
]

# --- Spanish Snowball stemming ------------------------------------------------
# 'stemm_str' is the column later passed to the TF-IDF vectorizer.
spanish_stemmer = SnowballStemmer('spanish')
df_evaluation['stemm'] = df_evaluation['wordnet_pos'].apply(
    lambda tagged: [spanish_stemmer.stem(tok) for tok, _pos in tagged]
)
df_evaluation['stemm_str'] = [
    ' '.join(str(tok) for tok in tokens) for tokens in df_evaluation['stemm']
]

# --- spaCy lemmatisation of the raw message -----------------------------------
nlp = spacy.load('es')
df_evaluation['lemma_spacy'] = df_evaluation['message'].apply(
    lambda message: normalize(message, nlp)
)
df_evaluation['lemma_spacy_str'] = [
    ' '.join(str(tok) for tok in tokens) for tokens in df_evaluation['lemma_spacy']
]

# --- Simple length features ---------------------------------------------------
# Character length of the joined lemma string.
tweet_len = [len(lemma_text) for lemma_text in df_evaluation['lemma_str']]
df_evaluation['tweet_len'] = tweet_len
# Counts whitespace-separated chunks of the list's string representation, so an
# empty token list yields 1 rather than 0.
# NOTE(review): kept as-is; confirm against the training preprocessing.
df_evaluation['word_count'] = df_evaluation['lemmatized'].apply(
    lambda tokens: len(str(tokens).split())
)

Generating the TF-IDF features for the final evaluation data using the fitted vectorizer, and predicting its labels

In [None]:
# Vectorise the stemmed evaluation tweets with the already-fitted vectorizer,
# densify the matrix and predict with the trained model.
# Note: `preds` is rebound here to the evaluation-set predictions.
eval_stems = df_evaluation['stemm_str']
tfidf_evalutation = tfidf_vectorizer.transform(eval_stems)
X_eval = tfidf_evalutation.toarray()
preds = results.predict(X_eval)

Saving the csv file with the results

In [None]:
# Re-read the original evaluation file so every original column is preserved,
# then overwrite 'label' with the decoded predictions.
code_to_label = {1: 'non-racist', 0: 'racist'}
df_evaluation_print = pd.read_csv('./data/evaluation_final_original.csv', sep='|', header=0)
df_evaluation_print['label'] = pd.Series(
    preds, index=df_evaluation_print.index
).map(code_to_label)

# NOTE(review): written with to_csv defaults (comma separator, index column
# included) although the input file is '|'-separated - confirm this is the
# expected output format.
df_evaluation_print.to_csv('./data/evaluation_final_xgb1.csv')

Saving the model and the TF-IDF vectorizer to be used in the ensemble

In [None]:
import pickle

# Persist the trained classifier and the fitted TF-IDF vectorizer.
# The files are opened with context managers so each handle is flushed and
# closed as soon as the dump finishes, instead of being left open.
with open('./models/xgb_reg_best.pkl', "wb") as model_file:
    pickle.dump(results, model_file)

with open('./models/xgb_tfidf_vec_best.pkl', "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)