In [None]:
!python -m pip install --upgrade spacy
!python -m spacy download es_core_news_sm

Collecting spacy
  Downloading spacy-3.2.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 5.4 MB/s 
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 44.8 MB/s 
[?25hCollecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (653 kB)
[K     |████████████████████████████████| 653 kB 36.5 MB/s 
Collecting pathy>=0.3.5
  Downloading pathy-0.6.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.3 MB/s 
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 23.4 MB/s 
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.2-py3-none-any.whl (7.2 kB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Downloading spacy_legacy-3.

In [None]:
import string

import nltk
import spacy
import pandas as pd

from pathlib import Path
from collections import Counter

from nltk.tokenize.casual import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from google.colab import drive

# NLTK resources fetched up front so the preprocessing cells can run on a
# fresh kernel.
NLTK_PACKAGES = (
    'perluniprops',
    'nonbreaking_prefixes',
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
)
for nltk_package in NLTK_PACKAGES:
    nltk.download(nltk_package)

# Mount Google Drive and anchor the data directory to the mount point, so the
# path stays valid even if the working directory is changed later on.
MOUNT_POINT = '/content/drive/'
drive.mount(MOUNT_POINT)
data_path = Path(MOUNT_POINT, 'MyDrive/Datathon2022/data/')

[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping corpora/nonbreaking_prefixes.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Mounted at /content/drive/


In [None]:
def prepare_df(data):
    """Preprocess a tweet DataFrame into tokenised, lemmatised and stemmed columns.

    When ``data`` has a ``label`` column of object dtype, rows are grouped by
    ``message`` and the labels collected for each message are collapsed into a
    single ``final_label`` plus a ``support_label`` ('strong' when every label
    agrees, 'mild' otherwise); only ``message``, ``final_label`` and
    ``support_label`` are kept from the input. Otherwise a copy of ``data`` is
    used unchanged. In both cases the caller's frame is never modified.

    Columns added to the result:
        tokenized, lower, no_punc, stopwords_removed, pos_tags, wordnet_pos,
        lemmatized, lemma_str, stemm, stemm_str, lemma_spacy, lemma_spacy_str,
        tweet_len (characters in ``lemma_str``) and word_count (number of
        tokens in ``lemmatized``).

    Args:
        data: DataFrame with a ``message`` column holding the tweet text and,
            optionally, a ``label`` column.

    Returns:
        A new DataFrame with the columns described above.

    Raises:
        ValueError: if the labels of a message form a combination that the
            resolution rules do not cover.
    """

    def normalize(text, nlp):
        """Return lower-cased spaCy lemmas of ``text`` that are alphabetic and longer than 3 characters."""
        lemmas = (
            token.lemma_
            for token in nlp(text)
            if not (token.is_punct or token.is_stop)
        )
        return [lemma.lower() for lemma in lemmas if len(lemma) > 3 and lemma.isalpha()]

    def get_wordnet_pos(tag):
        """Map a POS tag to a WordNet POS constant by its first letter, defaulting to NOUN."""
        first_letter_to_pos = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

    def resolve_label(labels):
        """Collapse the labels given to one message into ``(final_label, support_label)``."""
        counts = Counter(labels)
        if len(counts) == 1:
            # Every annotation agrees.
            return labels[0], 'strong'
        racist, non_racist = counts['racist'], counts['non-racist']
        if racist and non_racist:
            # Majority vote between the two classes; ties go to 'racist'.
            return ('racist' if racist >= non_racist else 'non-racist'), 'mild'
        if 'unknown' in counts:
            if racist:
                return 'racist', 'mild'
            if non_racist:
                return 'non-racist', 'mild'
        # Fail loudly here instead of producing label lists that are shorter
        # than the frame and only blow up later on column assignment.
        raise ValueError(
            f"Cannot resolve label combination {dict(counts)!r}; expected a mix "
            "of 'racist', 'non-racist' and 'unknown'."
        )

    if 'label' in data.columns and data['label'].dtype == 'O':
        # Collect the labels of each distinct message as a list (no round-trip
        # through a comma-joined string, which would split labels containing ',').
        grouped = data.groupby('message')['label'].apply(list).reset_index()
        resolved = [resolve_label(labels) for labels in grouped['label']]
        grouped['final_label'] = [final for final, _ in resolved]
        grouped['support_label'] = [support for _, support in resolved]
        df_clean = grouped[['message', 'final_label', 'support_label']].copy()
    else:
        # Work on a copy so the columns added below do not leak into `data`.
        df_clean = data.copy()

    # --- NLTK pipeline: tokenise -> lower-case -> strip punctuation/stop words ---
    tokenizer = TweetTokenizer()
    df_clean['tokenized'] = df_clean['message'].apply(tokenizer.tokenize)
    df_clean['lower'] = df_clean['tokenized'].apply(
        lambda tokens: [token.lower() for token in tokens]
    )
    # NOTE: `in` on a str is a substring test, so a token is dropped when it
    # occurs as a contiguous run inside this string (e.g. '...', '¿').
    punc = string.punctuation + '...¿¡..“'
    df_clean['no_punc'] = df_clean['lower'].apply(
        lambda tokens: [token for token in tokens if token not in punc]
    )
    stop_words = set(stopwords.words('spanish'))
    df_clean['stopwords_removed'] = df_clean['no_punc'].apply(
        lambda tokens: [token for token in tokens if token not in stop_words]
    )

    # NOTE(review): nltk.tag.pos_tag and WordNetLemmatizer are used with their
    # defaults here while the stop words and stemmer are Spanish — confirm the
    # tagger/lemmatizer language is what is intended for this corpus.
    df_clean['pos_tags'] = df_clean['stopwords_removed'].apply(nltk.tag.pos_tag)
    df_clean['wordnet_pos'] = df_clean['pos_tags'].apply(
        lambda pairs: [(word, get_wordnet_pos(pos_tag)) for word, pos_tag in pairs]
    )
    lemmatizer = WordNetLemmatizer()
    df_clean['lemmatized'] = df_clean['wordnet_pos'].apply(
        lambda pairs: [lemmatizer.lemmatize(word, pos) for word, pos in pairs]
    )
    df_clean['lemma_str'] = [' '.join(map(str, tokens)) for tokens in df_clean['lemmatized']]

    spanish_stemmer = SnowballStemmer('spanish')
    df_clean['stemm'] = df_clean['wordnet_pos'].apply(
        lambda pairs: [spanish_stemmer.stem(word) for word, _ in pairs]
    )
    df_clean['stemm_str'] = [' '.join(map(str, tokens)) for tokens in df_clean['stemm']]

    # --- spaCy pipeline: lemmatise the raw message text ---
    nlp = spacy.load('es_core_news_sm')
    df_clean['lemma_spacy'] = df_clean['message'].apply(lambda text: normalize(text, nlp))
    df_clean['lemma_spacy_str'] = [' '.join(map(str, tokens)) for tokens in df_clean['lemma_spacy']]

    # --- Length features ---
    df_clean['tweet_len'] = [len(lemma_text) for lemma_text in df_clean['lemma_str']]
    # Count tokens directly: len(str(tokens).split()) reported 1 for an empty
    # list ('[]') and over-counted tokens that contain whitespace.
    df_clean['word_count'] = [len(tokens) for tokens in df_clean['lemmatized']]

    return df_clean

In [None]:
# Training data: read the pipe-separated CSV, run the preprocessing pipeline
# and store the result as a pickle next to the source file.
df_train = prepare_df(
    pd.read_csv(data_path / 'labels_racism.csv', sep='|', header=0)
)
df_train.to_pickle(data_path / 'df_train.pickle')

In [None]:
# Evaluation sample: read the pipe-separated CSV, run the preprocessing
# pipeline and store the result as a pickle next to the source file.
df_test = prepare_df(
    pd.read_csv(data_path / 'evaluation_sample.csv', sep='|', header=0)
)
df_test.to_pickle(data_path / 'df_test.pickle')

In [None]:
# paper_input_tweets.csv names its columns `text`/`target`; rename them to the
# `message`/`label` names that prepare_df reads.
paper_column_names = {'text': 'message', 'target': 'label'}
df_paper = pd.read_csv(
    data_path / 'paper_input_tweets.csv', sep='|', header=0
).rename(columns=paper_column_names)

In [None]:
# Drop the 'Unnamed: 0' column (presumably a saved index — verify against the
# CSV), then preprocess and store the result as a pickle.
df_paper = prepare_df(df_paper.drop(columns='Unnamed: 0'))
df_paper.to_pickle(data_path / 'df_paper.pickle')

In [None]:
# Public evaluation set: read the pipe-separated CSV, run the preprocessing
# pipeline and store the result as a pickle next to the source file.
df_public = prepare_df(
    pd.read_csv(data_path / 'evaluation_public.csv', sep='|', header=0)
)
df_public.to_pickle(data_path / 'df_public.pickle')