In [None]:
!pip install transformers -q

[K     |████████████████████████████████| 1.9MB 5.7MB/s 
[K     |████████████████████████████████| 3.2MB 63.2MB/s 
[K     |████████████████████████████████| 890kB 34.1MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


# Imports

In [None]:
import os, sys, gc, re
import pandas as pd
import numpy as np

In [None]:
import time
import logging
from tqdm.notebook import tqdm

In [None]:
from transformers import AutoTokenizer, AutoConfig

# Read Data

In [None]:
# Directory holding the input CSVs; the normalized CSVs are written here too.
# File names are appended by plain string concatenation, so keep the trailing slash.
path = 'data/'

In [None]:
# Load the labelled training split and the unlabelled test split.
train = pd.read_csv(f'{path}Train.csv')
test = pd.read_csv(f'{path}Test.csv')

In [None]:
# First look at the training data as loaded.
train.head()

Unnamed: 0,ID,text,label
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1
3,U0TTYY8,ak slouma,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1


In [None]:
# First look at the test data as loaded.
test.head()

Unnamed: 0,ID,text
0,2DDHQW9,barcha aaindou fiha hak w barcha teflim kadhalik
1,5HY6UEY,ye gernabou ye 9a7ba
2,ATNVUJX,saber w barra rabbi m3ak 5ouya
3,Q9XYVOQ,cha3ébbb ta7aaaaannnnnnnnnnn tfouuhh
4,TOAHLRH,rabi y5alihoulek w yfar7ek bih w inchallah itc...


# Processing

In [None]:
# Identifier of the pretrained checkpoint; passed to `from_pretrained` below
# to load the matching config and tokenizer.
model_name = 'bashar-talafha/multi-dialect-bert-base-arabic'

In [None]:
# Load the checkpoint's configuration, then its tokenizer (the config is handed
# to the tokenizer loader). `tok` is the tokenizer used by every later cell
# that counts tokens and by `shorten_text`.
conf = AutoConfig.from_pretrained(model_name)
tok = AutoTokenizer.from_pretrained(model_name, config=conf)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=334032.0, style=ProgressStyle(descripti…




In [None]:
# Number of token ids `tok.encode` produces for each raw training text.
train['length'] = train['text'].map(lambda text: len(tok.encode(text)))

In [None]:
# Number of token ids `tok.encode` produces for each raw test text.
test['length'] = test['text'].map(lambda text: len(tok.encode(text)))

In [None]:
# Distribution of raw token counts in train, with extra upper percentiles.
train['length'].describe(percentiles=[0.25, 0.50, 0.75, 0.80, 0.85, 0.90, 0.95])

count    70000.000000
mean        30.431243
std         41.819379
min          4.000000
25%         12.000000
50%         20.000000
75%         34.000000
80%         40.000000
85%         48.000000
90%         61.000000
95%         88.000000
max       4926.000000
Name: length, dtype: float64

In [None]:
# Distribution of raw token counts in test, with extra upper percentiles.
test['length'].describe(percentiles=[0.25, 0.50, 0.75, 0.80, 0.85, 0.90, 0.95])

count    30000.000000
mean        31.624667
std         40.656748
min          4.000000
25%         13.000000
50%         20.000000
75%         35.000000
80%         41.000000
85%         50.000000
90%         64.000000
95%         93.000000
max       2111.000000
Name: length, dtype: float64

In [None]:
def preprocess_token(s):
    """Collapse exaggerated repetitions of short character sequences in ``s``.

    Three substitutions are applied in order, each working on the result of
    the previous one:

    1. a single word character repeated three or more times is cut to two;
    2. a two-character unit repeated three or more times is cut to three;
    3. a three-character unit repeated three or more times is cut to two.

    "Word character" is the regex word class, so digits and underscores
    are collapsed the same way as letters.

    Parameters
    ----------
    s : str
        Text to normalize.

    Returns
    -------
    str
        The text with long repetitions shortened; other text is unchanged.
    """
    # (pattern, replacement) pairs; order matters because each rule sees the
    # output of the one before it.
    # NOTE(review): the two-character rule keeps three repetitions while the
    # other two rules keep two -- confirm this asymmetry is intended.
    rules = (
        (r"(\w)(\1){2,}", r"\1\1"),
        (r"(?:(\w)(\w))(\1\2){2,}", r"\1\2\1\2\1\2"),
        (r"(?:(\w)(\w)(\w))(\1\2\3){2,}", r"\1\2\3\1\2\3"),
    )
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)
    return s

In [None]:
# Repetition-normalized copy of each training text.
train['text_norm'] = train['text'].map(preprocess_token)

In [None]:
# Repetition-normalized copy of each test text.
test['text_norm'] = test['text'].map(preprocess_token)

In [None]:
# Compare `text` with the new `text_norm` column on the first training rows.
train.head()

Unnamed: 0,ID,text,label,length,text_norm
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1,44,3sbaa lek ou le seim riahi ou 3sbaa le ca
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1,24,cha3eb fey9elkoum menghir ta7ayoul ou kressi
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1,196,bereau degage nathef ya slim walahi ya7chiw fi...
3,U0TTYY8,ak slouma,1,6,ak slouma
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1,63,entom titmanou lina a7na 3iid moubarik a7na ch...


In [None]:
# Compare `text` with the new `text_norm` column on the first test rows.
test.head()

Unnamed: 0,ID,text,length,text_norm
0,2DDHQW9,barcha aaindou fiha hak w barcha teflim kadhalik,24,barcha aaindou fiha hak w barcha teflim kadhalik
1,5HY6UEY,ye gernabou ye 9a7ba,15,ye gernabou ye 9a7ba
2,ATNVUJX,saber w barra rabbi m3ak 5ouya,16,saber w barra rabbi m3ak 5ouya
3,Q9XYVOQ,cha3ébbb ta7aaaaannnnnnnnnnn tfouuhh,26,cha3ébb ta7aann tfouuhh
4,TOAHLRH,rabi y5alihoulek w yfar7ek bih w inchallah itc...,35,rabi y5alihoulek w yfar7ek bih w inchallah itc...


In [None]:
# Token count of each training text after repetition normalization.
train['length_norm'] = train['text_norm'].map(lambda text: len(tok.encode(text)))

In [None]:
# Token count of each test text after repetition normalization.
test['length_norm'] = test['text_norm'].map(lambda text: len(tok.encode(text)))

In [None]:
# Check `length` against `length_norm` to see how much normalization shortened each row.
train.head()

Unnamed: 0,ID,text,label,length,text_norm,length_norm
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1,44,3sbaa lek ou le seim riahi ou 3sbaa le ca,23
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1,24,cha3eb fey9elkoum menghir ta7ayoul ou kressi,24
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1,196,bereau degage nathef ya slim walahi ya7chiw fi...,196
3,U0TTYY8,ak slouma,1,6,ak slouma,6
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1,63,entom titmanou lina a7na 3iid moubarik a7na ch...,63


In [None]:
# Check `length` against `length_norm` to see how much normalization shortened each row.
test.head()

Unnamed: 0,ID,text,length,text_norm,length_norm
0,2DDHQW9,barcha aaindou fiha hak w barcha teflim kadhalik,24,barcha aaindou fiha hak w barcha teflim kadhalik,24
1,5HY6UEY,ye gernabou ye 9a7ba,15,ye gernabou ye 9a7ba,15
2,ATNVUJX,saber w barra rabbi m3ak 5ouya,16,saber w barra rabbi m3ak 5ouya,16
3,Q9XYVOQ,cha3ébbb ta7aaaaannnnnnnnnnn tfouuhh,26,cha3ébb ta7aann tfouuhh,17
4,TOAHLRH,rabi y5alihoulek w yfar7ek bih w inchallah itc...,35,rabi y5alihoulek w yfar7ek bih w inchallah itc...,35


In [None]:
# Distribution of token counts in train after normalization.
train['length_norm'].describe(percentiles=[0.25, 0.50, 0.75, 0.80, 0.85, 0.90, 0.95])

count    70000.000000
mean        29.564871
std         41.529792
min          4.000000
25%         12.000000
50%         19.000000
75%         33.000000
80%         39.000000
85%         47.000000
90%         59.000000
95%         86.050000
max       4925.000000
Name: length_norm, dtype: float64

In [None]:
# Distribution of token counts in test after normalization.
test['length_norm'].describe(percentiles=[0.25, 0.50, 0.75, 0.80, 0.85, 0.90, 0.95])

count    30000.000000
mean        30.767233
std         40.397913
min          4.000000
25%         12.000000
50%         19.000000
75%         34.000000
80%         40.000000
85%         49.000000
90%         63.000000
95%         92.000000
max       2111.000000
Name: length_norm, dtype: float64

In [None]:
def shorten_text(texts, max_tokens=100, tokenizer=None):
    """Cut each text down to at most ``max_tokens`` tokens and return strings.

    Each text is encoded with truncation enabled and the surviving token ids
    are decoded back to text with special tokens skipped.  The result is what
    the tokenizer decodes, not a slice of the input, so even texts that are
    already short enough may come back altered (in this notebook's output
    ``cha3ébb`` is returned as ``cha3ebb``).

    No padding is requested: padding tokens would be dropped again by
    ``skip_special_tokens=True``, so building a dense padded array for the
    whole corpus would only cost memory.

    Parameters
    ----------
    texts : list of str
        Texts to shorten.
    max_tokens : int, default 100
        Passed to the tokenizer as ``max_length``.
    tokenizer : optional
        Object that is callable like a Hugging Face tokenizer and provides
        ``batch_decode``.  Defaults to the notebook-level ``tok``.

    Returns
    -------
    list of str
        One decoded string per input text, in the same order.
    """
    # Nothing to encode; avoid handing an empty batch to the tokenizer.
    if len(texts) == 0:
        return []

    if tokenizer is None:
        tokenizer = tok

    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=max_tokens,
        return_attention_mask=False,
    )

    return tokenizer.batch_decode(encoded["input_ids"], skip_special_tokens=True)

In [None]:
# Truncated version of each normalized training text (default token budget).
train['text_short'] = shorten_text(train['text_norm'].tolist())

In [None]:
# Truncated version of each normalized test text (default token budget).
test['text_short'] = shorten_text(test['text_norm'].tolist())

In [None]:
# Token count of each shortened training text, re-encoded with `tok.encode`.
train['length_short'] = train['text_short'].map(lambda text: len(tok.encode(text)))

In [None]:
# Token count of each shortened test text, re-encoded with `tok.encode`.
test['length_short'] = test['text_short'].map(lambda text: len(tok.encode(text)))

In [None]:
# Inspect the new `text_short` / `length_short` columns next to the earlier ones.
train.head()

Unnamed: 0,ID,text,label,length,text_norm,length_norm,text_short,length_short
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1,44,3sbaa lek ou le seim riahi ou 3sbaa le ca,23,3sbaa lek ou le seim riahi ou 3sbaa le ca,23
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1,24,cha3eb fey9elkoum menghir ta7ayoul ou kressi,24,cha3eb fey9elkoum menghir ta7ayoul ou kressi,24
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1,196,bereau degage nathef ya slim walahi ya7chiw fi...,196,bereau degage nathef ya slim walahi ya7chiw fi...,100
3,U0TTYY8,ak slouma,1,6,ak slouma,6,ak slouma,6
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,-1,63,entom titmanou lina a7na 3iid moubarik a7na ch...,63,entom titmanou lina a7na 3iid moubarik a7na ch...,63


In [None]:
# Inspect the new `text_short` / `length_short` columns next to the earlier ones.
test.head()

Unnamed: 0,ID,text,length,text_norm,length_norm,text_short,length_short
0,2DDHQW9,barcha aaindou fiha hak w barcha teflim kadhalik,24,barcha aaindou fiha hak w barcha teflim kadhalik,24,barcha aaindou fiha hak w barcha teflim kadhalik,24
1,5HY6UEY,ye gernabou ye 9a7ba,15,ye gernabou ye 9a7ba,15,ye gernabou ye 9a7ba,15
2,ATNVUJX,saber w barra rabbi m3ak 5ouya,16,saber w barra rabbi m3ak 5ouya,16,saber w barra rabbi m3ak 5ouya,16
3,Q9XYVOQ,cha3ébbb ta7aaaaannnnnnnnnnn tfouuhh,26,cha3ébb ta7aann tfouuhh,17,cha3ebb ta7aann tfouuhh,17
4,TOAHLRH,rabi y5alihoulek w yfar7ek bih w inchallah itc...,35,rabi y5alihoulek w yfar7ek bih w inchallah itc...,35,rabi y5alihoulek w yfar7ek bih w inchallah itc...,35


In [None]:
# Distribution of token counts in train after shortening.
train['length_short'].describe(percentiles=[0.25, 0.50, 0.75, 0.80, 0.85, 0.90, 0.95])

count    70000.000000
mean        27.194771
std         22.858648
min          4.000000
25%         12.000000
50%         19.000000
75%         33.000000
80%         39.000000
85%         47.000000
90%         59.000000
95%         86.050000
max        100.000000
Name: length_short, dtype: float64

In [None]:
# Distribution of token counts in test after shortening.
test['length_short'].describe(percentiles=[0.25, 0.50, 0.75, 0.80, 0.85, 0.90, 0.95])

count    30000.000000
mean        27.998233
std         23.638611
min          4.000000
25%         12.000000
50%         19.000000
75%         34.000000
80%         40.000000
85%         49.000000
90%         63.000000
95%         92.000000
max        100.000000
Name: length_short, dtype: float64

In [None]:
# Force a garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

100

# Save the normalized datasets

In [None]:
# Keep only the columns that are exported: the ID, the shortened text and its
# token count, plus the label for the training split.
train_df = train[['ID', 'text_short', 'length_short', 'label']]
test_df = test[['ID', 'text_short', 'length_short']]

In [None]:
# Give the exported columns their final names. The renamed frame is assigned
# back instead of using `inplace=True`: train_df/test_df were produced by
# selecting columns from train/test, and mutating such a frame in place makes
# pandas emit SettingWithCopyWarning. Re-running this cell is harmless because
# renaming columns that are already renamed is a no-op.
train_df = train_df.rename(columns={'text_short': 'text', 'length_short': 'length'})
test_df = test_df.rename(columns={'text_short': 'text', 'length_short': 'length'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# Final check of the training frame that will be written to disk.
train_df.head()

Unnamed: 0,ID,text,length,label
0,13P0QT0,3sbaa lek ou le seim riahi ou 3sbaa le ca,23,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,24,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,100,-1
3,U0TTYY8,ak slouma,6,1
4,68DX797,entom titmanou lina a7na 3iid moubarik a7na ch...,63,-1


In [None]:
# Final check of the test frame that will be written to disk.
test_df.head()

Unnamed: 0,ID,text,length
0,2DDHQW9,barcha aaindou fiha hak w barcha teflim kadhalik,24
1,5HY6UEY,ye gernabou ye 9a7ba,15
2,ATNVUJX,saber w barra rabbi m3ak 5ouya,16
3,Q9XYVOQ,cha3ebb ta7aann tfouuhh,17
4,TOAHLRH,rabi y5alihoulek w yfar7ek bih w inchallah itc...,35


In [None]:
# Write both frames next to the inputs, without the positional index column.
train_df.to_csv(f'{path}TrainNormalized.csv', index=False)
test_df.to_csv(f'{path}TestNormalized.csv', index=False)