# 前処理

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, LSTM, RNN, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

# Load the data: training set, test set and the sample submission file.
# The paths are relative, so the CSV files are read from the current working directory.
# NOTE(review): `submission` is not used anywhere in the part of the notebook visible here.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [3]:
# Replacement dictionary: token -> replacement text.
#
# The preprocessing loops look a token up by exact equality (`j in keys`, then
# `repl[j]`), so every key must equal a whole whitespace-delimited token.
#
# Each key appears exactly once. A dict literal silently keeps only the LAST value
# given for a repeated key, so listing an emoticon twice with different values
# (e.g. " good " and " smile ") hides which one is really used. The values below
# are the ones that were effective.
repl = {
    # --- positive emoticons / interjections ---
    "&lt;3": " heart ",
    ":d": " smile ",
    ":dd": " smile ",
    ":p": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    # --- negative emoticons ---
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    # --- regex-style keys ---
    # `\b` is the regex word-boundary marker, but these keys are never compiled as
    # regular expressions by the lookup described above: they only match a token that
    # is literally this text. The plain-text equivalents follow in the next section.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    # --- short forms and contractions ---
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll" : "i will",
    "its" : "it is",
    "it's" : "it is",
    "'s" : " is",
    "that's" : "that is",
    "weren't" : "were not",

    # --- misspellings and slang ---
    "w/e": "whatever",
    "usagov": "usa government",
    "recentlu": "recently",
    "ph0tos": "photos",
    "amirite": "am i right",
    "exp0sed": "exposed",
    "<3": "love",
    "luv": "love",
    "amageddon": "armageddon",
    "trfc": "traffic",
    "16yr": "16 year",

    # --- hashtag-style abbreviations ---
    "mh370": "malaysia airlines flight 370",
    "okwx": "oklahoma city weather",
    "arwx": "arkansas weather",
    "gawx": "georgia weather",
    "scwx": "south carolina weather",
    "cawx": "california weather",
    "tnwx": "tennessee weather",
    "azwx": "arizona weather",
    "alwx": "alabama weather",
    "usnwsgov": "united states national weather service",
    "2mw": "tomorrow",

    # --- general abbreviations and acronyms ---
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

# Snapshot of the replacement dictionary's keys as a plain list.
keys = list(repl)

In [4]:
# Canonical word -> list of regular expressions for its spelling variants.
#
# The preprocessing loops walk this mapping in order and, for every pattern that
# matches at the start of a token (`re.match`), replace each occurrence of the
# pattern inside the token with the canonical word (`re.sub`).
#
# Every pattern that contains a backslash is a raw string, so the regex escapes
# reach the `re` module unchanged and Python does not warn about invalid string
# escapes. Each list element is followed by a comma: without one, Python joins two
# adjacent string literals into a single (unintended) pattern.
RE_PATTERNS = {
    'american':
        [
            'amerikan'
        ],

    'adolf':
        [
            'adolf'
        ],

    'hitler':
        [
            'hitler'
        ],

    'fuck':
        [
            '(f)(u|[^a-z0-9 ])(c|[^a-z0-9 ])(k|[^a-z0-9 ])([^ ])*',
            '(f)([^a-z]*)(u)([^a-z]*)(c)([^a-z]*)(k)',
            r' f[!@#\$%\^\&\*]*u[!@#\$%\^&\*]*k', 'f u u c',
            '(f)(c|[^a-z ])(u|[^a-z ])(k)', r'f\*',
            'feck ', ' fux ', r'f\*\*',
            r'f\-ing', r'f\.u\.', 'f###', ' fu ', 'f@ck', 'f u c k', 'f uck', 'f ck',
        ],

    'ass':
        [
            '[^a-z]ass ', '[^a-z]azz ', 'arrse', ' arse ', r'@\$\$',
            '[^a-z]anus', r' a\*s\*s', '[^a-z]ass[^a-z ]',
            r'a[@#\$%\^&\*][@#\$%\^&\*]', '[^a-z]anal ', 'a s s',
        ],

    'ass hole':
        [
            ' a[s|z]*wipe', 'a[s|z]*[w]*h[o|0]+[l]*e', r'@\$\$hole',
        ],

    'bitch':
        [
            'b[w]*i[t]*ch', 'b!tch',
            r'bi\+ch', r'b!\+ch', '(b)([^a-z]*)(i)([^a-z]*)(t)([^a-z]*)(c)([^a-z]*)(h)',
            'biatch', r'bi\*\*h', 'bytch', 'b i t c h',
        ],

    'bastard':
        [
            'ba[s|z]+t[e|a]+rd'
        ],

    'trans gender':
        [
            'transgender'
        ],

    'gay':
        [
            'gay'
        ],

    'cock':
        [
            '[^a-z]cock', 'c0ck', '[^a-z]cok ', 'c0k', '[^a-z]cok[^aeiou]', ' cawk',
            '(c)([^a-z ])(o)([^a-z ]*)(c)([^a-z ]*)(k)', 'c o c k',
        ],

    'dick':
        [
            ' dick[^aeiou]', 'deek', 'd i c k'
        ],

    'suck':
        [
            'sucker', '(s)([^a-z ]*)(u)([^a-z ]*)(c)([^a-z ]*)(k)', 'sucks', '5uck', 's u c k'
        ],

    'cunt':
        [
            'cunt', 'c u n t'
        ],

    'bull shit':
        [
            r'bullsh\*t', r'bull\$hit'
        ],

    'homo sex ual':
        [
            'homosexual'
        ],

    'jerk':
        [
            'jerk'
        ],

    'idiot':
        [
            'i[d]+io[t]+', '(i)([^a-z ]*)(d)([^a-z ]*)(i)([^a-z ]*)(o)([^a-z ]*)(t)', 'idiots',
            'i d i o t',
        ],

    'dumb':
        [
            '(d)([^a-z ]*)(u)([^a-z ]*)(m)([^a-z ]*)(b)'
        ],

    'shit':
        [
            'shitty', '(s)([^a-z ]*)(h)([^a-z ]*)(i)([^a-z ]*)(t)', 'shite', r'\$hit', 's h i t'
        ],

    'shit hole':
        [
            'shythole'
        ],

    'retard':
        [
            'returd', 'retad', 'retard', 'wiktard', 'wikitud'
        ],

    'rape':
        [
            ' raped'
        ],

    'dumb ass':
        [
            'dumbass', 'dubass'
        ],

    'ass head':
        [
            'butthead'
        ],

    'sex':
        [
            'sexy', 's3x', 'sexuality'
        ],

    'nigger':
        [
            'nigger', 'ni[g]+a', ' nigr ', 'negrito', 'niguh', 'n3gr', 'n i g g e r'
        ],

    'shut the fuck up':
        [
            'stfu'
        ],

    'pussy':
        [
            'pussy[^c]', 'pusy', 'pussi[^l]', 'pusses'
        ],

    'faggot':
        [
            'faggot', ' fa[g]+[s]*[^a-z ]', 'fagot', 'f a g g o t', 'faggit',
            '(f)([^a-z ]*)(a)([^a-z ]*)([g]+)([^a-z ]*)(o)([^a-z ]*)(t)', 'fau[g]+ot', 'fae[g]+ot',
        ],

    'mother fucker':
        [
            ' motha ', ' motha f', ' mother f', 'motherucker',
        ],

    'whore':
        [
            r'wh\*\*\*', 'w h o r e'
        ],
}

In [5]:
import re, string
# Character class of every ASCII punctuation mark plus a few typographic symbols.
# `string.punctuation` is passed through `re.escape` because it contains `\`, `]`,
# `^` and `-`, which are special inside `[...]`. Interpolated raw, the backslash
# escapes the following `]` and is itself left out of the class, so backslashes in
# the text are never separated.
re_tok = re.compile('([' + re.escape(string.punctuation) + '“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Surround every punctuation character in ``s`` with single spaces.

    Args:
        s: Text to process.

    Returns:
        A copy of ``s`` in which each character matched by ``re_tok`` is replaced
        by a space, the character itself, and another space. Existing whitespace
        is not collapsed, so consecutive spaces can appear in the result.
    """
    return re_tok.sub(r' \1 ', s)

In [8]:
import re
from tqdm import tqdm
# Containers for the preprocessed comments.
new_train_data = []
new_test_data = []
# Raw comment columns as plain Python lists.
ltr = train["comment_text"].tolist()
lte = test["comment_text"].tolist()

# Flatten RE_PATTERNS into (compiled regex, canonical word) pairs once, in the same
# order the nested dict/list iteration would visit them, so that no pattern has to
# be looked up again for every token.
compiled_patterns = [
    (re.compile(pattern), canonical)
    for canonical, patterns in RE_PATTERNS.items()
    for pattern in patterns
]

# Preprocess every training comment.
for comment in tqdm(ltr):
    pieces = []

    for token in str(comment).split():  # split on whitespace
        # Tokens containing at least two capital letters are upper-cased;
        # all other tokens are lower-cased.
        if re.match(r".*[A-Z].*[A-Z].*", token):
            token = token.upper()
        else:
            token = token.lower()

        # Comments sometimes contain web addresses: drop those tokens entirely.
        if token.startswith(('http', 'www', 'HTTP', 'WWW')):
            continue

        # Exact-match replacement. `keys` is just the list of `repl`'s keys, so testing
        # the dict directly gives the same answer without scanning a list per token.
        if token in repl:
            token = repl[token]

        # Unify spelling variants. A pattern is applied only when it matches at the
        # start of the token, but then every occurrence inside the token is replaced.
        # NOTE(review): tokens never contain whitespace, so patterns that begin with a
        # literal space cannot match here.
        for pattern, canonical in compiled_patterns:
            if pattern.match(token):
                token = pattern.sub(canonical, token)

        # Separate punctuation from the surrounding characters.
        pieces.append(tokenize(token))

    # Each kept token is followed by one space (so a non-empty result ends with a space).
    new_train_data.append("".join(piece + " " for piece in pieces))

100%|████████████████████████████████████████████████████████████████████████| 159571/159571 [2:44:38<00:00, 16.15it/s]


In [9]:
# Preprocess the test comments exactly like the training comments.
# Start from an empty list so that re-running this cell does not append a second
# copy of every comment to `new_test_data`.
new_test_data = []

# Flatten RE_PATTERNS into (compiled regex, canonical word) pairs once, in the same
# order the nested dict/list iteration would visit them, so that no pattern has to
# be looked up again for every token.
compiled_patterns = [
    (re.compile(pattern), canonical)
    for canonical, patterns in RE_PATTERNS.items()
    for pattern in patterns
]

for comment in tqdm(lte):
    pieces = []

    for token in str(comment).split():  # split on whitespace
        # Tokens containing at least two capital letters are upper-cased;
        # all other tokens are lower-cased.
        if re.match(r".*[A-Z].*[A-Z].*", token):
            token = token.upper()
        else:
            token = token.lower()

        # Comments sometimes contain web addresses: drop those tokens entirely.
        if token.startswith(('http', 'www', 'HTTP', 'WWW')):
            continue

        # Exact-match replacement. `keys` is just the list of `repl`'s keys, so testing
        # the dict directly gives the same answer without scanning a list per token.
        if token in repl:
            token = repl[token]

        # Unify spelling variants. A pattern is applied only when it matches at the
        # start of the token, but then every occurrence inside the token is replaced.
        # NOTE(review): tokens never contain whitespace, so patterns that begin with a
        # literal space cannot match here.
        for pattern, canonical in compiled_patterns:
            if pattern.match(token):
                token = pattern.sub(canonical, token)

        # Separate punctuation from the surrounding characters.
        pieces.append(tokenize(token))

    # Each kept token is followed by one space (so a non-empty result ends with a space).
    new_test_data.append("".join(piece + " " for piece in pieces))

100%|████████████████████████████████████████████████████████████████████████| 153164/153164 [2:35:52<00:00, 16.38it/s]


In [10]:
# Store the preprocessed comments next to the raw ones: the training frame is
# assigned first, then the test frame.
train["new_comment_text"], test["new_comment_text"] = new_train_data, new_test_data

# Show the first rows of the training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,new_comment_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d ' aww ! he matches this background colour i...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man , i am really not trying to edit war ..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,""" more i can not make any real suggestions o..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you , sir , are my hero . any chance you re..."


In [15]:
# Pull the preprocessed text back out of both frames as plain lists.
trate, tete = (frame["new_comment_text"].tolist() for frame in (train, test))

# The character filter that used to run here is disabled. It was, per comment,
#     re.sub('[^a-zA-Z ?!]+', '', text)
# (one variant lower-cased the text first). Since it no longer runs, the text still
# contains digits and punctuation even though the message printed below says otherwise.

# Overwrite the raw comment_text column with the preprocessed text.
train["comment_text"] = trate
test["comment_text"] = tete
print('only alphabets')

only alphabets


In [16]:
# Display the first 50 rows of the training frame to inspect the result of the previous steps.
train.head(50)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,new_comment_text
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,d ' aww ! he matches this background colour i...,0,0,0,0,0,0,d ' aww ! he matches this background colour i...
2,000113f07ec002fd,"hey man , i am really not trying to edit war ...",0,0,0,0,0,0,"hey man , i am really not trying to edit war ..."
3,0001b41b1c6bb37e,""" more i can not make any real suggestions o...",0,0,0,0,0,0,""" more i can not make any real suggestions o..."
4,0001d958c54c6e35,"you , sir , are my hero . any chance you re...",0,0,0,0,0,0,"you , sir , are my hero . any chance you re..."
5,00025465d4725e87,""" congratulations from me as well , use the...",0,0,0,0,0,0,""" congratulations from me as well , use the..."
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
7,00031b1e95af7921,your vandalism to the matt shirvington article...,0,0,0,0,0,0,your vandalism to the matt shirvington article...
8,00037261f536c51d,sorry if the word ' nonsense ' was offensive...,0,0,0,0,0,0,sorry if the word ' nonsense ' was offensive...
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,alignment on this subject and which are contra...


In [17]:
# Write the training frame to CSV in the working directory; index=False leaves the row index out.
train.to_csv('train_preprocessing_upper_allfeature.csv',index=False)

In [18]:
# Write the test frame to CSV in the working directory; index=False leaves the row index out.
test.to_csv('test_preprocessing_upper_allfeature.csv',index=False)