# Load Data

## Import

In [None]:
import pandas as pd
import re
import spacy
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer

In [None]:
# Download the small English spaCy model that is loaded further down with
# spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

In [None]:
# Cleaned corpus. .iloc[:, 2:] discards the first two columns of the CSV
# (presumably index columns written by an earlier save — TODO confirm against the file).
df_cleaned = pd.read_csv("/content/drive/MyDrive/HateSpeech/TODA/TODA_cleand.csv").iloc[:,2:]
# Typo-corrected corpus. Only the first CSV column is discarded here.
df_typo = pd.read_csv("/content/drive/MyDrive/HateSpeech/TODA/TODA_typo.csv").iloc[:,1:]

In [None]:
# Keep only rows whose cleaned text is shorter than 1000 characters, then
# renumber the index and narrow the frame to the columns used downstream.
is_short_text = df_cleaned['cleaned'].apply(len) < 1000
df_cleaned = df_cleaned[is_short_text].reset_index(drop=True)
df_cleaned = df_cleaned[['text', 'cleaned', 'class']]

# The typo frame is not length-filtered here; only its columns are narrowed.
typo_columns = ['text', 'cleaned', 'jspell', 'class']
df_typo = df_typo[typo_columns]

In [None]:
# Show the (rows, columns) shape of both frames side by side.
df_cleaned.shape,df_typo.shape

((321716, 3), (321716, 4))

In [None]:
# Preview the first five rows of each frame stacked in one table. Columns that
# exist in only one frame (jspell) are filled with NaN for the other frame's rows.
pd.concat([df_cleaned.head(),df_typo.head()])

Unnamed: 0,text,cleaned,class,jspell
0,This is what the left is really scared of. ...,This is what the left is really scared of,0,
1,That literally looks like a monkey. Why are we...,That literally looks like a monkey Why are we ...,0,
2,It makes you an asshole.,It makes you an asshole,0,
3,"So they manage to provide a whole lot of data,...",So they manage to provide a whole lot of data ...,0,
4,"Hi there, i,m Keith, i hope you are doing well...",Hi there im Keith i hope you are doing well i ...,0,
0,This is what the left is really scared of. ...,This is what the left is really scared of,0,This is what the left is really scared of
1,That literally looks like a monkey. Why are we...,That literally looks like a monkey Why are we ...,0,That literally looks like a monkey Why are we ...
2,It makes you an asshole.,It makes you an asshole,0,It makes you an asshole
3,"So they manage to provide a whole lot of data,...",So they manage to provide a whole lot of data ...,0,So they manage to provide a whole lot of data ...
4,"Hi there, i,m Keith, i hope you are doing well...",Hi there im Keith i hope you are doing well i ...,0,Hi there is Keith i hope you are doing well i ...


## Preprocess

In [None]:
from sklearn.utils import shuffle

# Shuffle the row order of each frame with a fixed seed so the result is
# reproducible, then renumber the index from 0.
# NOTE(review): both frames use the same seed; the previews further down are
# consistent with the two frames staying row-aligned — confirm if that matters.
df_cleaned = shuffle(df_cleaned, random_state=42)
df_cleaned = df_cleaned.reset_index(drop=True)

df_typo = shuffle(df_typo, random_state=42)
df_typo = df_typo.reset_index(drop=True)

In [None]:
# Preview the first five rows of each frame after shuffling.
pd.concat([df_cleaned.head(),df_typo.head()])

Unnamed: 0,text,cleaned,class,jspell
0,@BreitbartNews Idiots seem to come out of the ...,@BreitbartNews Idiots seem to come out of the ...,1,
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,
2,@user so much stuff happening in florida! firs...,@user so much stuff happening in florida! firs...,0,
3,"@user #pennydreadful a long, dark love poem ð...",@user #pennydreadful a long dark love poem i...,0,
4,@user @user @user this is how i wanted to spnd...,@user @user @user this is how i wanted to spnd...,0,
0,@BreitbartNews Idiots seem to come out of the ...,@BreitbartNews Idiots seem to come out of the ...,1,@BreitbartNews Idiots seem to come out of the ...
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,russia as a huge black hole where everything d...
2,@user so much stuff happening in florida! firs...,@user so much stuff happening in florida! firs...,0,@user so much stuff happening in florida! firs...
3,"@user #pennydreadful a long, dark love poem ð...",@user #pennydreadful a long dark love poem i...,0,@user #pennydreadful a long dark love poem i...
4,@user @user @user this is how i wanted to spnd...,@user @user @user this is how i wanted to spnd...,0,@user @user @user this is how i wanted to send...


In [None]:
from sklearn.utils import shuffle

# The patterns are written as raw strings so regex escapes such as \S are not
# parsed as (invalid) Python string escapes, and they are compiled once here
# instead of on every call.
_URL_PATTERN = re.compile(r'http\S+')
_MENTION_PATTERN = re.compile(r'@\S+')
# Matches every character that is NOT an ASCII letter, a digit, one of ! ? $ % '
# or a space; this is what strips '#' and other special characters.
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z0-9!?$%' ]")


def preprocess(text):
  """Normalize one piece of text for tokenization.

  Steps, in order: remove URLs (anything starting with "http" up to the next
  whitespace), remove @mentions, drop every character other than ASCII
  letters, digits, ! ? $ % ' and space, then lowercase the result.

  Removed spans are replaced by the empty string, so the whitespace that
  surrounded them is kept (the result may start with a space or contain
  consecutive spaces).

  Args:
    text: the input string.

  Returns:
    The normalized, lowercased string.
  """
  # Remove URLs
  text = _URL_PATTERN.sub('', text)
  # Remove mentions
  text = _MENTION_PATTERN.sub('', text)
  # Remove hash signs and other special characters
  text = _DISALLOWED_CHARS.sub('', text)
  return text.lower()

In [None]:
# Normalize the text columns in place with preprocess(): the cleaned text of
# both frames, plus the spell-corrected text of the typo frame.
df_cleaned['cleaned'] = df_cleaned['cleaned'].map(preprocess)

df_typo['cleaned'] = df_typo['cleaned'].map(preprocess)
df_typo['jspell'] = df_typo['jspell'].map(preprocess)

In [None]:
# Preview the first five rows of each frame after preprocess() was applied.
pd.concat([df_cleaned.head(),df_typo.head()])

Unnamed: 0,text,cleaned,class,jspell
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,1,
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,0,
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,0,
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,0,
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,1,idiots seem to come out of the woodwork it h...
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,russia as a huge black hole where everything d...
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,0,so much stuff happening in florida! first orl...
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,0,pennydreadful a long dark love poem i agree...
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,0,this is how i wanted to send father's day w...


## Tokenizing

In [None]:
# Load the English model and build a tokenizer from its vocabulary alone.
# NOTE(review): the previews below show whitespace-style splitting (e.g.
# "florida!" stays one token and a leading space becomes its own token).
nlp = spacy.load('en_core_web_sm')
tokenizer = Tokenizer(nlp.vocab)


def _token_texts(text):
  """Return the text of every token the tokenizer produces for `text`."""
  texts = []
  for tok in tokenizer(text):
    texts.append(tok.text)
  return texts


# The cleaned frame is tokenized from its cleaned text, the typo frame from
# its spell-corrected text.
df_cleaned['Tokens'] = df_cleaned['cleaned'].apply(_token_texts)
df_typo['Tokens'] = df_typo['jspell'].apply(_token_texts)

In [None]:
# Preview the first five rows of each frame, now including the Tokens column.
pd.concat([df_cleaned.head(),df_typo.head()])

Unnamed: 0,text,cleaned,class,Tokens,jspell
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,1,"[ , idiots, seem, to, come, out, of, the, wood...",
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,"[russia, as, a, huge, black, hole, where, ever...",
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,0,"[ , so, much, stuff, happening, in, florida!, ...",
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,0,"[ , pennydreadful, a, long, dark, love, poem,...",
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,0,"[ , this, is, how, i, wanted, to, spnd, fath...",
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,1,"[ , idiots, seem, to, come, out, of, the, wood...",idiots seem to come out of the woodwork it h...
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,"[russia, as, a, huge, black, hole, where, ever...",russia as a huge black hole where everything d...
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,0,"[ , so, much, stuff, happening, in, florida!, ...",so much stuff happening in florida! first orl...
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,0,"[ , pennydreadful, a, long, dark, love, poem,...",pennydreadful a long dark love poem i agree...
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,0,"[ , this, is, how, i, wanted, to, send, fath...",this is how i wanted to send father's day w...


# STOPWORD

In [None]:
STOP_WORDS = nlp.Defaults.stop_words


def drop_stopwords(texts, report_every=100000):
  """Tokenize each text with spaCy and keep only the content tokens.

  A token is kept when its text is not in STOP_WORDS and it is neither
  punctuation nor whitespace.

  Args:
    texts: iterable of strings (e.g. a DataFrame column).
    report_every: print the running document count each time this many
      documents have been processed.

  Returns:
    A list with one list of kept token texts per input text, in input order.
  """
  kept_per_doc = []
  # Only the token text and the lexical is_punct / is_space flags are read,
  # so every pipeline component is disabled; tokenization still runs.
  docs = nlp.pipe(texts, disable=nlp.pipe_names)
  for count, doc in enumerate(docs, start=1):
    kept_per_doc.append([
      token.text
      for token in doc
      if token.text not in STOP_WORDS
      and not token.is_punct
      and not token.is_space
    ])
    if count % report_every == 0:
      print(count)
  return kept_per_doc


# CLEANED DATA
df_cleaned['Not Stopwords'] = drop_stopwords(df_cleaned['cleaned'])
print('Cleaned Data Tokenized!!!')

# TYPO DATA
df_typo['Not Stopwords'] = drop_stopwords(df_typo['jspell'])
print('Typo Data Tokenized!!!')

100000
200000
300000
Cleaned Data Tokenized!!!
100000
200000
300000
Typo Data Tokenized!!!


In [None]:
# Preview the first five rows of each frame, now including the Not Stopwords column.
pd.concat([df_cleaned.head(),df_typo.head()])

Unnamed: 0,text,cleaned,class,Tokens,Not Stopwords,jspell
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,1,"[ , idiots, seem, to, come, out, of, the, wood...","[idiots, come, woodwork, threatens, president,...",
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,"[russia, as, a, huge, black, hole, where, ever...","[russia, huge, black, hole, disappears, goes, ...",
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,0,"[ , so, much, stuff, happening, in, florida!, ...","[stuff, happening, florida, orlando, shooting,...",
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,0,"[ , pennydreadful, a, long, dark, love, poem,...","[pennydreadful, long, dark, love, poem, agree]",
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,0,"[ , this, is, how, i, wanted, to, spnd, fath...","[wanted, spnd, father, day, wknd, wboy, grilln...",
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,1,"[ , idiots, seem, to, come, out, of, the, wood...","[idiots, come, woodwork, threatens, president,...",idiots seem to come out of the woodwork it h...
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,"[russia, as, a, huge, black, hole, where, ever...","[russia, huge, black, hole, disappears, goes, ...",russia as a huge black hole where everything d...
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,0,"[ , so, much, stuff, happening, in, florida!, ...","[stuff, happening, florida, orlando, shooting,...",so much stuff happening in florida! first orl...
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,0,"[ , pennydreadful, a, long, dark, love, poem,...","[pennydreadful, long, dark, love, poem, agree]",pennydreadful a long dark love poem i agree...
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,0,"[ , this, is, how, i, wanted, to, send, fath...","[wanted, send, father, day, wind, boy, grill, ...",this is how i wanted to send father's day w...


# STEMMING

In [None]:
ps = PorterStemmer()


def stem_token_lists(token_lists, report_every=100000):
  """Stem every token of every document with the Porter stemmer.

  Stemming is best-effort: if the stemmer raises for a token, the token is
  printed and kept unchanged so the document keeps its length.

  Args:
    token_lists: iterable of token lists, one list per document.
    report_every: print the running document count each time this many
      documents have been processed.

  Returns:
    A list with one list of stems per input document, in input order.
  """
  stemmed_docs = []
  for count, doc_tokens in enumerate(token_lists, start=1):
    stems = []
    for token in doc_tokens:
      try:
        stems.append(ps.stem(token))
      except Exception:
        # Catch Exception rather than everything so that interrupting the
        # kernel (KeyboardInterrupt) is not swallowed by the fallback.
        print(token)
        stems.append(token)
    stemmed_docs.append(stems)
    if count % report_every == 0:
      print(count)
  return stemmed_docs


# CLEANED DATA
df_cleaned['stems'] = stem_token_lists(df_cleaned['Not Stopwords'])
print('Cleaned Data Stemmed!!!')

# TYPO DATA
df_typo['stems'] = stem_token_lists(df_typo['Not Stopwords'])
print('Typo Data Stemmed!!!')

100000
200000
300000
Cleaned Data Stemmed!!!
100000
200000
300000
Typo Data Stemmed!!!


In [None]:
# Preview the first five rows of each frame, now including the stems column.
pd.concat([df_cleaned.head(),df_typo.head()])

Unnamed: 0,text,cleaned,class,Tokens,Not Stopwords,stems,jspell
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,1,"[ , idiots, seem, to, come, out, of, the, wood...","[idiots, come, woodwork, threatens, president,...","[idiot, come, woodwork, threaten, presid, jail]",
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,"[russia, as, a, huge, black, hole, where, ever...","[russia, huge, black, hole, disappears, goes, ...","[russia, huge, black, hole, disappear, goe, wh...",
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,0,"[ , so, much, stuff, happening, in, florida!, ...","[stuff, happening, florida, orlando, shooting,...","[stuff, happen, florida, orlando, shoot, disne...",
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,0,"[ , pennydreadful, a, long, dark, love, poem,...","[pennydreadful, long, dark, love, poem, agree]","[pennydread, long, dark, love, poem, agre]",
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,0,"[ , this, is, how, i, wanted, to, spnd, fath...","[wanted, spnd, father, day, wknd, wboy, grilln...","[want, spnd, father, day, wknd, wboy, grilln, ...",
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,1,"[ , idiots, seem, to, come, out, of, the, wood...","[idiots, come, woodwork, threatens, president,...","[idiot, come, woodwork, threaten, presid, jail]",idiots seem to come out of the woodwork it h...
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,0,"[russia, as, a, huge, black, hole, where, ever...","[russia, huge, black, hole, disappears, goes, ...","[russia, huge, black, hole, disappear, goe, wh...",russia as a huge black hole where everything d...
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,0,"[ , so, much, stuff, happening, in, florida!, ...","[stuff, happening, florida, orlando, shooting,...","[stuff, happen, florida, orlando, shoot, disne...",so much stuff happening in florida! first orl...
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,0,"[ , pennydreadful, a, long, dark, love, poem,...","[pennydreadful, long, dark, love, poem, agree]","[pennydread, long, dark, love, poem, agre]",pennydreadful a long dark love poem i agree...
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,0,"[ , this, is, how, i, wanted, to, send, fath...","[wanted, send, father, day, wind, boy, grill, ...","[want, send, father, day, wind, boy, grill, n,...",this is how i wanted to send father's day w...


# SAVE DATA

In [None]:
# CLEANED DATA
df_cleaned_s = df_cleaned[['text','cleaned','Tokens','Not Stopwords','stems','class']]
df_cleaned_s.columns = ['text_raw','text_cleaned','tokens_raw','tokens_stop','tokens_stems','class']
# TYPO DATA
df_typo_s = df_typo[['text','cleaned','jspell','Tokens','Not Stopwords','stems','class']]
df_typo_s.columns = ['text_raw','text_cleaned','typo_corrected','tokens_raw','tokens_stop','tokens_stems','class']

In [None]:
# Preview the renamed export frames (typo frame first). typo_corrected is NaN
# for the cleaned frame's rows because that frame has no such column.
pd.concat([df_typo_s.head(),df_cleaned_s.head()])

Unnamed: 0,text_raw,text_cleaned,typo_corrected,tokens_raw,tokens_stop,tokens_stems,class
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,idiots seem to come out of the woodwork it h...,"[ , idiots, seem, to, come, out, of, the, wood...","[idiots, come, woodwork, threatens, president,...","[idiot, come, woodwork, threaten, presid, jail]",1
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,"[russia, as, a, huge, black, hole, where, ever...","[russia, huge, black, hole, disappears, goes, ...","[russia, huge, black, hole, disappear, goe, wh...",0
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,so much stuff happening in florida! first orl...,"[ , so, much, stuff, happening, in, florida!, ...","[stuff, happening, florida, orlando, shooting,...","[stuff, happen, florida, orlando, shoot, disne...",0
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,pennydreadful a long dark love poem i agree...,"[ , pennydreadful, a, long, dark, love, poem,...","[pennydreadful, long, dark, love, poem, agree]","[pennydread, long, dark, love, poem, agre]",0
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,this is how i wanted to send father's day w...,"[ , this, is, how, i, wanted, to, send, fath...","[wanted, send, father, day, wind, boy, grill, ...","[want, send, father, day, wind, boy, grill, n,...",0
0,@BreitbartNews Idiots seem to come out of the ...,idiots seem to come out of the woodwork it h...,,"[ , idiots, seem, to, come, out, of, the, wood...","[idiots, come, woodwork, threatens, president,...","[idiot, come, woodwork, threaten, presid, jail]",1
1,russia as a huge black hole where everything d...,russia as a huge black hole where everything d...,,"[russia, as, a, huge, black, hole, where, ever...","[russia, huge, black, hole, disappears, goes, ...","[russia, huge, black, hole, disappear, goe, wh...",0
2,@user so much stuff happening in florida! firs...,so much stuff happening in florida! first orl...,,"[ , so, much, stuff, happening, in, florida!, ...","[stuff, happening, florida, orlando, shooting,...","[stuff, happen, florida, orlando, shoot, disne...",0
3,"@user #pennydreadful a long, dark love poem ð...",pennydreadful a long dark love poem i agree...,,"[ , pennydreadful, a, long, dark, love, poem,...","[pennydreadful, long, dark, love, poem, agree]","[pennydread, long, dark, love, poem, agre]",0
4,@user @user @user this is how i wanted to spnd...,this is how i wanted to spnd father's day w...,,"[ , this, is, how, i, wanted, to, spnd, fath...","[wanted, spnd, father, day, wknd, wboy, grilln...","[want, spnd, father, day, wknd, wboy, grilln, ...",0


In [None]:
# Write both export frames to Drive. to_csv is called with its defaults, so the
# DataFrame index is written as an extra leading column.
# NOTE(review): the token columns hold Python lists, which presumably end up as
# their string form in the CSV — confirm how downstream code parses them.
df_cleaned_s.to_csv("/content/drive/MyDrive/HateSpeech/TODA(CLEANED)/BERT/TODA_CLEANED_01.csv")
df_typo_s.to_csv("/content/drive/MyDrive/HateSpeech/TODA(CLEANED)/BERT/TODA_TYPO_01.csv")