In [76]:
import pandas as pd

# 1. Loading Data

In [77]:
# Load the emotions corpus. Besides `text` and `label`, the file carries a
# leftover 'Unnamed: 0' column (visible in the preview), which is dropped in
# the preprocessing section.
emotions_df = pd.read_csv("./datasets/emotions/text.csv")
emotions_df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [78]:
# Load the gender-violence tweets; the preview shows the columns
# Tweet_ID, tweet and type.
violence_df = pd.read_csv("datasets/gender_violence/train.csv")
violence_df.head()

Unnamed: 0,Tweet_ID,tweet,type
0,ID_0022DWKP,Had a dream i got raped last night. By a guy i...,sexual_violence
1,ID_00395QYM,he thought the word raped means sex and told m...,sexual_violence
2,ID_003EOSSF,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,sexual_violence
3,ID_004BBHOD,I was sexually abused for 3 years at age 4 to ...,sexual_violence
4,ID_004F7516,Chessy Prout can do better by telling the trut...,sexual_violence


In [79]:
# Load the hate-speech tweets; the preview shows the columns 'Unnamed: 0',
# count, hate_speech, offensive_language, neither, class and tweet.
hate_df = pd.read_csv("datasets/Hate_speech/labeled_data.csv")
hate_df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# 2. Data Preprocessing

In [80]:
# Drop the columns that are not used downstream so each frame keeps only its
# text column and its label column.
# The frames are rebound (no inplace=True) and errors='ignore' is passed so the
# cell can be re-run without raising KeyError once the columns are already gone.
emotions_df = emotions_df.drop(columns=['Unnamed: 0'], errors='ignore')
violence_df = violence_df.drop(columns=['Tweet_ID'], errors='ignore')
hate_df = hate_df.drop(
    columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],
    errors='ignore',
)

In [81]:
emotions_df.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [82]:
violence_df.head()

Unnamed: 0,tweet,type
0,Had a dream i got raped last night. By a guy i...,sexual_violence
1,he thought the word raped means sex and told m...,sexual_violence
2,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,sexual_violence
3,I was sexually abused for 3 years at age 4 to ...,sexual_violence
4,Chessy Prout can do better by telling the trut...,sexual_violence


In [83]:
hate_df.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Renaming Columns

In [84]:
# Give both tweet datasets the column names the emotions data already uses
# ('text' and 'label') so all three frames share one schema.
violence_df = violence_df.rename(columns={'tweet': 'text', 'type': 'label'})
hate_df = hate_df.rename(columns={'tweet': 'text', 'class': 'label'})

In [85]:
hate_df.columns, violence_df.columns,emotions_df.columns

(Index(['label', 'text'], dtype='object'),
 Index(['text', 'label'], dtype='object'),
 Index(['text', 'label'], dtype='object'))

# Checking Null Values

In [86]:
emotions_df.isna().sum()

text     0
label    0
dtype: int64

In [87]:
violence_df.isna().sum()

text     0
label    0
dtype: int64

In [88]:
hate_df.isna().sum()

label    0
text     0
dtype: int64

In [89]:
emotions_df.shape, violence_df.shape,hate_df.shape

((416809, 2), (39650, 2), (24783, 2))

In [90]:
emotions_df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,121187
1,141067
2,34554
3,57317
4,47712
5,14972


In [91]:
# Exploratory peek at label 0: draws 12000 rows without a random_state, so the
# rows displayed differ between runs. The result is not stored anywhere.
emotions_df[emotions_df.label==0].sample(12000)

Unnamed: 0,text,label
135871,i personally would prefer a shorter life fille...,0
145940,i feel it should be ignored totally,0
382603,i had the sensation of vomiting dizziness and ...,0
171636,i know that when i eat horribly i feel horrible,0
296149,i feel like i missed a big opportunity but at ...,0
...,...,...
187392,i feel disappointed disappointed in myself tha...,0
252361,i am frustrated because i feel so rotten and t...,0
42221,i was feeling so crappy on my birthday is that...,0
250581,ive decided that i have nothing to feel regret...,0


# Extract Sample from emotions Dataset

In [92]:
# Build a class-balanced subset: 2000 rows for each emotion label 0-5, every
# draw using the same seed so the selection is reproducible.
# The per-label samples are collected first and concatenated once. This avoids
# re-copying a growing frame on every iteration and avoids concatenating onto
# an empty DataFrame, which recent pandas versions deprecate.
e_df = pd.concat(
    [
        emotions_df[emotions_df.label == label].sample(n=2000, random_state=42)
        for label in range(6)
    ]
)

In [93]:
e_df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,2000
1,2000
2,2000
3,2000
4,2000
5,2000


In [94]:
# Replace the full emotions frame with the balanced 12000-row sample.
# NOTE: this rebinds `emotions_df`, so earlier cells that read the full dataset
# produce different results if they are re-run after this point.
emotions_df = e_df.copy()
emotions_df.shape

(12000, 2)

In [95]:
violence_df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
Harmful_Traditional_practice,188
Physical_violence,5946
economic_violence,217
emotional_violence,651
sexual_violence,32648


In [96]:
# How many sexual_violence rows to keep so that, together with every row of the
# other classes, the violence dataset totals 12000 rows (the emotions sample size).
12000-violence_df[violence_df.label!='sexual_violence'].shape[0]

4998

# Extract Sample from violence Dataset

In [97]:
# Down-sample the dominant `sexual_violence` class so the violence dataset ends
# up with 12000 rows in total; every row of the other classes is kept.
# The sample size is derived from the data instead of being copied by hand from
# a previous cell's output (it evaluates to 4998 for the data loaded above).
is_sexual = violence_df.label == 'sexual_violence'
sample_v_df = violence_df[~is_sexual]
sexual_v = violence_df[is_sexual].sample(12000 - len(sample_v_df), random_state=42)
v_df = pd.concat([sample_v_df, sexual_v])
violence_df = v_df.copy()
violence_df.shape

(12000, 2)

In [98]:
violence_df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
Harmful_Traditional_practice,188
Physical_violence,5946
economic_violence,217
emotional_violence,651
sexual_violence,4998


# Extract Sample from hate Dataset

In [99]:
hate_df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,1430
1,19190
2,4163


In [100]:
# How many label-1 rows to keep so that, together with every row of labels 0
# and 2, the hate-speech dataset totals 12000 rows.
12000-hate_df[hate_df.label!=1].shape[0]

6407

In [101]:
# Down-sample the dominant class (label 1) so the hate-speech dataset totals
# 12000 rows; all rows with labels 0 and 2 are kept.
# The sample size is derived from the data instead of being copied by hand from
# a previous cell's output (it evaluates to 6407 for the data loaded above).
sample_h_df = hate_df[hate_df.label != 1]
# NOTE: despite its name, `zero_hDf` holds the sampled label-1 rows; the name is
# kept unchanged in case later cells refer to it.
zero_hDf = hate_df[hate_df.label == 1].sample(12000 - len(sample_h_df), random_state=42)
h_df = pd.concat([sample_h_df, zero_hDf])
hate_df = h_df.copy()
hate_df.shape

(12000, 2)

In [105]:
hate_df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,1430
1,6407
2,4163


In [106]:
hate_df.shape, violence_df.shape,emotions_df.shape

((12000, 2), (12000, 2), (12000, 2))

In [108]:
hate_df.sample(3)

Unnamed: 0,label,text
23054,1,Y'all weird on here. Fake hoes for attention
1986,2,&#9733;@&#9733;@&#9733;@&#9733;@&#9733; GRAND ...
6292,1,@jqualley_ @_BeautifulKeezy bitch YESSSSS I ma...


In [109]:
violence_df.sample(3)

Unnamed: 0,text,label
6928,From the person who told me. He says the fell...,sexual_violence
13773,Happy Birthday to my husband The most lovely ...,Physical_violence
11697,What are some things that make you really happ...,Physical_violence


In [110]:
emotions_df.sample(3)

Unnamed: 0,text,label
166413,i said i feel resentful that my childhood was ...,3
180277,i have been busy but i feel that my loyal read...,2
377736,i do these cards once in a while is that i fee...,1


# Reset Indexes

In [111]:
# Discard the row labels left over from sampling and renumber each frame
# from 0.
hate_df = hate_df.reset_index(drop=True)
emotions_df = emotions_df.reset_index(drop=True)
violence_df = violence_df.reset_index(drop=True)

In [112]:
hate_df.sample(3)

Unnamed: 0,label,text
8214,1,Another bad bitch fuck her for a hour another ...
1118,2,@SalaciousSully hopefully not back in da ghetto?
3925,0,RT @PacDaGoat: I really hate attention seeking...


In [113]:
violence_df.sample(3)

Unnamed: 0,text,label
1867,My Wife Beats Me Too Much â€“ Husband Cries Out ...,Physical_violence
1544,The day my husband beats me. The day he is rea...,Physical_violence
8735,ðŸ‡ºðŸ‡¸: He Broke Up with me ðŸ‡®ðŸ‡³: He Raped Me,sexual_violence


In [114]:
emotions_df.sample(3)

Unnamed: 0,text,label
10754,i feel like i ve got some weird self inflicted...,5
9568,i feel more like i m an intruder or robber and...,4
9666,i have noticed that it is okay to feel unsure ...,4


# 3. Label Encoding

In [116]:
from sklearn.preprocessing import LabelEncoder

In [117]:
# Convert the string violence categories to integer codes. The codes follow the
# sorted order of the class names (see the per-label counts in the next cell).
# The fitted encoder stays available as `l_encoder`; its `classes_` attribute
# holds the original category names.
l_encoder = LabelEncoder()
violence_df['label']=l_encoder.fit_transform(violence_df['label'])
violence_df.head()

Unnamed: 0,text,label
0,"My Husband Beats Me Frequently, Wife Tells Cou...",1
1,"Best thing for me to do, is remain silent when...",1
2,"My husband will never beat me, Bambam denies r...",1
3,"theyre like, i just wanna be a baby maker with...",1
4,"I was in England for a week, the longest Iâ€™ve ...",1


In [118]:
violence_df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,188
1,5946
2,217
3,651
4,4998


# 4. Stopword Removal

In [119]:
import spacy


In [120]:
from spacy.lang.en.stop_words import STOP_WORDS

In [122]:
len(STOP_WORDS)

326

In [123]:
# Load spaCy's small English pipeline; `removeStop` below relies on this
# notebook-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [124]:
def removeStop(sent, pipeline=None):
    """Return ``sent`` with spaCy stop words removed.

    The text is tokenized and every token whose ``is_stop`` flag is false is
    kept; the surviving token texts are joined with single spaces.

    Args:
        sent: Text to filter.
        pipeline: Optional spaCy ``Language`` object used for tokenization.
            Defaults to the notebook-level ``nlp``.

    Returns:
        str: The remaining token texts separated by single spaces.
    """
    language = nlp if pipeline is None else pipeline
    # `is_stop` is a lexical attribute available straight from the tokenizer,
    # so `make_doc` (tokenizer only) is enough. A full `nlp(sent)` call would
    # also run the tagger/parser/NER components for every row, with no effect
    # on the result.
    doc = language.make_doc(sent)
    return " ".join(token.text for token in doc if not token.is_stop)

In [128]:
removeStop("We jUst open our wings")

'open wings'

In [130]:
# Strip stop words from the text column of all three datasets.
for frame in (emotions_df, violence_df, hate_df):
    frame['text'] = frame['text'].apply(removeStop)

In [132]:
violence_df['text'].head()

0      Husband Beats Frequently , Wife Tells Court |  
1    Best thing , remain silent return work today ....
2    husband beat , Bambam denies rumour TeddyA bea...
3    like , wanna baby maker zero sexual autonomy ,...
4    England week , longest away . husband said soo...
Name: text, dtype: object

# 5. Tokenization and Padding