In [76]:
import pandas as pd

# 1. Loading Data

In [77]:
# Load the emotions dataset (a `text` column plus a numeric `label` column)
# and preview its first rows.
emotions_csv = "./datasets/emotions/text.csv"
emotions_df = pd.read_csv(emotions_csv)
emotions_df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [78]:
# Load the gender-violence tweets (`Tweet_ID`, `tweet`, `type`) and preview
# the first rows.
violence_csv = "datasets/gender_violence/train.csv"
violence_df = pd.read_csv(violence_csv)
violence_df.head()

Unnamed: 0,Tweet_ID,tweet,type
0,ID_0022DWKP,Had a dream i got raped last night. By a guy i...,sexual_violence
1,ID_00395QYM,he thought the word raped means sex and told m...,sexual_violence
2,ID_003EOSSF,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,sexual_violence
3,ID_004BBHOD,I was sexually abused for 3 years at age 4 to ...,sexual_violence
4,ID_004F7516,Chessy Prout can do better by telling the trut...,sexual_violence


In [79]:
# Load the labelled hate-speech tweets and preview the first rows.
hate_csv = "datasets/Hate_speech/labeled_data.csv"
hate_df = pd.read_csv(hate_csv)
hate_df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# 2. Data Preprocessing

In [80]:
# Keep only the text column and the target column of each frame.
#
# Reassigning (instead of `inplace=True`) together with `errors='ignore'`
# makes this cell safe to re-run: the in-place version raised KeyError on a
# second execution because the columns had already been removed.
emotions_df = emotions_df.drop(columns=['Unnamed: 0'], errors='ignore')
violence_df = violence_df.drop(columns=['Tweet_ID'], errors='ignore')
hate_df = hate_df.drop(
    columns=['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither'],
    errors='ignore',
)

In [81]:
# Verify the drop: only `text` and `label` should remain.
emotions_df.head(n=5)

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [82]:
# Verify the drop: only `tweet` and `type` should remain.
violence_df.head(n=5)

Unnamed: 0,tweet,type
0,Had a dream i got raped last night. By a guy i...,sexual_violence
1,he thought the word raped means sex and told m...,sexual_violence
2,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,sexual_violence
3,I was sexually abused for 3 years at age 4 to ...,sexual_violence
4,Chessy Prout can do better by telling the trut...,sexual_violence


In [83]:
# Verify the drop: only `class` and `tweet` should remain.
hate_df.head(n=5)

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


## Renaming Columns

In [None]:
# Give the violence and hate-speech frames the same `text` / `label` column
# names that the emotions frame already uses.
violence_renames = {'tweet': 'text', 'type': 'label'}
hate_renames = {'tweet': 'text', 'class': 'label'}

violence_df.rename(columns=violence_renames, inplace=True)
hate_df.rename(columns=hate_renames, inplace=True)

In [62]:
# Column names of the three frames, in the order hate / violence / emotions.
tuple(frame.columns for frame in (hate_df, violence_df, emotions_df))

(Index(['label', 'text'], dtype='object'),
 Index(['text', 'label'], dtype='object'),
 Index(['text', 'label'], dtype='object'))

## Checking Null Values

In [63]:
# Missing values per column in the emotions frame.
emotions_df.isnull().sum()

text     0
label    0
dtype: int64

In [64]:
# Missing values per column in the violence frame.
violence_df.isnull().sum()

text     0
label    0
dtype: int64

In [65]:
# Missing values per column in the hate-speech frame.
hate_df.isnull().sum()

label    0
text     0
dtype: int64

In [66]:
# (rows, columns) of each frame, in the order emotions / violence / hate.
tuple(frame.shape for frame in (emotions_df, violence_df, hate_df))

((416809, 2), (39650, 2), (24783, 2))

In [67]:
# Number of rows per emotion label, to see how imbalanced the classes are.
emotions_df.groupby(by='label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,121187
1,141067
2,34554
3,57317
4,47712
5,14972


In [68]:
# Peek at a random subset of the label-0 rows. The fixed seed keeps the
# displayed rows stable across runs, matching the `random_state=42` used by
# the other sampling cells in this notebook.
# NOTE: this needs at least 12000 label-0 rows, so it must run before
# `emotions_df` is replaced by the balanced 2000-per-label sample.
emotions_df[emotions_df.label == 0].sample(n=12000, random_state=42)

Unnamed: 0,text,label
321877,i seriously feel like im becoming more and mor...,0
312006,i look and feel horrible,0
364443,i want this feeling of being hopeless to be re...,0
158052,i want to be someone im not something im not t...,0
224020,i must have slept for a while to be feeling as...,0
...,...,...
37714,i think it would feel less tragic,0
152254,i feel that im doomed,0
312257,i just feel terrible about this confusion,0
116113,i feel assaulted by him and everything he repr...,0


In [69]:
# Build a class-balanced emotions sample: the same number of rows per label.
#
# Each label is sampled independently with the same seed, so the selected
# rows are reproducible. The per-label frames are collected in a list and
# concatenated once; growing a DataFrame with `pd.concat` inside the loop
# (starting from an empty frame) re-copies the data on every iteration and
# relies on concatenation with an empty frame.
SAMPLES_PER_EMOTION = 2000

per_label_samples = [
    emotions_df[emotions_df.label == label].sample(
        n=SAMPLES_PER_EMOTION, random_state=42
    )
    # Labels are taken from the data (ascending) rather than a hard-coded range.
    for label in sorted(emotions_df.label.unique())
]
e_df = pd.concat(per_label_samples)

In [70]:
# Confirm the balanced sample holds the same number of rows for every label.
e_df.groupby(by='label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,2000
1,2000
2,2000
3,2000
4,2000
5,2000


In [71]:
# From here on `emotions_df` refers to an independent copy of the balanced
# sample instead of the full dataset.
emotions_df = e_df.copy(deep=True)
emotions_df.shape

(12000, 2)

In [72]:
# Number of tweets per violence type, to see how imbalanced the classes are.
violence_df.groupby(by='label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
Harmful_Traditional_practice,188
Physical_violence,5946
economic_violence,217
emotional_violence,651
sexual_violence,32648


In [73]:
# How many `sexual_violence` rows can be kept if the frame is to total 12000
# rows while every other class is kept in full.
non_sexual_rows = violence_df[violence_df.label != 'sexual_violence']
12000 - len(non_sexual_rows)

4998

In [74]:
# Downsample only the dominant `sexual_violence` class so the violence frame
# ends up with TARGET_VIOLENCE_ROWS rows; every other class is kept in full.
TARGET_VIOLENCE_ROWS = 12000

is_sexual = violence_df.label == 'sexual_violence'
sample_v_df = violence_df[~is_sexual]

# Derive the sample size from the data instead of hard-coding 4998, a number
# copied by hand from another cell's output that silently goes wrong if the
# dataset changes.
n_sexual = TARGET_VIOLENCE_ROWS - len(sample_v_df)
sexual_v = violence_df[is_sexual].sample(n=n_sexual, random_state=42)

v_df = pd.concat([sample_v_df, sexual_v])
# Sanity check: this cell must yield exactly TARGET_VIOLENCE_ROWS rows. Any
# saved output showing a larger shape comes from an earlier version of the code.
assert len(v_df) == TARGET_VIOLENCE_ROWS, (
    f"expected {TARGET_VIOLENCE_ROWS} rows, got {len(v_df)}"
)

violence_df = v_df.copy()
violence_df.shape

(44648, 2)

In [75]:
# Per-class row counts after downsampling `sexual_violence`.
violence_df.groupby(by='label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
Harmful_Traditional_practice,188
Physical_violence,5946
economic_violence,217
emotional_violence,651
sexual_violence,37646
