## Libraries 

In [110]:
import pandas as pd 

## Data set wassem_css.csv

In [111]:
# Load the expert/amateur-annotated tweets (tab-separated file).
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where passing
# it raises a TypeError. `True` was its default (raise on malformed rows), so
# omitting it keeps the same strict parsing on every pandas version.
wassem_1 = pd.read_csv('Data/wassem_css.csv', sep='\t')

In [112]:
# Shape of the raw frame as (rows, columns)
wassem_1.shape

(6667, 6)

In [113]:
# Preview the first two rows to see which columns are available
wassem_1.head(2)

Unnamed: 0,Tweet_ID,Previous,User_ID,Text,Expert_Annotation,Amateur_Annotations
0,570775709343461377,5.707751e+17,24414732,@TVBachelor @NewJerzeyBoy what about the femin...,both,"[neither,sexism,sexism]"
1,596730741490253824,5.9673e+17,13857342,@ThelmaSleaze uh... did you watch the video? o...,both,"[neither,racism,sexism]"


In [114]:
# Keep only the tweet text and the expert label; the ID columns and the
# amateur annotations are dropped from here on.
wassem_1 = wassem_1.loc[:, ['Text', 'Expert_Annotation']]

In [115]:
# Class distribution of the expert annotations (the labels used below)
wassem_1['Expert_Annotation'].value_counts()

neither    5729
sexism      815
racism       88
both         35
Name: Expert_Annotation, dtype: int64

In [116]:
# Build one subset per task by removing the *other* task's class:
# each subset keeps every row whose label is not the excluded one.
expert_label = wassem_1['Expert_Annotation']
racism_data_1 = wassem_1[expert_label.ne('sexism')]
sexism_data_1 = wassem_1[expert_label.ne('racism')]

## Data set wassem_hovy_naacl.csv

In [117]:
# Load the second tweet corpus (tab-separated file)
wassem_2 = pd.read_csv('Data/wassem_hovy_naacl.csv', sep='\t')

In [118]:
# Shape of the raw frame as (rows, columns)
wassem_2.shape

(16202, 5)

In [119]:
# Preview the first two rows to see which columns are available
wassem_2.head(2)

Unnamed: 0,Tweet_ID,Previous,User_ID,Text,Label
0,567080871472017408,5.670786e+17,2941145694,@truaemusic The followers of the religion give...,racism
1,566098518616449024,5.660874e+17,2941145694,@Love___Egypt @washingtonpost All the land tha...,racism


In [120]:
# Keep only the tweet text and its label; the ID columns are dropped from here on.
wassem_2 = wassem_2.loc[:, ['Text', 'Label']]

In [121]:
# Build one subset per task by removing the *other* task's class:
# each subset keeps every row whose label is not the excluded one.
naacl_label = wassem_2['Label']
racism_data_2 = wassem_2[naacl_label.ne('sexism')]
sexism_data_2 = wassem_2[naacl_label.ne('racism')]

## Racism Dataset 

In [122]:
# Class counts in racism_data_1 (no 'sexism' rows should remain)
racism_data_1['Expert_Annotation'].value_counts()

neither    5729
racism       88
both         35
Name: Expert_Annotation, dtype: int64

In [123]:
# Class counts in racism_data_2 (no 'sexism' rows should remain)
racism_data_2['Label'].value_counts()

none      11115
racism     1939
Name: Label, dtype: int64

In [124]:
# Put both racism subsets on the same two-column schema before they are combined
racism_data_1.columns = racism_data_2.columns = ["Tweets", "Labels"]

In [125]:
# Stack the two racism subsets row-wise (the default axis) under a fresh 0..n-1 index
racism = pd.concat([racism_data_1, racism_data_2], ignore_index=True)

In [126]:
# Label counts after combining; the two sources still use different label names
racism['Labels'].value_counts()

none       11115
neither     5729
racism      2027
both          35
Name: Labels, dtype: int64

In [127]:
# Encoding (racist tweet = '1', not racist = '0'); labels stay strings.
# 'none' and 'neither' are the two sources' names for the negative class, and
# 'both' (racism + sexism) counts as racist here.
# The mapping is applied in one pass and assigned back to the column. Calling
# replace(..., inplace=True) on racism['Labels'] is chained in-place mutation:
# it warns on recent pandas and leaves the frame unchanged under Copy-on-Write.
racism['Labels'] = racism['Labels'].replace(
    {'none': '0', 'neither': '0', 'racism': '1', 'both': '1'}
)

In [128]:
# Verification: only the encoded classes '0' and '1' should remain
racism['Labels'].value_counts()

0    16844
1     2062
Name: Labels, dtype: int64

In [129]:
# Export the racism dataset.
# NOTE(review): index_label=False only omits the header field for the index column;
# the row index itself is still written as the first column — confirm this is intended.
racism.to_csv('Data/racism.csv', index_label=False)

## Sexism data set 

In [130]:
# Class counts in sexism_data_1 (no 'racism' rows should remain)
sexism_data_1['Expert_Annotation'].value_counts()

neither    5729
sexism      815
both         35
Name: Expert_Annotation, dtype: int64

In [131]:
# Class counts in sexism_data_2 (no 'racism' rows should remain)
sexism_data_2['Label'].value_counts()

none      11115
sexism     3148
Name: Label, dtype: int64

In [132]:
# Put both sexism subsets on the same two-column schema before they are combined
sexism_data_1.columns = sexism_data_2.columns = ["Tweets", "Labels"]

In [133]:
# Stack the two sexism subsets row-wise (the default axis) under a fresh 0..n-1 index
sexism = pd.concat([sexism_data_1, sexism_data_2], ignore_index=True)

In [134]:
# Label counts after combining; the two sources still use different label names
sexism['Labels'].value_counts()

none       11115
neither     5729
sexism      3963
both          35
Name: Labels, dtype: int64

In [135]:
# Encoding (sexist tweet = '1', not sexist = '0'); labels stay strings.
# 'none' and 'neither' are the two sources' names for the negative class, and
# 'both' (racism + sexism) counts as sexist here.
# The mapping is applied in one pass and assigned back to the column. Calling
# replace(..., inplace=True) on sexism['Labels'] is chained in-place mutation:
# it warns on recent pandas and leaves the frame unchanged under Copy-on-Write.
sexism['Labels'] = sexism['Labels'].replace(
    {'none': '0', 'neither': '0', 'sexism': '1', 'both': '1'}
)

In [136]:
# Verification: only the encoded classes '0' and '1' should remain
sexism['Labels'].value_counts()

0    16844
1     3998
Name: Labels, dtype: int64

In [137]:
# Export the sexism dataset.
# NOTE(review): index_label=False only omits the header field for the index column;
# the row index itself is still written as the first column — confirm this is intended.
sexism.to_csv('Data/sexism.csv', index_label=False)

## Discrimination data set (Racism and Sexism) 

In [138]:
# Recap: class counts of the racism dataset before merging
racism['Labels'].value_counts()

0    16844
1     2062
Name: Labels, dtype: int64

In [139]:
# Recap: class counts of the sexism dataset before merging
sexism['Labels'].value_counts()

0    16844
1     3998
Name: Labels, dtype: int64

In [140]:
# Re-code the sexist class from '1' to '2' so it stays distinct from the racism
# class ('1') in the combined dataset.
# A single .loc assignment writes into `sexism` itself. The chained form
# sexism['Labels'][mask] = '2' assigns through an intermediate Series, which
# raises SettingWithCopyWarning and does not update the frame under Copy-on-Write.
sexism.loc[sexism['Labels'] == '1', 'Labels'] = '2'

In [141]:
# Verification: the positive class should now be '2' instead of '1'
sexism['Labels'].value_counts()

0    16844
2     3998
Name: Labels, dtype: int64

In [143]:
# Build the 3-class dataset: every racism row (classes '0' and '1') plus only
# the class-'2' rows of the sexism frame.
# NOTE(review): tweets annotated 'both' were encoded as positive in each frame,
# so they appear to enter here twice, once as '1' and once as '2' — confirm
# this duplication is intended.
sexist_rows = sexism[sexism['Labels'] == '2']
discrimination = pd.concat([racism, sexist_rows], axis=0, ignore_index=True)
discrimination.columns = ['Tweets', 'MultiClass_Labels']

print("discrimination dataset shape : ", discrimination.shape)
discrimination['MultiClass_Labels'].value_counts()

discrimination dataset shape :  (22904, 2)


0    16844
2     3998
1     2062
Name: MultiClass_Labels, dtype: int64

In [144]:
# Preview the first rows of the combined multi-class dataset
discrimination.head()

Unnamed: 0,Tweets,MultiClass_Labels
0,@TVBachelor @NewJerzeyBoy what about the femin...,1
1,@ThelmaSleaze uh... did you watch the video? o...,1
2,Managed to hit a bird and a small rodent on my...,1
3,"If you believe a #holohoax or #gays, LGBT &amp...",1
4,"Rid yourself of #Feminazi, #Fag &amp; #Jewf. E...",1


In [145]:
# Derive *binary* labels from the multi-class ones: '0' stays '0', and any other
# class (racism '1' or sexism '2') becomes '1'.
# A vectorised comparison plus map replaces the row-by-row Python loop. The result
# is a Series that keeps the frame's index, so it aligns when assigned as a column.
is_discriminatory = discrimination['MultiClass_Labels'] != '0'
Labels = is_discriminatory.map({True: '1', False: '0'}).rename('Labels')
Labels.value_counts()

0    16844
1     6060
dtype: int64

In [148]:
# Attach the binary labels as a new 'Labels' column, then show its class counts
discrimination['Labels'] = Labels
discrimination['Labels'].value_counts()

0    16844
1     6060
Name: Labels, dtype: int64

In [150]:
# Preview the final frame: tweet text, multi-class label and binary label
discrimination.head()

Unnamed: 0,Tweets,MultiClass_Labels,Labels
0,@TVBachelor @NewJerzeyBoy what about the femin...,1,1
1,@ThelmaSleaze uh... did you watch the video? o...,1,1
2,Managed to hit a bird and a small rodent on my...,1,1
3,"If you believe a #holohoax or #gays, LGBT &amp...",1,1
4,"Rid yourself of #Feminazi, #Fag &amp; #Jewf. E...",1,1


In [151]:
# Export the combined dataset (multi-class and binary labels).
# NOTE(review): index_label=False only omits the header field for the index column;
# the row index itself is still written as the first column — confirm this is intended.
discrimination.to_csv('Data/discrimination.csv', index_label=False)