In [2]:
import pandas as pd
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

In [3]:
# Read the training tweets and drop the Tweet_ID column in the same step,
# then preview the first rows.
df = pd.read_csv('Train.csv').drop(columns=['Tweet_ID'])
df.head()

Unnamed: 0,tweet,type
0,Had a dream i got raped last night. By a guy i...,sexual_violence
1,he thought the word raped means sex and told m...,sexual_violence
2,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,sexual_violence
3,I was sexually abused for 3 years at age 4 to ...,sexual_violence
4,Chessy Prout can do better by telling the trut...,sexual_violence


In [4]:
# Number of tweets per class. The recorded output shows a heavy skew toward
# sexual_violence, with only a few hundred rows in the smallest classes.
df.type.value_counts()

sexual_violence                 32648
Physical_violence                5946
emotional_violence                651
economic_violence                 217
Harmful_Traditional_practice      188
Name: type, dtype: int64

In [5]:
# Label encoding: give every distinct value of the `type` column an integer
# id, numbered in the order in which unique() returns the values.
labels = df.type.unique()
label_dict = {label: index for index, label in enumerate(labels)}

label_dict

{'sexual_violence': 0,
 'Physical_violence': 1,
 'emotional_violence': 2,
 'Harmful_Traditional_practice': 3,
 'economic_violence': 4}

In [6]:
# Encode the string class in `type` as its integer id from label_dict.
#
# Series.map is a plain per-value lookup. The previous Series.replace call
# with a str -> int dict relied on pandas implicitly downcasting the object
# column to int, which recent pandas versions deprecate (FutureWarning); the
# column could then stay object dtype and break the stratified split / model
# fit that consume df.label.values.
#
# map() yields NaN for a value missing from the dict; label_dict is built from
# df.type.unique(), so every value present in the column has an entry.
df['label'] = df.type.map(label_dict)
df

Unnamed: 0,tweet,type,label
0,Had a dream i got raped last night. By a guy i...,sexual_violence,0
1,he thought the word raped means sex and told m...,sexual_violence,0
2,She NOT TALKING TO ME I WAS RAPED BY 2 MEN 1 M...,sexual_violence,0
3,I was sexually abused for 3 years at age 4 to ...,sexual_violence,0
4,Chessy Prout can do better by telling the trut...,sexual_violence,0
...,...,...,...
39645,"ENTRY 1299: 21F. 23M, BF’s cousin. Got drunk o...",sexual_violence,0
39646,So you’re telling me Emmanuel Macron was groom...,sexual_violence,0
39647,"My wife regularly beats me, I get dirty slaps ...",Physical_violence,1
39648,Me: Hey babe! Police officer boyfriend: is tha...,sexual_violence,0


In [17]:
"""
Train and Validation Split
Because the labels are imbalanced, we split the data set in a stratified fashion, using this as the class labels.
"""


from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.tweet.values, df.label.values, test_size=0.15, random_state=42, stratify=df.label.values)

In [16]:
# Tag every row with the split it belongs to and check the per-class counts.
#
# X_train / X_val hold tweet *text* (they come from df.tweet.values), so they
# are not row labels and cannot be passed to df.loc -- doing so raises a
# KeyError on a fresh top-to-bottom run. Instead, split the row index with the
# same test_size, random_state and stratify arguments as the tweet split; with
# identical arguments and row count this is expected to select the same rows.
train_idx, val_idx = train_test_split(
    df.index.values,
    test_size=0.15,
    random_state=42,
    stratify=df.label.values,
)

# A scalar is broadcast to every row, so no list of repeated values is needed.
df['data_type'] = 'not_set'
df.loc[train_idx, 'data_type'] = 'train'
df.loc[val_idx, 'data_type'] = 'val'

# Rows per (class, label id, split); no 'not_set' group should remain.
df.groupby(['type', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tweet
type,label,data_type,Unnamed: 3_level_1
Harmful_Traditional_practice,3,train,160
Harmful_Traditional_practice,3,val,28
Physical_violence,1,train,5054
Physical_violence,1,val,892
economic_violence,4,train,185
economic_violence,4,val,32
emotional_violence,2,train,553
emotional_violence,2,val,98
sexual_violence,0,train,27750
sexual_violence,0,val,4898


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [18]:
# Inspect the training inputs from the split; per the recorded output this is
# an array of raw tweet strings.
X_train

array(["The Keepers is a hard documentary to watch. It's about priests raping kids. Lots of people have come out about the abuse even Gabriel Bryne. The one priest I knew as a kid befriended me. Nothing happened but I later learned he had raped lots of boys. He eventually killed himself.",
       'Y’all dumb bitchs be like “he raped me” when ever someone tried to help you from drowning like next time u gonna drown',
       'CyberAnonymous: NigeriaNewsdesk: My husband beats me for drinking his cans of milk – Wife tells court …  ',
       ...,
       'Women can literally just say "he raped me" with minimal evidence and the man gets 30+ years in prison',
       'I agree. Corey was such a good man when he screamed at me for wearing pants &amp; when he screamed at me to do his drugs &amp; he was also a good man when he raped my friend! Bless his heart! &amp; bless all of you for enabling &amp; supporting a rapist! He couldn’t have done it without you all!',
       'I was in contact with SOS

In [20]:
# Bag-of-words step, run on its own to inspect the intermediate result.
vectorizer = CountVectorizer()

# Fit the vectorizer on the training tweets and convert them to count vectors.
x_train_counts = vectorizer.fit_transform(X_train)
# Display the resulting matrix summary (the recorded output shows a sparse
# 33702 x 34199 matrix).
x_train_counts

<33702x34199 sparse matrix of type '<class 'numpy.int64'>'
	with 1045823 stored elements in Compressed Sparse Row format>

In [21]:
# Apply a TF-IDF transform to the count matrix and show the resulting shape.
# NOTE(review): x_train_tfidf is not used again in the visible cells; the
# Pipeline fitted later builds its own vectorizer and TF-IDF transformer.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_train_tfidf.shape

(33702, 34199)

In [23]:
# End-to-end text classifier: token counts -> TF-IDF weights -> multinomial
# naive Bayes. It is fitted on the raw training tweets (X_train) with the
# integer-encoded labels (y_train) as targets.
clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(X_train, y_train)

In [26]:
# Predict a class id for every held-out validation tweet with the fitted pipeline.
predicted = clf.predict(X_val)

In [29]:
import numpy as np

# Distinct class ids the model emits on the validation set. The recorded output
# lists only 0, 1 and 2, i.e. ids 3 and 4 (Harmful_Traditional_practice and
# economic_violence in label_dict) are never predicted.
np.unique(predicted)


array([0, 1, 2])

In [31]:
from sklearn.metrics import accuracy_score

# Fraction of validation tweets whose predicted class id equals the true one.
# NOTE(review): the largest class alone is roughly 82% of the data (32648 of
# 39650 rows), so accuracy by itself says little about the minority classes.
accuracy_score(y_true=y_val, y_pred=predicted)

0.8918964357767317

In [33]:
# Load the sample submission file. Presumably it shows the expected submission
# layout -- TODO confirm; it is not used in the visible cells.
sample_submission = pd.read_csv("SampleSubmission.csv")

In [35]:
# Read the test set and predict a class id for each of its tweets with the
# fitted pipeline.
test = pd.read_csv("Test.csv")
test_predict = clf.predict(test['tweet'])

In [None]:
# Distinct class ids that occur among the test-set predictions.
np.unique(test_predict)