## Importing the data from the drive

In [None]:
! pip install gdown

In [None]:
# gdown is installed by the previous cell; it is only needed for this download.
import gdown 
# Direct-download link to the competition archive hosted on Google Drive.
url = 'https://drive.google.com/uc?export=download&id=1mykSFmHt-DXpobTQPlmweF8IqVwFtA4v' 
# Local file name for the archive; the next cell unzips GBV.zip.
output = 'GBV.zip'
gdown.download(url, output)

In [None]:
! unzip GBV.zip

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import transformers
import torch 
import torch.nn as nn
import warnings

import io

import tensorflow as tf

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, TruncatedSVD

warnings.filterwarnings("ignore")

%matplotlib inline
# Default figure size applied to every seaborn/matplotlib plot below.
sns.set(rc={'figure.figsize':(11.7,8.27)})

# Seed the Python, NumPy and PyTorch random generators for repeatable runs.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)


### Importing the Data


In [None]:
# Raw training file (read into `df` below) and the unlabeled test file.
data_file = 'synoym_30000.csv'
test_data_file = "test_data.csv"

In [None]:
df = pd.read_csv(data_file)

In [None]:
df.head()

In [None]:
test_df = pd.read_csv(test_data_file)
test_df.head()

### Removing duplicates

In [None]:
# Collect the distinct tweets of each class (labels 0..4), class by class,
# so that `unique_df` holds one row per (tweet, type) pair.
tweets, types = [], []
for label in range(5):
    class_tweets = df.loc[df["type"] == label, "tweet"].unique()
    tweets.extend(class_tweets)
    types.extend([label] * len(class_tweets))
unique_df = pd.DataFrame({"tweet": tweets, "type": types})
unique_df.head()

In [None]:
# Report the number of distinct tweets per class.
# DataFrame.nunique() returns one count per column (a Series), which would be
# dumped into the message; counting on the "tweet" column yields a single int.
for i in range(5) :
    n_distinct = unique_df.loc[unique_df['type']==i, 'tweet'].nunique()
    print(f"We have {n_distinct} distinc samples for class {i}")

In [None]:
# Row count of each class after de-duplication.
for label in range(5):
    n_rows = len(unique_df.loc[unique_df['type'] == label])
    print(f"We have {n_rows}  samples for class {label}")

In [None]:
# Persist the de-duplicated data; every later section re-reads Train_data.csv.
unique_df.to_csv("Train_data.csv", index=False)

In [None]:
Train_data_file = "Train_data.csv"

In [None]:
df = pd.read_csv(Train_data_file)

In [None]:
# Quick size summary of the training set, the test set and the label space.
n_train_rows = len(df)
n_test_rows = len(test_df)
n_classes = df['type'].nunique()
print(f"We have {n_train_rows} Training samples")
print(f"We have {n_test_rows} Test samples")
print(f"We have {n_classes} class")

In [None]:
# Distinct label values, in order of first appearance in the frame.
classes = df['type'].unique()
for label in classes:
    print(label)

#### Class distribution

In [None]:
# Pass the column by keyword: the first positional parameter of countplot is
# `data` in current seaborn, so a bare Series is not interpreted as `x`.
sns.countplot(x="type", data=df)

In [None]:
# Per-tweet length features used by the violin plots below:
# number of characters and number of whitespace-separated tokens.
df["char_count"] = df["tweet"].map(len)
df["word_count"] = df["tweet"].map(lambda tweet: len(tweet.split()))

In [None]:
sns.violinplot(x="type", y="word_count", data=df)

In [None]:
sns.violinplot(x="type", y="char_count", data=df)

In [None]:
# Position in this list is the integer class id; the submission cells use it
# to turn predicted ids back into class names.
idtoclass = [
    'sexual_violence',
    'Physical_violence',
    'emotional_violence',
    'Harmful_Traditional_practice',
    'economic_violence',
]
# Reverse lookup: class name -> integer id.
classtoid = {name: index for index, name in enumerate(idtoclass)}
print(idtoclass)
print(classtoid)

In [None]:
df.head()

## Model

In [None]:
data = pd.read_csv(Train_data_file)

In [None]:
import nltk
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


nltk.download("punkt")


In [None]:
# Reload the de-duplicated data and hold out 10% for validation,
# stratified on the label so both splits keep the class proportions.
df = pd.read_csv(Train_data_file)

all_tweets = df["tweet"].values
all_labels = df["type"].values
X_train, X_val, y_train, y_val = train_test_split(
    all_tweets,
    all_labels,
    train_size=0.9,
    stratify=all_labels,
    random_state=0,
)
for split in (X_train, X_val, y_train, y_val):
    print(split.shape)


In [None]:
sns.histplot(y_val)


In [None]:
# Bag-of-words counts. NLTK's word_tokenize is the tokenizer, so the regex
# token_pattern is explicitly disabled. Fitted on the training split only.
cvt = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
Xtrain_cvt = cvt.fit_transform(X_train)
Xval_cvt = cvt.transform(X_val)

# Same tokenisation with TF-IDF weighting instead of raw counts.
ifidf = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
Xtrain_tfidf = ifidf.fit_transform(X_train)
Xval_tfidf = ifidf.transform(X_val)

In [None]:
# Dimensionality reduction of the count features with truncated SVD.
# The requested size is capped below the vocabulary size so the fit cannot be
# asked for more components than there are features, and the solver is seeded
# so the projection is repeatable.
# NOTE(review): the reduced matrices are not used by any later cell shown in
# this notebook, and this fit is expensive.
n_components = min(10000, Xtrain_cvt.shape[1] - 1)
PCA_cvt = TruncatedSVD(n_components, random_state=0)
PCA_cvt.fit(Xtrain_cvt)
Xtrain_cvt_reduced = PCA_cvt.transform(Xtrain_cvt)
Xval_cvt_reduced = PCA_cvt.transform(Xval_cvt)

### Logistic regression with count vectorizer

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: multinomial logistic regression on raw token counts.
lr = LogisticRegression(max_iter=200, random_state=42).fit(Xtrain_cvt, y_train)

# Score both splits to see how large the train/validation gap is.
train_preds = lr.predict(Xtrain_cvt)
preds = lr.predict(Xval_cvt)

train_acc = metrics.accuracy_score(y_train, train_preds)
accuracy = metrics.accuracy_score(y_val, preds)

print(f"Train Accuracy : {train_acc} - Validation Accuracy : {accuracy}")

train_confusion = metrics.confusion_matrix(y_train, train_preds)
val_confusion = metrics.confusion_matrix(y_val, preds)

print("Train Confusion matrix")
print(train_confusion)

print("Validation Confusion matrix")
print(val_confusion)


In [None]:
# Predict the test tweets with the count-vectorizer logistic regression and
# write them in the sample-submission layout, with ids mapped to class names.
test_df = pd.read_csv("test_data.csv")
Xtest = cvt.transform(test_df.tweet)
preds = lr.predict(Xtest)

submission = pd.read_csv("SampleSubmission.csv")
submission["type"] = [idtoclass[int(class_id)] for class_id in preds]
print(submission.head())
submission.to_csv("submission.csv", index=False)

### Training multiple logistic regression models and averaging their predicted probabilities

This did not work as well as a single logistic regression.

In [None]:
from sklearn.linear_model import LogisticRegression

# Soft-voting ensemble: several liblinear logistic regressions that differ only
# in their seed, combined by averaging their class-probability outputs.
nb_models = 3

lrs = [
    LogisticRegression(max_iter=100, random_state=seed, solver="liblinear")
    for seed in range(nb_models)
]

# The loop variable is deliberately not called `lr`, so the single model fitted
# in the previous section is not silently replaced by the last ensemble member.
for member in lrs:
    member.fit(Xtrain_cvt, y_train)

# Mean probability over the members, shape (n_samples, n_classes).
preds_l = np.mean([member.predict_proba(Xval_cvt) for member in lrs], axis=0)

# Columns of predict_proba follow classes_, so map the winning column back to
# its label instead of assuming column index == label.
preds = lrs[0].classes_[np.argmax(preds_l, axis=1)]
accuracy = metrics.accuracy_score(y_val, preds)

print(f"Accuracy : {accuracy}")

print(metrics.confusion_matrix(y_val, preds))


In [None]:
# Test-set predictions of the logistic regression ensemble.
test_df = pd.read_csv("test_data.csv")
# The ensemble was fitted on count features (Xtrain_cvt), so the test tweets
# must go through the same count vectorizer, not the TF-IDF one.
Xtest = cvt.transform(test_df.tweet)

# Average the members' class probabilities, then take the most likely class.
preds_l = np.mean([member.predict_proba(Xtest) for member in lrs], axis=0)
preds = np.argmax(preds_l, axis=1)

submission = pd.read_csv("SampleSubmission.csv")
submission["type"] = preds
submission["type"] = submission["type"].apply(lambda x: idtoclass[int(x)])
print(submission.head())
submission.to_csv("submission.csv", index=False)

### Logistic regression with TF-IDF
Worse than logistic regression with the count vectorizer (CVT).

In [None]:
from sklearn.linear_model import LogisticRegression

# Same classifier as the count-vectorizer baseline, trained on TF-IDF features.
lr = LogisticRegression(max_iter=200, random_state=42).fit(Xtrain_tfidf, y_train)

preds = lr.predict(Xval_tfidf)
accuracy = metrics.accuracy_score(y_val, preds)

print(f"Accuracy : {accuracy}")


In [None]:
# Test-set predictions of the TF-IDF logistic regression, written in the
# sample-submission layout with class names instead of ids.
test_df = pd.read_csv("test_data.csv")
preds = lr.predict(ifidf.transform(test_df.tweet))
Xtest = ifidf.transform(test_df.tweet)

submission = pd.read_csv("SampleSubmission.csv")
submission["type"] = [idtoclass[int(class_id)] for class_id in preds]
print(submission.head())
submission.to_csv("submission.csv", index=False)

### Random forest with count vectorizer

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 250 depth-limited trees on the raw count features.
rf = RandomForestClassifier(n_estimators=250, max_depth=30)
rf.fit(Xtrain_cvt, y_train)

preds = rf.predict(Xval_cvt)
accuracy = metrics.accuracy_score(y_val, preds)
val_confusion = metrics.confusion_matrix(y_val, preds)

print(f"Accuracy : {accuracy}")

print(val_confusion)


In [None]:
# Test-set predictions of the random forest.
test_df = pd.read_csv("test_data.csv")
# The forest was fitted on count features (Xtrain_cvt); feeding it TF-IDF
# weights would put the test data on a different scale than the training data.
Xtest = cvt.transform(test_df.tweet)
preds = rf.predict(Xtest)

submission = pd.read_csv("SampleSubmission.csv")
submission["type"] = preds
submission["type"] = submission["type"].apply(lambda x: idtoclass[int(x)])
print(submission.head())
submission.to_csv("submission.csv", index=False)

## Deep learning approach

In [None]:
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
! unzip crawl-300d-2M.vec.zip

In [None]:
def load_embeddings(word_index, embedding_file, vector_length=300):
  """Build an embedding matrix for `word_index` from a text vector file.

  Each line of `embedding_file` is expected to hold a token followed by its
  space-separated coefficients. Lines of 100 characters or fewer (such as a
  short header line) are skipped.

  Args:
    word_index: mapping word -> integer row index.
    embedding_file: path to the text vector file.
    vector_length: number of coefficients per vector.

  Returns:
    A float array of shape (len(word_index) + 1, vector_length). Row `i` holds
    the vector of the word whose index is `i`, looked up as-is, then
    capitalized, then upper-cased. Rows with no match of the right length stay
    all-zero.
  """
  max_features = len(word_index) + 1

  # Every surface form the lookup below can ask for. The upper-case form has
  # to be collected here too, otherwise that fallback could never match.
  wanted_tokens = set()
  for word in word_index:
    wanted_tokens.add(word)
    wanted_tokens.add(str(word).capitalize())
    wanted_tokens.add(str(word).upper())

  # Stream the file once, keep only the wanted vectors, and let the context
  # manager close the handle.
  embeddings_index = {}
  with open(embedding_file, encoding="utf-8") as vector_file:
    for line in vector_file:
      if len(line) <= 100:
        continue
      if line.split(" ", 1)[0] not in wanted_tokens:
        continue
      token, *coefficients = line.strip().split(" ")
      embeddings_index[token] = np.asarray(coefficients, dtype='float32')

  embedding_matrix = np.zeros((max_features, vector_length))

  for word, i in word_index.items():
    if i >= max_features:
      continue
    # Try the exact form first, then the capitalized and upper-case variants.
    embedding_vector = None
    for candidate in (word, str(word).capitalize(), str(word).upper()):
      embedding_vector = embeddings_index.get(candidate)
      if embedding_vector is not None:
        break
    if (embedding_vector is not None
      and len(embedding_vector) == vector_length):
      embedding_matrix[i] = embedding_vector
  return embedding_matrix


In [None]:
class GBVDataset:
  """Map-style dataset over padded token-id rows.

  `tweets` is indexed as `tweets[item, :]`, so it must be a 2-D array.
  `targets` is only stored, and only returned, when `kind` is "train".
  """

  def __init__(self, tweets, targets = None, kind = "train"):
    self.kind = kind
    self.tweets = tweets
    if kind == "train":
      self.target = targets

  def __len__(self):
    return len(self.tweets)

  def __getitem__(self, item):
    # Token ids are always returned; the label is added for training data only.
    sample = {
        "tweet" : torch.tensor(self.tweets[item, :], dtype = torch.long),
    }
    if self.kind == "train":
      sample["target"] = torch.tensor(self.target[item], dtype=torch.float)
    return sample


In [None]:
class MEAN(nn.Module):
  """Mean-pooled word embeddings followed by a linear 5-class head.

  Args:
    embedding_matrix: 2-D array of shape (num_words, embed_dim) holding the
      pretrained vectors; row i is the vector of token id i.
  """

  def __init__(self, embedding_matrix):
    super(MEAN, self).__init__()
    embed_dim = embedding_matrix.shape[1]

    # Copy the pretrained vectors into the layer and freeze them. Without this
    # the layer would keep its random initial weights, frozen.
    self.embedding = nn.Embedding.from_pretrained(
        torch.tensor(embedding_matrix, dtype=torch.float32),
        freeze=True
    )

    # Head width follows the embedding size instead of assuming 300.
    self.out = nn.Linear(embed_dim, 5)

  def forward(self, x):
    """Return logits of shape (batch, 5) for token ids `x` of shape (batch, seq)."""
    x = self.embedding(x)

    # Average over the sequence dimension.
    out = torch.mean(x, dim=1)

    out = self.out(out)

    return out

In [None]:
class LSTM(nn.Module):
  """Bidirectional LSTM over frozen pretrained embeddings, 5-class head.

  The LSTM outputs are mean-pooled and max-pooled over the sequence, and the
  two pooled vectors are concatenated before the linear layer.

  Args:
    embedding_matrix: 2-D array of shape (num_words, embed_dim) holding the
      pretrained vectors; row i is the vector of token id i.
  """

  def __init__(self, embedding_matrix):
    super(LSTM, self).__init__()
    embed_dim = embedding_matrix.shape[1]
    hidden_size = 128

    # Copy the pretrained vectors into the layer and freeze them. Without this
    # the layer would keep its random initial weights, frozen.
    self.embedding = nn.Embedding.from_pretrained(
        torch.tensor(embedding_matrix, dtype=torch.float32),
        freeze=True
    )

    self.lstm = nn.LSTM(
        embed_dim,
        hidden_size,
        bidirectional=True,
        batch_first = True
    )

    # 2 directions x hidden_size features, once mean-pooled and once max-pooled.
    self.out = nn.Linear(4 * hidden_size, 5)

  def forward(self, x):
    """Return logits of shape (batch, 5) for token ids `x` of shape (batch, seq)."""
    x = self.embedding(x)

    x, _ = self.lstm(x)

    avg_pool = torch.mean(x, 1)
    max_pool, _ = torch.max(x, 1)

    out = torch.cat((avg_pool, max_pool), 1)

    out = self.out(out)

    return out

In [None]:
def train(data_loader, model , optimizer, device):
    """Run one training epoch and return the mean cross-entropy per sample.

    Each batch is a dict with a "tweet" tensor of token ids and a "target"
    tensor of class ids. The returned loss is the batch losses weighted by
    batch size, divided by the number of samples seen.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()

    weighted_loss_sum = 0.0
    samples_seen = 0
    for batch in data_loader:
        tweets = batch["tweet"].to(device, dtype=torch.long)
        targets = batch["target"].to(device, dtype=torch.float)
        batch_size = tweets.shape[0]

        optimizer.zero_grad()

        # CrossEntropyLoss needs integer class ids, hence the cast back to long.
        loss = criterion(model(tweets), targets.long())
        weighted_loss_sum += loss.item() * batch_size
        samples_seen += batch_size

        loss.backward()
        optimizer.step()
    return weighted_loss_sum / samples_seen

def evaluate(data_loader, model , device):
    """Run the model over `data_loader` without gradients.

    Returns a pair of Python lists: the raw model outputs (one list of scores
    per sample) and the matching targets, both in loader order.
    """
    model.eval()

    all_outputs, all_targets = [], []
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch["tweet"].to(device, dtype=torch.long)
            batch_targets = batch["target"]

            batch_outputs = model(token_ids)

            all_outputs += batch_outputs.cpu().numpy().tolist()
            all_targets += batch_targets.cpu().numpy().tolist()
    return all_outputs, all_targets


In [None]:
# ---- Hyper-parameters ------------------------------------------------------
MAX_LEN = 128            # every tweet is padded/truncated to this many tokens
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 10

# ---- Data ------------------------------------------------------------------
df = pd.read_csv(Train_data_file)

# 70/30 stratified split; the 30% part (xtest/ytest) serves as validation here.
xtrain , xtest, ytrain, ytest = train_test_split(
    df.tweet.values,
    df.type.values,
    test_size=0.3,
    random_state=42,
    stratify = df.type.values
)

# NOTE(review): the vocabulary is fitted on all rows of df, i.e. including the
# validation tweets.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df.tweet.values.tolist())

xtrain = tokenizer.texts_to_sequences(xtrain)

xtest = tokenizer.texts_to_sequences(xtest)

xtrain = tf.keras.preprocessing.sequence.pad_sequences(
    xtrain, maxlen=MAX_LEN
)

xtest = tf.keras.preprocessing.sequence.pad_sequences(
    xtest, maxlen=MAX_LEN
)

train_dataset = GBVDataset(
    tweets = xtrain,
    targets = ytrain
)

# Reshuffle the training samples every epoch.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size = TRAIN_BATCH_SIZE,
    shuffle = True,
    num_workers = 2
)

valid_dataset = GBVDataset(
    tweets = xtest,
    targets = ytest
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size = VALID_BATCH_SIZE,
    num_workers = 1
)

# ---- Model -----------------------------------------------------------------
print("Loading embeddings")

embedding_matrix = load_embeddings(tokenizer.word_index,"crawl-300d-2M.vec")

# Use the GPU when there is one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTM(embedding_matrix)

model.to(device)

optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)
# Divide the learning rate by 10 every 3 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3,gamma=0.1)

# ---- Training loop ---------------------------------------------------------
print("Training Model")

best_accuracy = 0

early_stopping_counter = 0

# state_dict() returns references to the live parameters, so the tensors are
# cloned; otherwise the "best" snapshot would keep changing as training goes on.
best_model_dict = {
    name: tensor.detach().clone()
    for name, tensor in model.state_dict().items()
}


for epoch in range(EPOCHS):
    loss = train(train_data_loader, model , optimizer, device)
    scheduler.step()
    outputs, targets = evaluate(
      valid_data_loader, model , device
    )

    # Predicted class = index of the largest output score.
    outputs_target = np.argmax(np.array(outputs), axis=1)

    accuracy = metrics.accuracy_score(targets, outputs_target)


    print(
      f"Epoch : {epoch},loss = {loss}, Accuracy Score = {accuracy}"
    )

    if accuracy > best_accuracy :
        best_model_dict = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        best_accuracy = accuracy
    else:
        early_stopping_counter+=1

    # Stop after three epochs without improvement (the counter is cumulative,
    # it is not reset when the accuracy improves).
    if early_stopping_counter>2:
        break

torch.save(best_model_dict, "model.pt")


In [None]:
model.load_state_dict(best_model_dict)

In [None]:
def generate_pred(model , data_loader, device):
    """Return the predicted class index for every sample of `data_loader`.

    Batches are dicts with a "tweet" tensor of token ids. The model outputs
    are collected in loader order and the result is the argmax over axis 1.
    """
    model.eval()

    collected_scores = []
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch["tweet"].to(device, dtype=torch.long)
            batch_scores = model(token_ids)
            collected_scores += batch_scores.cpu().numpy().tolist()
    return np.argmax(collected_scores, 1)


In [None]:
# Encode the test tweets exactly like the training data (same tokenizer, same
# padded length) and predict them with the LSTM.
test_df = pd.read_csv("test_data.csv")

xtest = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_df.tweet.values),
    maxlen=MAX_LEN,
)

# kind="test": items carry token ids only, no target.
test_dataset = GBVDataset(tweets=xtest, kind="test")
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=1,
)

outputs = generate_pred(model, test_data_loader, device)

In [None]:
# Write the LSTM predictions in the sample-submission layout, ids -> names.
sample_submission = pd.read_csv("SampleSubmission.csv")
sample_submission["type"] = [idtoclass[int(class_id)] for class_id in outputs]
sample_submission.to_csv('submission.csv', index=False)

## Transformers

In [None]:
import transformers

In [None]:
class config :
    """Settings shared by the BERT dataset, training and inference cells."""

    # Files
    TRAINING_FILE = "Train_data.csv"
    MODEL_PATH = "model.bin"

    # Tokenisation
    MAX_LEN = 64
    TOKENIZER = transformers.BertTokenizer.from_pretrained(
        "bert-base-uncased", do_lower_case=True
    )

    # Optimisation
    EPOCHS = 10
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 64


In [None]:
import torch

class BERTDataset:
  """Dataset that tokenizes tweets for BERT on access.

  Args:
    tweet: indexable collection of raw tweets.
    target: optional indexable collection of class ids; when it is None the
      returned items carry no "targets" entry.
  """

  def __init__(self, tweet, target=None):
    self.tweet = tweet
    self.target = target
    self.tokenizer = config.TOKENIZER
    self.max_len = config.MAX_LEN

  def __len__(self):
    return len(self.tweet)

  def __getitem__(self, item):
    # Collapse any run of whitespace into a single space.
    tweet = str(self.tweet[item])
    tweet = " ".join(tweet.split())

    # Pad AND truncate to max_len so every item has the same length; without
    # truncation a long tweet would yield a longer sequence than the others
    # and batching would fail.
    inputs = self.tokenizer.encode_plus(
      tweet,
      None,
      add_special_tokens=True,
      max_length=self.max_len,
      padding="max_length",
      truncation=True,
    )

    sample = {
      "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
      "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
      "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
    }

    # Labels are only attached when the dataset was built with targets.
    if self.target is not None:
      sample["targets"] = torch.tensor(self.target[item], dtype=torch.float)

    return sample
        

In [None]:
import torch.nn as nn

class BERTBaseUncased(nn.Module):
  """bert-base-uncased encoder with dropout and a linear 5-class head."""

  def __init__(self):
    super(BERTBaseUncased, self).__init__()
    self.bert = transformers.BertModel.from_pretrained("bert-base-uncased")
    self.bert_drop = nn.Dropout(0.3)

    self.out = nn.Linear(768, 5)

  def forward(self, ids, mask , token_type_ids):
    """Return the 5 class logits for a batch of encoded tweets."""
    # With return_dict=False the encoder returns a tuple; only its second
    # element is passed on to the classification head.
    _first_output, second_output = self.bert(
        ids,
        attention_mask=mask,
        token_type_ids=token_type_ids,
        return_dict=False,
    )
    return self.out(self.bert_drop(second_output))

In [None]:
def loss_fn(outputs, targets):
    """Cross-entropy between logits `outputs` and class ids `targets`.

    `targets` may arrive as floats; they are cast to integer class ids first.
    """
    class_ids = targets.long()
    return torch.nn.functional.cross_entropy(outputs, class_ids)

In [None]:
from tqdm import tqdm_notebook, tqdm
def train_fn(data_loader, model , optimizer, device, scheduler):
    """Train `model` for one pass over `data_loader`.

    Each batch is a dict with "ids", "mask", "token_type_ids" and "targets".
    The scheduler is stepped after every batch, not once per epoch.
    """
    model.train()
    for batch in tqdm(data_loader):
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()

        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()

        optimizer.step()
        scheduler.step()

def eval_fn(data_loader, model , device):
    """Score `data_loader` without gradients.

    Returns `(outputs, targets)` as Python lists in loader order. The outputs
    are the model logits passed through an element-wise sigmoid.
    """
    model.eval()
    collected_outputs, collected_targets = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

            collected_targets += targets.cpu().detach().numpy().tolist()
            # Sigmoid is monotonic, so an argmax taken over these values picks
            # the same class as an argmax over the raw logits.
            collected_outputs += torch.sigmoid(logits).cpu().detach().numpy().tolist()
    return collected_outputs, collected_targets

In [None]:
# transformers' own AdamW is deprecated and no longer importable in recent
# releases; fall back to the PyTorch implementation when it is missing.
# NOTE(review): the two implementations use different default eps values.
try:
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW
from sklearn import model_selection , metrics
from transformers import get_linear_schedule_with_warmup


# ---- Data ------------------------------------------------------------------
dfx  = pd.read_csv(config.TRAINING_FILE)


# 90/10 stratified train/validation split.
df_train , df_valid = model_selection.train_test_split(
  dfx,
  test_size = 0.1,
  random_state =42,
  stratify = dfx.type.values
)

df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)


train_dataset = BERTDataset(
  tweet = df_train.tweet.values,
  target = df_train.type.values
)

# Reshuffle the training samples every epoch.
train_data_loader = torch.utils.data.DataLoader(
  train_dataset,
  batch_size = config.TRAIN_BATCH_SIZE,
  shuffle = True,
  num_workers = 4
)

valid_dataset = BERTDataset(
  tweet = df_valid.tweet.values,
  target = df_valid.type.values
)

valid_data_loader = torch.utils.data.DataLoader(
  valid_dataset,
  batch_size = config.VALID_BATCH_SIZE,
  num_workers = 4
)

# ---- Model and optimizer ---------------------------------------------------
# Use the GPU when there is one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BERTBaseUncased()
model.to(device)

param_optimizer = list(model.named_parameters())

# Biases and LayerNorm parameters are excluded from weight decay.
no_decay = ["bias","LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
 {
     "params":[
               p for n,p in param_optimizer if
               not any(nd in n for nd in no_decay)
     ],
    "weight_decay":0.001,
 } ,
 {
     "params":[
               p for n,p in param_optimizer if
               any(nd in n for nd in no_decay)
     ],
    "weight_decay":0.0,
 }

]

# The scheduler is stepped once per batch, so its horizon is the exact number
# of batches over all epochs (this counts the last, partial batch too).
num_train_steps = len(train_data_loader) * config.EPOCHS

optimizer = AdamW(optimizer_parameters, lr=3e-5)

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps = 0,
  num_training_steps = num_train_steps
)

model = nn.DataParallel(model)

# ---- Training loop ---------------------------------------------------------
best_accuracy = 0

for epoch in range(config.EPOCHS):
    train_fn(
        train_data_loader, model , optimizer, device, scheduler
    )

    outputs , targets = eval_fn(
        valid_data_loader, model , device
    )

    # Predicted class = index of the largest output score.
    outputs_targets = np.argmax(np.array(outputs), axis=1)

    accuracy = metrics.accuracy_score(targets, outputs_targets)

    print(f"Accuracy Score = {accuracy}")

    # Keep only the checkpoint with the best validation accuracy so far.
    # The state dict saved here is that of the DataParallel wrapper.
    if accuracy>best_accuracy:
        torch.save(model.state_dict(), config.MODEL_PATH)
        best_accuracy = accuracy



In [None]:
def generate_pred(model , data_loader, device):
    """Return the predicted class index for every sample of `data_loader`.

    Batches are dicts with "ids", "mask" and "token_type_ids". This definition
    replaces the `generate_pred` defined in the LSTM section of the notebook,
    which reads a "tweet" key instead.
    """
    model.eval()

    collected_scores = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)

            batch_scores = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

            collected_scores += batch_scores.cpu().detach().numpy().tolist()
    return np.argmax(collected_scores, 1)

In [None]:
import os

test_df = pd.read_csv("test_data.csv")

# No targets: items carry only ids, mask and token_type_ids.
test_dataset = BERTDataset(
      tweet = test_df.tweet.values,
    )

test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size = 64,
    num_workers = 1
)

# After training, `model` holds the weights of the LAST epoch. Restore the
# checkpoint with the best validation accuracy before predicting.
if os.path.exists(config.MODEL_PATH):
    model.load_state_dict(torch.load(config.MODEL_PATH, map_location=device))

outputs = generate_pred(
    model,test_data_loader, device
)

In [None]:
# Write the BERT predictions in the sample-submission layout, ids -> names.
sample_submission = pd.read_csv("SampleSubmission.csv")
predicted_names = [idtoclass[int(class_id)] for class_id in outputs]
sample_submission["type"] = predicted_names
sample_submission.to_csv('submission.csv', index=False)