<a href="https://colab.research.google.com/github/Gaurav7004/PDFs/blob/main/AutoQualitativeResearchData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !python -m spacy download en_core_web_lg

In [None]:
!pip install -q transformers  rouge-score sentence-transformers

In [None]:
import torch
from google.colab import drive
from transformers import AutoModel, AutoTokenizer
drive.mount('/content/drive')

In [None]:
# get mean pooling for sentence bert models 
# ref https://www.sbert.net/examples/applications/computing-embeddings/README.html#sentence-embeddings-with-transformers
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: indexable model output whose first element holds the
            per-token embeddings (assumed (batch, seq_len, hidden) - TODO confirm).
        attention_mask: mask with one entry per token; positions with 0 do not
            contribute to the average.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * weights).sum(1)
    # Clamp so a fully masked sequence divides by a tiny number instead of zero.
    token_counts = weights.sum(1).clamp(min=1e-9)
    return masked_total / token_counts


# Custom model: a dense layer, dropout and a sigmoid-activated classifier on top of a sentence-transformers encoder (paraphrase-MiniLM by default) to produce the final score.
class SentenceBertClass(torch.nn.Module):
    """Scores a sentence against a document using a shared pretrained encoder.

    Both inputs go through the same encoder (`l1`) and are mean-pooled. The two
    embeddings and their elementwise product are concatenated and passed through
    a dense layer, ReLU, dropout, a single-unit linear layer and a sigmoid.
    """

    def __init__(self, model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"):
        """Load the pretrained encoder `model_name` and build the classification head."""
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name)
        # Input is three concatenated 384-wide vectors: sentence, document, product.
        self.pre_classifier = torch.nn.Linear(3 * 384, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)
        self.classifierSigmoid = torch.nn.Sigmoid()

    def forward(self, sent_ids, doc_ids, sent_mask, doc_mask):
        """Return one sigmoid score per (sentence, document) row.

        Args:
            sent_ids, sent_mask: token ids and attention mask of the sentences.
            doc_ids, doc_mask: token ids and attention mask of the documents;
                rows are paired with the sentence rows (concatenated along dim 1).
        """
        # Encode and pool both sides with the same encoder.
        sentence_embeddings = mean_pooling(
            self.l1(input_ids=sent_ids, attention_mask=sent_mask), sent_mask
        )
        doc_embeddings = mean_pooling(
            self.l1(input_ids=doc_ids, attention_mask=doc_mask), doc_mask
        )

        # Sentence, document, and their elementwise interaction, side by side.
        features = torch.cat(
            (sentence_embeddings, doc_embeddings, sentence_embeddings * doc_embeddings),
            dim=1,
        )

        hidden = torch.relu(self.pre_classifier(features))
        hidden = self.dropout(hidden)
        return self.classifierSigmoid(self.classifier(hidden))

In [None]:
# Path to the trained extractive-model checkpoint (.pth); change it to wherever the file
# actually lives (for example somewhere under the Drive mounted at /content/drive).
model_path = "/content/minilm_bal_exsum.pth"

In [None]:
pwd

'/content'

In [None]:
from torch import cuda
# Use the GPU when CUDA reports one, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# Build the network with its default encoder, then restore the trained weights from
# model_path, remapping the stored tensors onto `device`.
# NOTE(review): torch.load may unpickle arbitrary objects - only load checkpoints from a trusted source.
# NOTE(review): the model itself is not moved to `device` in this cell (there is no .to(device) call).
extractive_model = SentenceBertClass() 
extractive_model.load_state_dict(torch.load(model_path, map_location=torch.device(device) ))
# Switch to evaluation mode; the trailing semicolon suppresses the cell's repr output.
extractive_model.eval();

In [None]:
# Tokenizer for the same checkpoint SentenceBertClass loads by default.
# predict() and summarize() read this module-level name.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L3-v2')

# tokenize text as required by BERT based models
def get_tokens(text, tokenizer):
  """Encode a batch of strings and return their ids and attention masks.

  Args:
    text: batch of strings handed to ``tokenizer.batch_encode_plus``.
    tokenizer: object exposing ``batch_encode_plus``.

  Returns:
    Tuple ``(input_ids, attention_mask)`` taken from the encoding.
  """
  # Ask the tokenizer to add special tokens and to truncate/pad to max_length=512.
  encode_options = dict(
      add_special_tokens=True,
      max_length=512,
      padding="max_length",
      return_token_type_ids=True,
      truncation=True,
  )
  encoded = tokenizer.batch_encode_plus(text, **encode_options)
  return encoded['input_ids'], encoded['attention_mask']

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# get predictions for an array of sentences scored against the document they come from
def predict(model,sents, doc):
  """Score each sentence in `sents` against `doc` with `model`.

  Tokenization uses the module-level `tokenizer` through `get_tokens`.

  Args:
    model: callable invoked as ``model(sent_ids, doc_ids, sent_mask, doc_mask)``.
    sents: list of sentence strings (``len(sents)`` rows are scored).
    doc: the full document text every sentence is paired with.

  Returns:
    Whatever `model` returns for the batch. Inference runs with autograd
    disabled, so the result carries no gradient graph. An empty `sents`
    yields an empty ``(0, 1)`` tensor without calling the tokenizer or model
    (assumes one score column per sentence, as summarize() indexes it).
  """
  if len(sents) == 0:
    return torch.empty((0, 1))

  sent_id, sent_mask = get_tokens(sents,tokenizer)
  sent_id = torch.tensor(sent_id, dtype=torch.long)
  sent_mask = torch.tensor(sent_mask, dtype=torch.long)

  # Tokenize the document once, then repeat its single row so every sentence
  # row has a matching document row.
  doc_id, doc_mask = get_tokens([doc],tokenizer)
  doc_id = torch.tensor(doc_id, dtype=torch.long).repeat(len(sents), 1)
  doc_mask = torch.tensor(doc_mask, dtype=torch.long).repeat(len(sents), 1)

  # Pure inference: skip building the autograd graph to save memory.
  with torch.no_grad():
    preds = model(sent_id, doc_id, sent_mask, doc_mask)
  return preds

In [None]:
def summarize(doc, model, min_sentence_length=14, top_k=3, batch_size=3):
  """Build an extractive summary of `doc` from its highest-scoring sentences.

  Relies on the module-level names `nlp`, `tqdm` and `predict`.
  NOTE(review): `nlp` and `tqdm` are not defined in the visible cells; `nlp` is
  presumably a spaCy pipeline, in which case ``len(sent)`` counts tokens - confirm.

  Args:
    doc: document text. Newline characters are removed before sentence splitting.
    model: scoring model forwarded to `predict`.
    min_sentence_length: only sentences with ``len(sent)`` strictly greater
      than this value are kept as candidates.
    top_k: maximum number of sentences in the summary.
    batch_size: number of sentences scored per `predict` call (must be >= 1).

  Returns:
    Tuple ``(summary, scores, doc_sentences)``: the selected sentences joined by
    spaces in document order, the per-sentence scores (one ``[score]`` list per
    candidate), and the candidate sentences.

  Raises:
    ValueError: if `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  # NOTE(review): newlines are dropped rather than replaced by a space, so words
  # separated only by a line break are joined - confirm this is intended.
  doc = doc.replace("\n","")
  doc_sentences = [str(sent) for sent in nlp(doc).sents if len(sent) > min_sentence_length]

  scores = []
  # Walk the candidates in batch_size steps; the range is bounded by the number
  # of sentences (not the character length of the document), so every sentence
  # is scored exactly once and no empty trailing batch is produced.
  for batch_start in tqdm(range(0, len(doc_sentences), batch_size)):
    batch = doc_sentences[batch_start:batch_start + batch_size]
    scores.extend(predict(model, batch, doc).tolist())

  # Rank by score (stable for ties), keep the best top_k, then restore document order.
  ranked = sorted(range(len(doc_sentences)), key=lambda i: scores[i][0], reverse=True)
  selected = sorted(ranked[:top_k])
  summary = " ".join(doc_sentences[i] for i in selected)

  return summary, scores, doc_sentences

In [None]:
article1 = """ Some girls’ runaway for marriage, so this provides a wrong impression on the other girls. This leads to the parents not sending their girls outside. """
article2 = """ There are some families they do marriage of girls between the ages of 14 to 19. Many girls get married by the time they reach the age of 16 to 18. """
article3 = """ When girls go out of the house, they get to hear a lot from the people of the society, like see how these girls keep roaming. But girls do not need to pay attention to all these things. What about the people, they will say something or the other? They should do whatever they want to do, no matter what the world thinks of them.  """
article4 = """ This women group organization helps. Apart from this, women also get social after marriage in the village. When they become social, then there is a women group in the village in which 8-10 or may be more women are attached to it. When the girls tell them that you guys come because my in-laws are misbehaving with me in this way. Husband is also beating, in-laws are also ignoring me then this group and women organization supports. We have a women's organization here, apart from that these women who run the group, all these people go and support them.  """
article5 = """ There is also a risk that if she becomes a mother during this time, then her problems increase even more. It is very challenging for her, as she is not mature physically and mentally due to her young age. So this becomes a big problem for her. """
article6 = """ Apart from these things there is social pressure. In spite being correct society judges them wrongly, try to prove them wrong. People from society put unnecessary pressure on them even if they are not wrong at their place. These things increase girls’ problems as they will not be able to express them freely out of hesitation. They keep things within and don't prefer to share it with anyone.  """
article7 = """ Main problem is related to their age and then they are not able to go far places, they are not allowed to go alone. They feel this thing that why we are not allowed to go out in the same way as boys. Gradian keep pressurizing them that don't go alone at far places, don't go here, don't go there. """


In [None]:
# Summarize article3 with the extractive model: candidates are sentences whose len()
# exceeds 14, scored 4 at a time, and the 3 best are kept.
summary, scores, sentences = summarize(article3, extractive_model, min_sentence_length=14, top_k=3, batch_size=4)
# Keep the extractive summary under the name `article`; the Pegasus tokenization
# cell further down reads `article` as its input text.
article = summary
article

  0%|          | 0/1 [00:00<?, ?it/s]

' When girls go out of the house, they get to hear a lot from the people of the society, like see how these girls keep roaming. They should do whatever they want to do, no matter what the world thinks of them.  '

In [None]:
import gensim
# Load pre-trained word vectors stored in word2vec format.
# KeyedVectors.load_word2vec_format is the supported loader; the older
# Word2Vec.load_word2vec_format entry point is deprecated/removed in current gensim.
# NOTE: 'path-to-vectors.txt' is a placeholder - point it at a real vector file.
# NOTE(review): `model` is rebound later by the Pegasus model cell.
model = gensim.models.KeyedVectors.load_word2vec_format('path-to-vectors.txt', binary=False)
# if your vector file is in binary format, change to binary=True
sentence = ["London", "is", "the", "capital", "of", "Great", "Britain"]
# Look up one vector per word; a word missing from the vocabulary raises KeyError.
vectors = [model[w] for w in sentence]

In [None]:
# NOTE(review): transformers is already installed by the first install cell of this
# notebook; this cell repeats that installation.
!pip install transformers

In [None]:
# Importing dependencies from transformers
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [None]:
# Load the tokenizer of the google/pegasus-xsum checkpoint.
# NOTE(review): this rebinds the global `tokenizer` that predict() reads, so calling
# summarize()/predict() after this cell would tokenize with Pegasus instead of MiniLM.
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

In [None]:
# Load the google/pegasus-xsum conditional-generation model.
# NOTE(review): rebinds the global `model`, which the gensim cell above also assigns.
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

In [None]:
# Create tokens - the numeric representation of `article` (the extractive summary
# produced earlier), returned as PyTorch tensors with truncation enabled and
# padding to the longest sequence in the batch.
tokens = tokenizer(article, truncation=True, padding="longest", return_tensors="pt")

In [None]:
# Generate the summary token ids from the encoded article.
# NOTE(review): rebinds `summary`, which earlier held the extractive summary string.
summary = model.generate(**tokens)

In [None]:

# Decode the first generated sequence back to text, dropping special tokens
# (such as padding and end-of-sequence markers) so only the summary text is displayed.
tokenizer.decode(summary[0], skip_special_tokens=True)