In [1]:
# The install is unpinned; the recorded run below resolved to transformers 4.25.1,
# so use transformers==4.25.1 if the results need to be reproduced exactly.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 60.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 61.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
# Fetch the repository that ships the MSR paraphrase corpus. The data-loading cell
# reads it from /content/paraphrase_identification/..., so run this with /content
# as the working directory.
!git clone https://github.com/SpyzzVVarun/paraphrase_identification.git

Cloning into 'paraphrase_identification'...
remote: Enumerating objects: 68, done.[K
remote: Total 68 (delta 0), reused 0 (delta 0), pack-reused 68[K
Unpacking objects: 100% (68/68), done.
Checking out files: 100% (45/45), done.


Preparing the data

In [3]:
import pandas as pd
# utf-8-sig drops the byte-order mark at the start of the corpus file; without it
# the BOM ends up glued to the first column name.
with open('/content/paraphrase_identification/dataset/msr-paraphrase-corpus/msr_paraphrase_test.txt',
          encoding='utf-8-sig') as f:
    # Strip only the line terminator (a blind [:-1] eats a real character when the
    # last line has no trailing newline) and skip blank lines.
    data_lines = [line.rstrip('\n').split('\t') for line in f if line.strip()]

# The first row of the file is the header; every following row is one sentence pair.
raw = pd.DataFrame(data_lines)
df = raw.iloc[1:].reset_index(drop=True)
df.columns = raw.iloc[0]
df = df.rename(columns = {'#1 String':'Sen1', '#2 String':'Sen2'})

In [4]:
# Preview the first rows to check the header row and the Sen1/Sen2 renames.
df.head()

Unnamed: 0,﻿Quality,#1 ID,#2 ID,Sen1,Sen2
0,1,1089874,1089925,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...
1,1,3019446,3019327,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...
2,1,1945605,1945824,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...
3,0,1430402,1430329,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...
4,0,3354381,3354396,The company didn't detail the costs of the rep...,But company officials expect the costs of the ...


In [5]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, ignoring masked-out positions.

    Args:
        model_output: Indexable whose element 0 holds the token embeddings.
        attention_mask: Tensor that, after gaining a trailing dimension, is
            expanded to the size of the token embeddings; zeros mark positions
            to leave out of the average.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total, with
        the divisor clamped to at least 1e-9 so an all-zero mask yields zeros
        rather than a division by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_total

# Tokenizer and Model: both are loaded from the same pretrained checkpoint,
# so the identifier is spelled out once.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def predict(sentences, model, tokenizer):
  """Return the cosine similarity between the first two sentences.

  Args:
    sentences: Sequence of strings handed to ``tokenizer`` as one batch; only
      the embeddings at positions 0 and 1 are compared.
    model: Callable invoked as ``model(**encoded_input)``; its output is passed
      to ``mean_pooling``.
    tokenizer: Callable returning a mapping of tensors that includes
      ``'attention_mask'``.

  Returns:
    A Python float: the dot product of the two L2-normalised sentence
    embeddings.
  """
  # Tokenize the sentences as a single padded batch
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
  # Compute token embeddings without tracking gradients (inference only)
  with torch.no_grad():
    model_output = model(**encoded_input)
  # Mask-aware mean pooling, then L2-normalise each sentence embedding
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
  # Stay in torch for the dot product instead of relying on NumPy's implicit
  # tensor-to-array conversion, and hand back a plain float
  return torch.dot(sentence_embeddings[0], sentence_embeddings[1]).item()

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [12]:
# One similarity score per row of df, in row order.
y_predict = [
  predict([first, second], model, tokenizer)
  for first, second in zip(df['Sen1'], df['Sen2'])
]

In [33]:
# Gold labels: the first column of df (the file's quality flag), converted to floats.
y_real = df.iloc[:, 0].astype(float).tolist()

In [40]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
THRESHOLD = [0.5,0.55, 0.6, 0.65, 0.7, 0.75, 0.8]

# Column name -> scorer, listed in the column order of the summary table.
metric_fns = {
  'precision': precision_score,
  'recall': recall_score,
  'f1': f1_score,
  'acc': accuracy_score,
}

# Collect one record per cutoff and build the table in a single step.
score_rows = []
for cutoff in THRESHOLD:
  # A similarity at or above the cutoff counts as a paraphrase (label 1.0).
  hard_labels = [1.0 if proba >= cutoff else 0.0 for proba in y_predict]
  row = {'threshold': cutoff}
  for metric_name, metric_fn in metric_fns.items():
    row[metric_name] = metric_fn(y_real, hard_labels)
  score_rows.append(row)

scores = pd.DataFrame(score_rows)

In [41]:
# Show precision, recall, F1 and accuracy for each similarity threshold.
scores

Unnamed: 0,threshold,precision,recall,f1,acc
0,0.5,0.6875,0.987794,0.810733,0.693333
1,0.55,0.697311,0.972101,0.81209,0.70087
2,0.6,0.710475,0.952049,0.813711,0.710145
3,0.65,0.730929,0.918919,0.814214,0.721159
4,0.7,0.751343,0.853531,0.799184,0.714783
5,0.75,0.779631,0.774194,0.776903,0.704348
6,0.8,0.803456,0.648649,0.7178,0.66087


Code for the Streamlit Web App

In [None]:
# Dependencies for the web app written below. Neither install is pinned, so the
# versions you get depend on when this cell runs.
# NOTE(review): the protobuf upgrade is presumably needed for Streamlit to import — confirm.
!pip install streamlit
!pip install --upgrade protobuf 

In [None]:
%%writefile app.py
import streamlit as st
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Mask-aware mean of the token embeddings along dimension 1.

    Args:
        model_output: Indexable whose element 0 holds the token embeddings.
        attention_mask: Tensor that, after gaining a trailing dimension, is
            expanded to the size of the token embeddings; zeros mark positions
            to leave out of the average.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total; the
        divisor is clamped to at least 1e-9 so an all-zero mask gives zeros.
    """
    embeddings = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = (embeddings * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / weight_total

# st.cache is deprecated in newer Streamlit releases in favour of st.cache_resource
# for objects such as models; keep st.cache as the fallback for older installs.
if hasattr(st, 'cache_resource'):
  _cache_model = st.cache_resource
else:
  _cache_model = st.cache(allow_output_mutation=True)

@_cache_model
def load_tokenizer_and_model():
  """Load the tokenizer and model for the sentence-embedding checkpoint.

  The result is cached by Streamlit, so the call is not repeated on every
  script rerun.

  Returns:
    A ``(tokenizer, model)`` tuple, both loaded from
    'sentence-transformers/all-MiniLM-L6-v2'.
  """
  tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
  model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
  return tokenizer, model

# Obtain the tokenizer and model, showing a spinner while the call runs.
with st.spinner('Tokenizer and Model are being loaded..'):
  tokenizer, model = load_tokenizer_and_model()

# Page heading; the string is a Markdown '#' heading line.
st.write("""
         # Paraphrase Detection
         """
          )

def predict(sentences, model, tokenizer):
  """Return the cosine similarity between the first two sentences.

  Args:
    sentences: Sequence of strings handed to ``tokenizer`` as one batch; only
      the embeddings at positions 0 and 1 are compared.
    model: Callable invoked as ``model(**encoded_input)``; its output is passed
      to ``mean_pooling``.
    tokenizer: Callable returning a mapping of tensors that includes
      ``'attention_mask'``.

  Returns:
    A Python float: the dot product of the two L2-normalised sentence
    embeddings.
  """
  # Tokenize the sentences as a single padded batch
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

  # Compute token embeddings without tracking gradients (inference only)
  with torch.no_grad():
    model_output = model(**encoded_input)

  # Mask-aware mean pooling over the token embeddings
  sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

  # L2-normalise so the dot product below is a cosine similarity
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

  # Stay in torch for the dot product instead of relying on NumPy's implicit
  # tensor-to-array conversion, and hand back a plain float
  return torch.dot(sentence_embeddings[0], sentence_embeddings[1]).item()

# Similarity at or above this value is reported as a paraphrase.
PARAPHRASE_THRESHOLD = 0.6

src = st.text_input('Enter Source sentence')
tgt = st.text_input('Enter Target sentence')
bt = st.button("Do paraphrase identification")

sentences = [src, tgt]
if bt:
  if not src.strip() or not tgt.strip():
    # Scoring an empty sentence would produce a meaningless verdict
    st.warning("Please enter both a source and a target sentence.")
  elif predict(sentences, model, tokenizer) >= PARAPHRASE_THRESHOLD:
    st.success("Paraphrasing")
  else:
    st.success("Not Paraphrasing")

In [None]:
# NOTE(review): this runs Streamlit in the foreground, so the cell does not return
# until the server is stopped; a later cell starts the same app in the background.
!streamlit run app.py

In [None]:
# pyngrok provides the `ngrok` Python wrapper used further down to open a tunnel.
!pip install pyngrok

In [None]:
# Replace the `#authtoken` placeholder with your own ngrok token before running:
# as written, the shell treats `#authtoken` as a comment and no token is passed.
# Do not commit the real token with the notebook.
!ngrok authtoken #authtoken

In [None]:
# Start the app in the background so the notebook stays usable; nohup sends the
# process output to nohup.out in the working directory.
!nohup streamlit run app.py &

In [None]:
from pyngrok import ngrok

# Port passed to the tunnel; 8501 is Streamlit's default port.
STREAMLIT_PORT = 8501

# Pass the port positionally: pyngrok 5 renamed connect()'s first parameter from
# `port` to `addr`, so a `port=` keyword is no longer the tunnel address there.
# The positional form targets the right port on both old and new releases.
url = ngrok.connect(STREAMLIT_PORT)
url

In [None]:
# Show the log of the background Streamlit process (assumes it was started with
# /content as the working directory, where nohup.out is written).
!cat /content/nohup.out