# Install Libraries

In [None]:
# Mount Google Drive at /gdrive (Colab only); the model files loaded later
# in this notebook are read from paths under /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
!pip install fasttext
!pip install stanza

# Import Libraries

In [None]:
import json
import os
import re
import shutil

import fasttext
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import stanza
from nltk.tokenize import word_tokenize
from numpy.linalg import norm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model

# Load Models from Previous Phase

In [None]:
# fastText model read from the mounted Drive folder; it supplies the word
# vectors and the embedding dimension used by the embedding helper below.
ft_model = fasttext.load_model("/gdrive/MyDrive/Project/train_100_single_epoch50.bin")
# Classifier loaded through Keras' load_model; filter_reviews() keeps the
# reviews whose highest-scoring output index is 1.
filter_model = load_model("/gdrive/MyDrive/Project/FilterModel.bin")



# Data Preprocessing

In [None]:
def tokenize_data(data):
  """Tokenize every document with Stanza and re-join its tokens with spaces.

  Args:
    data: iterable of raw text documents (strings).

  Returns:
    list[str]: one string per input document, in input order, made of the
    text of every Stanza token separated by a single space.
  """
  # The pipeline is built once per call and reused for every document.
  nlp = stanza.Pipeline(lang='en', processors='tokenize')
  tokenized_data = []
  # Iterate over the values themselves instead of data[i]: positional
  # indexing fails for generators and for containers whose keys are not
  # 0..n-1 (e.g. a filtered pandas Series).
  for raw_doc in data:
    parsed = nlp(raw_doc)
    tokens = [str(token.text) for sent in parsed.sentences for token in sent.tokens]
    tokenized_data.append(' '.join(tokens))
  return tokenized_data

In [None]:
def whitespace_tokenizer(sent):
  """Split `sent` on runs of whitespace and return the pieces as a list."""
  pieces = sent.split()
  return pieces

In [None]:
def get_pos_tags(data):
  """Return the POS tag sequence of every document.

  Args:
    data: iterable of text documents (strings).

  Returns:
    list[list[str]]: for each document, the `pos` attribute of every Stanza
    word, flattened across the document's sentences, in input order.
  """
  # NOTE(review): tags are read from `sent.words`, while tokenize_data() reads
  # `sent.tokens`; if Stanza yields a different number of words than
  # whitespace tokens, per-token weights will not line up — confirm.
  nlp = stanza.Pipeline(lang='en', processors='pos, tokenize')
  POS_tags = []
  # Iterate over the values themselves instead of data[i], so generators and
  # containers that are not indexed 0..n-1 are handled as well.
  for raw_doc in data:
    parsed = nlp(raw_doc)
    POS_tags.append([str(word.pos) for sent in parsed.sentences for word in sent.words])
  return POS_tags

In [None]:
def stanza_tokenizer(doc):
  """Tokenize one document with Stanza and return its token texts.

  Args:
    doc: raw text of a single document.

  Returns:
    list[str]: the text of every token, flattened across sentences.
  """
  # Building a Stanza pipeline is expensive, and a tokenizer callback is
  # typically invoked once per document. Create the pipeline on first use and
  # keep it on the function object so later calls reuse it.
  nlp = getattr(stanza_tokenizer, "_nlp", None)
  if nlp is None:
    nlp = stanza.Pipeline(lang='en', processors='tokenize')
    stanza_tokenizer._nlp = nlp
  parsed = nlp(doc)
  return [str(token.text) for sent in parsed.sentences for token in sent.tokens]

In [None]:
def filter_reviews(reviews):
  """Keep only the reviews that the filter model assigns to class index 1.

  Args:
    reviews: sequence of review texts.

  Returns:
    list: the reviews, in their original order, whose prediction row has its
    maximum at index 1.
  """
  # NOTE(review): get_fastText_embedding is not defined in the visible part of
  # this notebook — confirm it is available before this function is called.
  class_scores = filter_model.predict(get_fastText_embedding(reviews))
  return [text for scores, text in zip(class_scores, reviews) if np.argmax(scores) == 1]

# Helper Functions

In [None]:
def calculate_idf(processed_data):
    """Compute an unsmoothed IDF value for every vocabulary term.

    Args:
        processed_data: sequence of whitespace-tokenized documents (strings).

    Returns:
        tuple: (list of log(N / document_frequency) values, one per vocabulary
        column of the vectorizer, and the fitted CountVectorizer itself).
    """
    # Whitespace tokenization, case preserved; token_pattern is unused when a
    # custom tokenizer is supplied.
    count_vectorizer = CountVectorizer(tokenizer=whitespace_tokenizer, token_pattern = None, lowercase=False)
    # fit() returns the vectorizer, so fitting and transforming can be chained.
    term_counts = count_vectorizer.fit(processed_data).transform(processed_data)
    # A term's document frequency is the number of documents with a non-zero count.
    presence = term_counts.astype(bool)
    doc_freq = np.array(presence.sum(axis=0)).flatten()
    n_docs = len(processed_data)
    idf = np.log(n_docs / (doc_freq))
    return idf.tolist(), count_vectorizer

In [None]:
def get_pos_weights(POS_tags, weights = (1, 1, 0.5)):
  """Translate POS tags into numeric weights.

  Args:
    POS_tags: iterable of tag sequences, one sequence per document.
    weights: indexable of three numbers, in this order: weight for "VERB",
      weight for "NOUN", weight for every other tag.

  Returns:
    list[list]: for each document, the weight of each of its tags.
  """
  # The original body rebound `weights` to an empty result list, so the
  # lookups weights[0..2] raised IndexError. The result is now built
  # separately and the parameter is only read.
  verb_weight, noun_weight, other_weight = weights[0], weights[1], weights[2]
  tag_to_weight = {"VERB": verb_weight, "NOUN": noun_weight}
  return [
    [tag_to_weight.get(tag, other_weight) for tag in sentence_tags]
    for sentence_tags in POS_tags
  ]

In [None]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity(A,B):
  """Cosine of the angle between vectors A and B.

  Args:
    A, B: array-likes of equal length.

  Returns:
    dot(A, B) / (norm(A) * norm(B)), or 0.0 when either vector has zero norm.
  """
  denominator = norm(A)*norm(B)
  # An all-zero vector (e.g. the embedding of an empty review) would give
  # 0/0 = nan, which breaks sorting and threshold comparisons downstream.
  if denominator == 0:
    return 0.0
  return np.dot(A,B)/denominator

In [None]:
def Union(lst1, lst2):
  """Return a list of every distinct element found in `lst1` or `lst2`.

  The order of the returned list is whatever the underlying set yields.
  """
  merged = set(lst1).union(set(lst2))
  return list(merged)

In [None]:
def unique(list1):
  """Return the elements of `list1` without duplicates, in first-seen order.

  Membership is tested by equality against the items kept so far, so
  unhashable elements are accepted.
  """
  kept = []
  for candidate in list1:
    if candidate in kept:
      continue
    kept.append(candidate)
  return kept

# Get Representation

In [None]:
def get_pos_weighted_fastText_embedding(data, weights):
  """Embed each document as the weighted mean of its fastText word vectors.

  Args:
    data: sequence of whitespace-tokenized documents (strings).
    weights: weights[i][j] is the weight of the j-th whitespace token of
      document i.

  Returns:
    float32 array with one row per document and one column per fastText
    dimension. A document whose weights sum to 0 (including one with no
    tokens) keeps its undivided weighted sum.
  """
  dim = ft_model.get_dimension()
  embeddings = np.zeros(shape=(len(data), dim), dtype = 'float32')
  for row, review in enumerate(data):
    accumulated = np.zeros(shape=(dim,), dtype = 'float32')
    total_weight = 0
    for col, word in enumerate(review.split()):
      # The weight is looked up before the word vector, one token at a time.
      word_weight = weights[row][col]
      total_weight = total_weight + word_weight
      accumulated = accumulated + ft_model.get_word_vector(word).astype('float32') * word_weight
    # Normalize only when there is something to divide by.
    embeddings[row] = accumulated/total_weight if total_weight != 0 else accumulated
  return embeddings

# Implement The Phase

## Setup

In [None]:
# Import Reviews, Requirements and Annotation files
apps_names = {"Messenger"}

# Reviews: read one review per line, then keep only those the filter model accepts.
apps_reviews = {}
for app_name in apps_names:
  # `with` closes the file handle as soon as the text has been read.
  with open(f'{app_name}_reviews.txt', "r") as file:
    data = file.read()
  reviews = data.split("\n")
  reviews = filter_reviews(reviews)
  apps_reviews.update({app_name: reviews})

# Requirements: one requirement per line, kept unfiltered.
apps_requirements = {}
for app_name in apps_names:
  with open(f'{app_name}_requirements.txt', "r") as file:
    data = file.read()
  requirements = data.split("\n")
  apps_requirements.update({app_name: requirements})

with open('annotation.txt') as json_file:
    annot = json.load(json_file)

# Get Reviews and Requirements Representations
# The original cell read `tokenized_messenger_reviews` and
# `tokenized_messenger_requirements`, which no visible cell defines, so it
# failed on a fresh kernel. They are rebuilt here per app with tokenize_data().
# NOTE(review): assumes the missing variables were plain tokenize_data()
# output of the filtered reviews / raw requirements — confirm.
apps_reviews_embeddings = {}
apps_requirements_embeddings = {}
for app_name in apps_names:
  tokenized_reviews = tokenize_data(apps_reviews[app_name])
  tokenized_requirements = tokenize_data(apps_requirements[app_name])
  # POS tags and weights are derived inside the loop so each app's embeddings
  # are built from that app's own texts.
  reviews_pos_tags = get_pos_tags(tokenized_reviews)
  req_pos_tags = get_pos_tags(tokenized_requirements)
  reviews_weights = get_pos_weights(reviews_pos_tags)
  req_weights = get_pos_weights(req_pos_tags)
  app_reviews_embeddings = get_pos_weighted_fastText_embedding(tokenized_reviews, reviews_weights)
  app_requirements_embeddings = get_pos_weighted_fastText_embedding(tokenized_requirements, req_weights)
  apps_reviews_embeddings.update({app_name: app_reviews_embeddings})
  apps_requirements_embeddings.update({app_name: app_requirements_embeddings})

## Experiments

In [None]:
# Sweep the matching threshold and record precision, recall, F1 and F2 for each value.
# NOTE(review): `sorted_apps` is not defined in any visible cell. From its use
# below it looks like a mapping app name -> items of the form
# ((review, requirement), score) ordered by descending score, since the scan
# stops at the first score below the threshold — confirm.
thresholds = np.arange(0,1.05,0.05)
p_results, r_results, f1_results, f2_results = [], [], [], []
for th in thresholds:
  # Requirements matched to each review at this threshold.
  req_rev_matches = {}
  for app_name in apps_names:
    for item in sorted_apps[app_name]:
      if (item[1] < th):
        break
      req_rev_matches.setdefault(item[0][0], []).append(item[0][1])
  tp = tn = fp = fn = 0
  for review in apps_reviews["Messenger"]:
    predicted = req_rev_matches.get(review, [])
    # The two annotation lists of a review are pooled into one expected set.
    expected = annot[review][0] + annot[review][1]
    hits = sum(1 for elem in predicted if elem in expected)
    tp += hits
    fp += len(predicted) - hits
    fn += sum(1 for elem in expected if elem not in predicted)
    # NOTE(review): 70 is presumably the number of requirements — confirm.
    # tn is accumulated but not used by any metric below.
    tn += 70 - len(Union(expected, predicted))
  # float('inf') marks a metric whose denominator is zero.
  # NOTE(review): when precision is inf and recall is 0 the F formulas
  # evaluate inf*0 and yield nan rather than inf.
  precision = tp/(tp + fp) if (tp + fp) != 0 else float('inf')
  recall = tp/(tp + fn) if (tp + fn) else float('inf')
  f1 = 2*precision*recall/(precision+recall) if (precision+recall) else float('inf')
  f2 = 5*precision*recall/(4*precision + recall) if (4*precision + recall) else float('inf')
  p_results.append(precision)
  r_results.append(recall)
  f1_results.append(f1)
  f2_results.append(f2)

## Plot the Results

In [None]:
# Precision, recall and F2 as functions of the matching threshold.
plt.plot(thresholds, p_results)
plt.plot(thresholds, r_results)
plt.plot(thresholds, f2_results)
# Legend entries follow the order of the plot calls above.
plt.legend(["Precision","Recall","F2-measure"])
plt.xlabel("threshold")
# Label the y-axis too; the trailing semicolon hides the returned object's repr.
plt.ylabel("score");

In [None]:
# Precision-recall curve: one marker per threshold, recall on x, precision on y.
plt.plot(r_results,p_results,'-o')
plt.xlabel("Recall")
plt.ylabel("Precision")
# Ticks spaced 0.1 apart, starting at 0, on both axes.
plt.xticks(np.arange(0,1.1,0.1))
plt.yticks(np.arange(0,1.1,0.1))