# Install Libraries

In [None]:
# Mount Google Drive at /gdrive. The model files loaded later in this notebook
# are read from paths under this mount point.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Install the third-party packages imported below (fasttext, stanza, transformers).
# Only the transformers install is run quietly (-q); no versions are pinned.
!pip install fasttext
!pip install stanza
!pip install -q transformers

# Import Libraries

In [None]:
import json
import os
import re
import shutil
from collections import Counter

import fasttext
import matplotlib.pyplot as plt
import mplcursors
import nltk
import numpy as np
import pandas as pd
import stanza
from numpy.linalg import norm
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from transformers import pipeline

# Load Models from Previous Phase

In [None]:
# fastText model: supplies the per-word vectors and the embedding dimension used by
# get_fastText_embedding and get_pos_weighted_fastText_embedding.
ft_model = fasttext.load_model("/gdrive/MyDrive/Project/train_100_single_epoch50.bin")
# Keras model used by filter_reviews; a review is kept when the arg-max of its prediction is 1.
filter_model = load_model("/gdrive/MyDrive/Project/FastTextFilterModel.bin")
# Keras model used by classify_reviews / classify_requirements (class indices FR:0, PD:1, UE:2).
classification_model = load_model("/gdrive/MyDrive/Project/FastTextClassificationModel.bin")
# Hugging Face sentiment pipeline; its 'POS' / 'NEG' labels and scores are consumed by
# get_requirements_sentiments_score.
sentiment_model = pipeline(model="finiteautomata/bertweet-base-sentiment-analysis")

# Data Preprocessing

In [None]:
def tokenize_data(data):
  """Tokenize every document with stanza and re-join its tokens with single spaces.

  Args:
    data: indexable sequence of raw text documents.

  Returns:
    A list with one string per document, made of the token texts that the
    English 'tokenize' pipeline produced, separated by a single space.
  """
  tokenizer = stanza.Pipeline(lang='en', processors='tokenize')

  def _space_joined_tokens(text):
    parsed = tokenizer(text)
    return ' '.join(
        str(tok.text) for sentence in parsed.sentences for tok in sentence.tokens
    )

  return [_space_joined_tokens(data[idx]) for idx in range(len(data))]

In [None]:
def whitespace_tokenizer(sent):
  """Return the pieces of ``sent`` obtained by splitting on runs of whitespace."""
  pieces = sent.split()
  return pieces

In [None]:
def get_pos_tags(data):
  """Collect the part-of-speech tag of every word in every document.

  Args:
    data: indexable sequence of text documents.

  Returns:
    A list with one entry per document; each entry is the list of ``word.pos``
    values (as strings) for all words of all sentences in that document.
  """
  tagger = stanza.Pipeline(lang='en', processors='pos, tokenize')

  def _tags_of(text):
    parsed = tagger(text)
    return [str(word.pos) for sentence in parsed.sentences for word in sentence.words]

  return [_tags_of(data[idx]) for idx in range(len(data))]

In [None]:
def filter_reviews(reviews):
  """Keep only the reviews that the filter model assigns to class 1.

  Args:
    reviews: sequence of review strings.

  Returns:
    The reviews, in their original order, whose prediction row from
    ``filter_model`` has its maximum at index 1.
  """
  scores = filter_model.predict(get_fastText_embedding(reviews))
  return [text for row, text in zip(scores, reviews) if np.argmax(row) == 1]

# Helper Functions

In [None]:
def get_pos_weights(POS_tags, weights = (1, 1, 0)):
  """Map every POS tag to a numeric weight.

  Args:
    POS_tags: list of tag lists, one inner list per sentence.
    weights: three values ``(verb_weight, noun_weight, other_weight)``. Any
      indexable of length >= 3 is accepted (a list still works); the default
      is a tuple so it cannot be mutated between calls.

  Returns:
    A list shaped like ``POS_tags`` in which "VERB" is replaced by
    ``weights[0]``, "NOUN" by ``weights[1]`` and every other tag by
    ``weights[2]``.
  """
  # The result is built under its own name: the original re-bound `weights`
  # to an empty list, which made every `weights[k]` lookup raise IndexError.
  tag_to_weight = {"VERB": weights[0], "NOUN": weights[1]}
  other_weight = weights[2]
  return [
      [tag_to_weight.get(tag, other_weight) for tag in sentence_tags]
      for sentence_tags in POS_tags
  ]

In [None]:
def cosine_similarity(A,B):
  """Cosine of the angle between vectors ``A`` and ``B``.

  Returns 0.0 when either vector has zero norm instead of dividing by zero
  (which would produce NaN and a NumPy RuntimeWarning). All-zero embeddings do
  occur here: a POS-weighted sentence embedding stays zero when every token
  weight is 0.
  """
  denominator = norm(A)*norm(B)
  if denominator == 0:
    return 0.0
  return np.dot(A,B)/denominator

In [None]:
def match_requirement_reviews(reviews, requirements, threshold = 0.75):
  """Attach to each requirement the reviews whose embedding is similar to it.

  Both inputs are tokenized, POS-tagged and turned into POS-weighted fastText
  embeddings; every (review, requirement) pair is then compared with
  ``cosine_similarity``.

  Args:
    reviews: sequence of review strings.
    requirements: sequence of requirement strings.
    threshold: a pair matches only when its similarity is strictly greater
      than this value.

  Returns:
    Dict mapping a requirement string to the list of matching review strings.
    Requirements without any matching review are absent from the dict.
  """
  tokenized_reviews = tokenize_data(reviews)
  tokenized_requirements = tokenize_data(requirements)
  reviews_weights = get_pos_weights(get_pos_tags(tokenized_reviews))
  req_weights = get_pos_weights(get_pos_tags(tokenized_requirements))
  app_reviews_embeddings = get_pos_weighted_fastText_embedding(tokenized_reviews, reviews_weights)
  app_requirements_embeddings = get_pos_weighted_fastText_embedding(tokenized_requirements, req_weights)

  req_review_dict = {}
  # Reviews and requirements are two different collections, so every pair has
  # to be compared. The former `i >= j` skip only makes sense when comparing a
  # list with itself; here it silently dropped most pairs.
  for i, review_embedding in enumerate(app_reviews_embeddings):
    for j, req_embedding in enumerate(app_requirements_embeddings):
      if cosine_similarity(review_embedding, req_embedding) > threshold:
        req_review_dict.setdefault(requirements[j], []).append(reviews[i])
  return req_review_dict

In [None]:
def classify_requirements(req_review_dict):
  """Assign a type to each requirement from the classes of its matched reviews.

  Args:
    req_review_dict: dict mapping a requirement to its list of matched reviews.

  Returns:
    Dict mapping each requirement to "PD" if any matched review is predicted
    as class 1, otherwise "FR" if any is predicted as class 0, otherwise
    "Other".
  """
  req_types_dict = {}
  for requirement, matched_reviews in req_review_dict.items():
    scores = classification_model.predict(get_fastText_embedding(matched_reviews))
    labels = [np.argmax(row) for row in scores]
    # FR:0, PD:1, UE:2 -- PD takes precedence over FR.
    if 1 in labels:
      requirement_type = "PD"
    elif 0 in labels:
      requirement_type = "FR"
    else:
      requirement_type = "Other"
    req_types_dict[requirement] = requirement_type
  return req_types_dict

In [None]:
def classify_reviews(reviews):
    """Label every review with a human-readable category.

    Args:
        reviews: sequence of review strings.

    Returns:
        Dict mapping each review to "Feature Request" (predicted class 0),
        "Bug Report" (class 1) or "Information Seeking or Giving" (any other
        class). Identical review strings share one entry.
    """
    # FR:0, PD:1, UE:2
    names_by_class = {0: "Feature Request", 1: "Bug Report"}
    scores = classification_model.predict(get_fastText_embedding(reviews))
    rev_types_dict = {}
    for review, row in zip(reviews, scores):
        predicted_class = int(np.argmax(row))
        rev_types_dict[review] = names_by_class.get(
            predicted_class, "Information Seeking or Giving"
        )
    return rev_types_dict

In [None]:
def get_requirements_sentiments_score(req_review_dict):
  """Compute a sentiment score per requirement from its matched reviews.

  Each review is run through ``sentiment_model``. A 'NEG' result contributes
  its full score, a 'POS' result half of its score, and any other label
  nothing. The contributions are averaged over the number of matched reviews.

  Args:
    req_review_dict: dict mapping a requirement to its list of matched reviews.

  Returns:
    Dict mapping each requirement to that average.
  """
  req_sent_dict = {}
  for requirement, matched_reviews in req_review_dict.items():
    total = 0
    for outcome in sentiment_model(matched_reviews):
      label = outcome['label']
      if label == 'POS':
        total += outcome['score'] * 0.5
      elif label == 'NEG':
        total += outcome['score']
    req_sent_dict[requirement] = total/len(matched_reviews)
  return req_sent_dict

In [None]:
def estimate_importance(req_review_dict, req_types_dict, req_sent_dict):
  """Combine mention count, sentiment and type into one importance score.

  For every requirement the score is the sum of:
    * its number of matched reviews divided by the largest such number,
    * its entry in ``req_sent_dict``,
    * a type bonus: 0.5 for "FR", 1 for "PD", 0.25 for anything else.

  Args:
    req_review_dict: dict mapping a requirement to its list of matched reviews.
    req_types_dict: dict mapping a requirement to its type label.
    req_sent_dict: dict mapping a requirement to its sentiment score.

  Returns:
    Dict mapping each requirement in ``req_review_dict`` to its score.
  """
  type_bonus = {"FR": 0.5, "PD": 1}
  max_mentions = max((len(matched) for matched in req_review_dict.values()), default=0)

  req_importance_dict = {}
  for requirement, matched in req_review_dict.items():
    mention_share = len(matched)/max_mentions
    sentiment = req_sent_dict[requirement]
    bonus = type_bonus.get(req_types_dict[requirement], 0.25)
    req_importance_dict[requirement] = mention_share + sentiment + bonus
  return req_importance_dict

In [None]:
def print_requirements_sorted(req_importance_dict):
  """Print requirements from highest to lowest importance.

  Each requirement is printed together with its score on one line, followed
  by a separator line of dashes.
  """
  ranked = list(req_importance_dict.items())
  ranked.sort(key=lambda pair: pair[1], reverse=True)
  for requirement, importance in ranked:
    print(requirement, importance)
    print("----------------------------")

# Get Representation

In [None]:
def get_fastText_embedding(data):
  """Embed each text as the mean of the fastText vectors of its words.

  Every text is lower-cased and split on whitespace; the word vectors from
  ``ft_model`` are averaged.

  Args:
    data: sequence of strings.

  Returns:
    float32 array of shape ``(len(data), ft_model.get_dimension())``. A text
    with no words (empty or whitespace-only) gets an all-zero row rather than
    the NaN row that dividing by a zero word count used to produce.
  """
  dimension = ft_model.get_dimension()
  embeddings = np.zeros(shape=(len(data), dimension), dtype = 'float32')
  for i, review in enumerate(data):
    words = review.lower().split()
    if not words:
      # Nothing to average; keep the zero row already in `embeddings`.
      continue
    review_embedding = np.zeros(shape=(dimension,), dtype = 'float32')
    for word in words:
      review_embedding = review_embedding + ft_model.get_word_vector(word).astype('float32')
    embeddings[i] = review_embedding/len(words)
  return embeddings

In [None]:
def get_pos_weighted_fastText_embedding(data, weights):
  """Embed each text as a weighted mean of the fastText vectors of its tokens.

  Args:
    data: sequence of strings; each is split on whitespace (no lower-casing).
    weights: ``weights[i][j]`` is the weight of the j-th token of ``data[i]``.

  Returns:
    float32 array of shape ``(len(data), ft_model.get_dimension())``. Each row
    is the weighted sum of the token vectors divided by the sum of the
    weights; when that sum is 0 the undivided sum is stored instead.
  """
  dimension = ft_model.get_dimension()
  embeddings = np.zeros(shape=(len(data), dimension), dtype = 'float32')
  for row, sentence in enumerate(data):
    accumulated = np.zeros(shape=(dimension,), dtype = 'float32')
    total_weight = 0
    for col, token in enumerate(sentence.split()):
      token_weight = weights[row][col]
      total_weight = total_weight + token_weight
      accumulated = accumulated + ft_model.get_word_vector(token).astype('float32') * token_weight
    # Skip the normalisation when there is nothing to normalise by.
    embeddings[row] = accumulated if total_weight == 0 else accumulated/total_weight
  return embeddings

# Implement The Phase

## Setup

In [None]:
# Import Reviews, Requirements files
app_name = "Messenger"

# Read each file inside a `with` block so the handle is closed, and drop blank
# lines: splitting on "\n" leaves an empty entry after a trailing newline, and
# an empty string is neither a review nor a requirement.
with open(f'{app_name}_reviews.txt', "r") as file:
  data = file.read()
reviews = [line for line in data.split("\n") if line.strip()]

with open(f'{app_name}_requirements.txt', "r") as file:
  data = file.read()
requirements = [line for line in data.split("\n") if line.strip()]

# Keep only the reviews accepted by the filter model, then label each of them.
filtered_reviews = filter_reviews(reviews)
review_types_dict = classify_reviews(filtered_reviews)

# Find Feature Request reviews that were not matched to any requirement.
# The requirement/review matching has to exist before this point. The
# "Estimate Importance" cell below makes the same call; it is repeated here so
# that this cell does not depend on a later cell when the notebook is run
# top to bottom on a fresh kernel.
req_review_dict = match_requirement_reviews(filtered_reviews, requirements)

discussed_reviews = []
for item in req_review_dict.items():
  for review in item[1]:
    discussed_reviews.append(review)
# Set for O(1) membership tests (replaces the undefined `unique(...)` call).
discussed_lookup = set(discussed_reviews)

# Iterate over filtered_reviews: review_types_dict only has keys for reviews
# that passed the filter, so indexing it with an unfiltered review raises KeyError.
fr_reviews = [
    review for review in filtered_reviews
    if review not in discussed_lookup and review_types_dict[review] == "Feature Request"
]
fr_embeddings = get_fastText_embedding(fr_reviews)

## Estimate Importance

In [None]:
# Match every filtered review against the requirements (cosine similarity of
# POS-weighted fastText embeddings, default threshold 0.75).
req_review_dict = match_requirement_reviews(filtered_reviews, requirements)
# Type of each matched requirement: "PD", "FR" or "Other".
req_types_dict = classify_requirements(req_review_dict)
# Sentiment score of each matched requirement, averaged over its reviews.
req_sent_dict = get_requirements_sentiments_score(req_review_dict)
# Importance = relative mention count + sentiment score + type bonus.
req_importance_dict = estimate_importance(req_review_dict, req_types_dict, req_sent_dict)
# Print the requirements from most to least important.
print_requirements_sorted(req_importance_dict)

## Suggesting New Requirements

### Implement

In [None]:
# Cluster the unmatched feature-request reviews; each cluster is a candidate
# new requirement.
N_CLUSTERS = 5
kmeans = KMeans(n_clusters = N_CLUSTERS, random_state=0).fit(fr_embeddings)

# Bucket the reviews by the cluster label KMeans assigned to them.
new_groups = [[] for _ in range(N_CLUSTERS)]
for review, label in zip(fr_reviews, kmeans.labels_):
  new_groups[label].append(review)

# For every cluster keep its three most frequent tokens, ignoring English stop
# words (the stop-word test is case-insensitive; counting is on the raw token).
# Ties keep first-seen order.
groups_keywords = []
for group in new_groups:
  keyword_counts = Counter(
      word
      for item in group
      for word in item.split()
      if word.lower() not in ENGLISH_STOP_WORDS
  )
  groups_keywords.append([word for word, _ in keyword_counts.most_common(3)])

# The 2-D PCA projection of fr_embeddings is computed in the plotting cell,
# which is the only place that uses it.

### Print Suggested Requirements Keywords

In [None]:
# Show the keywords chosen for each cluster, each followed by a separator line.
separator = "--------------"
print("Suggested Requirements Keywords")
for group in groups_keywords:
  print(group, separator, sep="\n")

### Plot Suggested Requirements Clusters

In [None]:
# Project the feature-request embeddings onto their first two principal
# components and attach the KMeans label of every review.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(fr_embeddings)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, pd.DataFrame(kmeans.labels_, columns =["label"])], axis = 1)

fig, ax = plt.subplots(figsize = (8,8))
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)

# Take the cluster ids from the fitted labels instead of a fixed 0..7 list, so
# the legend contains exactly the clusters that exist.
targets = sorted(finalDf['label'].unique())
colors = ['r', 'g', 'b', 'c', 'm', 'y' ,'k', '#aaaaaa']
for position, target in enumerate(targets):
    indicesToKeep = finalDf['label'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = colors[position % len(colors)]  # reuse colours if there are more clusters than colours
               , label = str(target))
ax.legend()
ax.grid()