### Imports

In [None]:
!python -m spacy download en_core_web_lg # NEED TO RESTART THE KERNAL AFTER DOWNLOADING

In [None]:
!pip install --upgrade scikit-learn

In [17]:
# You need a path to word2vec_model; e.g: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g
model_file_path = 'word2vec_model.bin'

USER_STORIES_NAMES = ['g13-planningpoker.txt', 'g12-camperplus.txt', 'g14-datahub.txt', 'g28-zooniverse.txt', 'g04-recycling.txt',
'g08-frictionless.txt', 'g24-unibath.txt','g02-federalspending.txt','g03-loudoun.txt']

OUTPUT_PATH = 'class_dataset.csv'
csv_file_path = 'gold-standard_classes.csv'  # Replace with the path to your CSV file that cotains one column named "x" and the values of this column are the classes from the gold standard
user_story_dataset = 'supermarket.txt' # user story dataset

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os
from scipy import stats as st
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy
from nltk.stem.porter import *
import gensim
from gensim import models
from gensim.models import Word2Vec, KeyedVectors
import pickle as pickle
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import editdistance
from tqdm import tqdm
import nltk

stemmer = PorterStemmer()

nltk.download('stopwords')
nltk.download('punkt')
nlp = spacy.load('en_core_web_lg')
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
!pip install plotly==4.14.3
import plotly.express as px
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('porter_test')

import csv
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords

# Download NLTK data (you can skip this if already downloaded)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from gensim.models import KeyedVectors
word2vec = KeyedVectors.load(model_file_path)

In [7]:
def extract_compound_nouns(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)

    compound_nouns = []
    current_compound = []

    for word, pos in tagged_tokens:
        if pos.startswith('NN') and word.lower() not in stopwords.words('english'):
            current_compound.append(word)
        else:
            if len(current_compound) > 1:
                compound_nouns.append(' '.join(current_compound))
            current_compound = []

    return compound_nouns

def extract_nouns(text):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)

    # Filter out stopwords and get only nouns
    nouns = [word for word, pos in tagged_tokens if pos.startswith('NN') and word.lower() not in stopwords.words('english')]

    return nouns

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def write_to_csv(file_path, headers, data):
    with open(file_path, 'w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(headers)
        csv_writer.writerows(data)


### Create the target (0/1) coulmn

In [None]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Load CSV into DataFrame

df = pd.read_csv(csv_file_path)

# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Choose the column to perform stemming on
column_name = 'x'  # Replace with the name of your column
selected_column = df[column_name]

# Create a Porter Stemmer instance
porter_stemmer = PorterStemmer()

# Define a function to perform stemming on a single text
def perform_stemming(text):
    words = word_tokenize(text)
    stemmed_words = [porter_stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

# Apply stemming to the selected column
df[column_name] = selected_column.apply(perform_stemming)
df_no_duplicates = df.drop_duplicates(subset=column_name)

# Display the DataFrame after stemming
print("\nDataFrame after stemming:")
print(df_no_duplicates)
df_no_duplicates.to_csv('new.csv')

In [None]:
entites_calculate = df_no_duplicates['x'].tolist()
sorted(entites_calculate)

### Stop words

In [12]:
stop_words = set(stopwords.words('english')) # getting the english stopwords set from nltk
stop_words.add(',')
stop_words.add('.')

### Add new entity

In [13]:
def add_entity(word, word_org, pos, dep, dict_entites):

  """
  Params:
  word (str): The word after stemming has been applied
  word_org (str): The word before stemming has been applied (the original)
  pos (str): Part-of-Speech ('POS') of the word
  dep (str): The dependency label of the word
  dict_entities (dict): the current dictionary of entites

  Returns:
  to_enter (dict): The dict that matches the current word after update
  """
  to_enter = {'original_word': word_org, 'count': 1, 'noun': 0, 'subject': 0, 'compound': 0, 'gerund': 0, 'target': 0}
  if word in dict_entites.keys():
    to_enter = dict_entites[word]
    to_enter['count'] += 1
  if word_org in entites_calculate:

    to_enter['target'] = 1


  for entity in entites_calculate:
      if entity == word_org or entity == word:
        to_enter['target'] = 1


  # Based on the Part of Speech and dependency of the word we increase the
  # relevant counter in "to_enter" by one.
  if 'NN' in pos:
    to_enter['noun'] += 1

  if 'VBG' in pos:
    to_enter['gerund'] += 1

  if 'nsubj' == dep:
    to_enter['subject'] += 1

  if 'compound' == dep:
    to_enter['compound'] += 1

  return to_enter



### Loop over the stories and create the entities

In [22]:
visited = set()
dict_entites = {}

with open(user_story_dataset,encoding='cp1252') as f:
  lines = f.readlines() # List of all User stories seperated by '\n'.

for line in lines:
  doc = nlp(line)
  tokenized_doc = []
  tokenized_doc = [token for token in doc if token.text.lower() not in stop_words]
  tokenized_doc = [token for token in tokenized_doc if len(token.text) > 1]


  tokens = []     # Tokens is a list which will hold every token in each doc.

  for token in tokenized_doc:
    tokens.append(token)
    word = stemmer.stem(token.text).lower()    # Stemmed version of the current word (token).
    word_org = token.text                      # The original.
    if word_org in ['estimator', 'estimators']:                # TODO: fix case where estimator's stemming doesnt work correctly
      word = 'estimator'

    pos = token.tag_
    if 'NN' in pos:          # If the word's PoS (part of speech) is a noun ('NN'),
      visited.add(word)       # add it to visited (set)

  for index in range(len(tokens) - 2):
    # Iterate through the tokens, 3 at a time, note that this way the last two
    # words gets over looked and should be treated after the loop ends.

    word_org_1 = tokens[index].text            # Three original words
    word_org_2 = tokens[index + 1].text
    word_org_3 = tokens[index + 2].text

    word_1 = stemmer.stem(word_org_1).lower()  # Three matching stemmed words
    word_2 = stemmer.stem(word_org_2).lower()
    word_3 = stemmer.stem(word_org_3).lower()


    if word_org_1 in ['estimator', 'estimators']:  # Treating the case of estimator specificly
      word_1 = 'estimator'
    if word_org_2 in ['estimator', 'estimators']:
      word_2 = 'estimator'
    if word_org_3 in ['estimator', 'estimators']:
      word_3 = 'estimator'

    pos_1 = tokens[index].tag_       # Get each token's PoS (part of speech)
    dep_1 = tokens[index].dep_        # Get each token's dependency label
    pos_2 = tokens[index + 1].tag_
    dep_2 = tokens[index + 1].dep_
    pos_3 = tokens[index + 2].tag_
    dep_3 = tokens[index + 2].dep_

    if word_1 in visited:  # Meaning word_1 is a stemmed noun which we visited already


    #### USAGE OF SPECIFIC CONDITIONS DERIVED FROM ARTICLE file:///C:/Users/X260/Downloads/s00766-017-0270-1.pdf:

      dict_entites[word_1] = add_entity(word_1, word_org_1,
                                        pos_1, dep_1, dict_entites)  # Update dict_entites

      if 'NN' in pos_2: # If pos_2 is a noun:

        if 'VBG' in pos_1: # If the part of speech of word 1 is a verb:
          pos_1 = 'VBG'
        if 'nsubj' in dep_2: # If the dependency of word 2 is a nominal subject:
          dep_1 = 'nsubj'
        if 'compound' in dep_2: # If the dependency of word 2 is a compound noun:
          dep_1 = 'compound'

        dict_entites[word_1 + ' ' + word_2] = add_entity(word_1 + ' ' + word_2,
                    word_org_1 + ' ' + word_org_2, pos_1, dep_1, dict_entites) # update the dict with the new compound noun


      if 'NN' in pos_3: # If pos_3 is a noun:

        if 'VBG' in pos_1 or 'VBG' in pos_3: # if either pos_1 or pos_3 are gerund verbs:
          pos_1 = 'VBG'
        if 'nsubj' in dep_3: # if the dependency of word 2 is a nominal subject:
          dep_1 = 'nsubj'
        if 'compound' in dep_3: # if the dependency of word 3 is a compound noun:
          dep_1 = 'compound'

        dict_entites[word_1 + ' ' + word_2 + ' ' + word_3] = add_entity(word_1 + ' ' + word_2 + ' ' + word_3, word_org_1 + ' ' +
                           word_org_2 + ' ' + word_org_3, pos_1, dep_1, dict_entites) # update the dict with the new compound noun



  # Treating the last words:
  try:
    last_word_org_1 = tokens[-2].text
    last_word_org_2 = tokens[-1].text
    last_word_1 = stemmer.stem(last_word_org_1).lower()
    last_word_2 = stemmer.stem(last_word_org_2).lower()
    if last_word_1 in visited:
      if last_word_org_1 in ['estimator', 'estimators']:
        last_word_1 = 'estimator'
      if last_word_org_2 in ['estimator', 'estimators']:
        last_word_2 = 'estimator'
      last_pos_1 = tokens[-2].tag_
      last_dep_1 = tokens[-2].dep_
      last_pos_2 = tokens[-1].tag_
      last_dep_2 = tokens[-1].dep_
      dict_entites[last_word_1] = add_entity(last_word_1, last_word_org_1,
                                          last_pos_1, last_dep_1, dict_entites)
      if compound_check_pair(last_pos_1, last_pos_2):
        dict_entites[last_word_1 + ' ' + last_word_2] = add_entity(last_word_1 + ' ' + last_word_2,
                        last_word_org_1 + ' ' + last_word_org_2, last_pos_1, last_dep_1, dict_entites)

    if last_word_2 in visited:
      dict_entites[last_word_2] = add_entity(last_word_2, last_word_org_2,
                                        last_pos_2, last_dep_2, dict_entites)
  except:
    pass

In [23]:
def compound_check_pair(pos_1, pos_2):
  # checks if two words are likely a compound noun by checking thier PoSs.
  # returns True or False accordingly.
  if 'NN' in pos_1:
      if 'NN' in pos_2 or 'VBG' in pos_2 or 'IN' in pos_2 or 'JJ' in pos_2:
        return True

      elif 'NN' in pos_2:
        if 'VBG' in pos_1 or 'IN' in pos_1 or 'JJ' in pos_1:
          return True

      return False


def compound_check_triplet(pos_1, pos_2, pos_3):
  # checks if three words are likely a compound noun by checking thier PoSs.
  # returns True or False accordingly.
  if 'NN' in pos_1 and 'IN' in pos_2 and 'NN' in pos_3:
    return True
  return False

### N-Grams:

In [25]:
with open(user_story_dataset,encoding='cp1252') as f:
  lines = f.readlines() # List of all User stories seperated by '\n'.

# Initialize a dictionary to store the n-grams and their counts
ngram_counts = {}

for line in lines:
  words = line.split()
  tokenized_doc = [token for token in words if token.lower() not in stop_words]
  # Loop over each set of 3 adjacent words and count the n-grams
  for i in range(len(tokenized_doc) - 3):
    ngram = tuple(tokenized_doc[i : i + 4])
    if ngram in ngram_counts:
        ngram_counts[ngram] += 1
    else:
        ngram_counts[ngram] = 1

frequent_ngrams = [k for k, v in ngram_counts.items() if v > 1]
frequent_words_in_ngrams = set()
for ngram in frequent_ngrams:
  for word in ngram:
    frequent_words_in_ngrams.add(word)

### DF creating

In [26]:
# setting up the features per word
data = {'word': [], 'original word': [], 'count': [], 'noun': [], 'subject': [], 'compound': [],
        'gerund': [],'part_of_frequent_ngram':[],'target': []}
i=0
for key in tqdm(dict_entites.keys()):

  data['word'].append(key)
  data['original word'].append(dict_entites[key]['original_word'])
  data['count'].append(dict_entites[key]['count'])
  data['noun'].append(dict_entites[key]['noun'])
  data['subject'].append(dict_entites[key]['subject'])
  data['compound'].append(dict_entites[key]['compound'])
  data['gerund'].append(dict_entites[key]['gerund'])
  if key in frequent_words_in_ngrams:
    data['part_of_frequent_ngram'].append(1)
  else:
    data['part_of_frequent_ngram'].append(0)

  data['target'].append(dict_entites[key]['target'])
  i+=1
df = pd.DataFrame(data)

100%|██████████| 182/182 [00:00<00:00, 91322.33it/s]


In [27]:
# code for adding to an existing df 3 new features:
# User_Role: +1 for each time the word appeared in the user role section of the story (group 1)
# Action: +1 for each time the word appeared in the Action section of the story (group 2)
# Benefit: +1 for each time the word appeared in the Benefit section of the story (group 3)

frequency_dict = {}

pattern = r"As (?:an? )?(.+?), I want (.+?), so that (.+)"
for line in lines:

# match the pattern to the user story
  match = re.match(pattern, line)
  if match is not None:

    tokenized_user_role = [stemmer.stem(token.text).lower() for token in nlp(match.group(1)) if token.text.lower() not in stop_words]
    tokenized_action = [stemmer.stem(token.text).lower() for token in nlp(match.group(2)) if token.text.lower() not in stop_words]
    tokenized_benefit = [stemmer.stem(token.text).lower() for token in nlp(match.group(3)) if token.text.lower() not in stop_words]
    for word in tokenized_user_role:
      if word not in frequency_dict:
        frequency_dict[word] = [1,0,0]
      else:
        frequency_dict[word][0] += 1

    for word in tokenized_action:
      if word not in frequency_dict:
        frequency_dict[word] = [0,1,0]
      else:
        frequency_dict[word][1] += 1

    for word in tokenized_benefit :
      if word not in frequency_dict:
        frequency_dict[word] = [0,0,1]
      else:
        frequency_dict[word][2] += 1


frequency_dict = pd.DataFrame(frequency_dict).transpose().rename(columns={0: "User Role",1: "Action",2: "Benefit"}).rename_axis('word')

# merge the dataframes based on the 'word' column
df_merged = pd.merge(df, frequency_dict, on='word', how='left')

# group by 'word' and sum the values of the 3 new columns
df = df_merged.groupby('word').agg({'original word':'first',
                                           'count': 'first',
                                           'noun': 'first',
                                           'subject': 'first',
                                           'compound': 'first',
                                           'gerund': 'first',
                                           'part_of_frequent_ngram': 'first',
                                           'User Role': 'sum',
                                           'Action': 'sum',
                                           'Benefit': 'sum',
                                           'target':'first'}).reset_index()






### Standardize

In [30]:
# standardize the data
# df = pd.read_csv('/content/grocery.csv')
# Store original column names of DataFrame df
org_columns = df.columns

# Select a subset of columns from DataFrame df using indexing and store in a new variable
columns = df.columns[2:16]
scaler = MinMaxScaler()
columns_to_scale = columns

df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

# Reorder the columns of the DataFrame back to their original positions
df = df[org_columns]

In [31]:
df

Unnamed: 0,word,original word,count,noun,subject,compound,gerund,part_of_frequent_ngram,User Role,Action,Benefit,target
0,anomali,anomalies,0.018182,0.035714,0.00,0.00,0.0,0.0,0.0,0.00,0.142857,0.0
1,answer,answers,0.000000,0.017857,0.00,0.00,0.0,0.0,0.0,0.00,0.071429,0.0
2,answer urgent question,answers urgent questions,0.000000,0.017857,0.00,0.00,0.0,0.0,0.0,0.00,0.000000,0.0
3,app,app,0.018182,0.035714,0.00,0.00,0.0,0.0,0.0,0.00,0.142857,0.0
4,applic,application,0.000000,0.017857,0.00,0.00,0.0,0.0,0.0,0.05,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
177,time slot,time slot,0.000000,0.017857,0.00,0.25,0.0,0.0,0.0,0.00,0.000000,0.0
178,time slot deliveri,time slot delivery,0.000000,0.017857,0.00,0.25,0.0,0.0,0.0,0.00,0.000000,0.0
179,transpar,transparency,0.000000,0.017857,0.00,0.00,0.0,0.0,0.0,0.00,0.000000,0.0
180,transpar deliveri,transparency delivery,0.000000,0.017857,0.25,0.00,0.0,0.0,0.0,0.00,0.000000,0.0


### Write to csv

In [32]:
df.to_csv(OUTPUT_PATH, index=False)