In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Input CSV files (relative paths, resolved against the current working directory).
DEPENDENT_DATA = "./AllData.csv"  # loaded into df_d (dependent requirements)
INDEPENDENT_DATA = "AllIndependentData.csv"  # loaded into df_i (independent requirements)
DEPENDENT_PAIRS = "Release_Level1PairsReadyData.csv"  # loaded into df_dp (dependent pairs)
DEPENDENT_PAIRS2 = "Release_Level2PairsReadyData.csv"  # loaded into df_dp2, then combined with df_dp
INDEPENDENT_PAIRS = "negSamplePairs.csv"  # loaded into df_ip (independent pairs)

# Reading the data

In [None]:
# Load the requirement tables and the requirement-pair tables from CSV.
df_d = pd.read_csv(DEPENDENT_DATA, low_memory=False)
df_i = pd.read_csv(INDEPENDENT_DATA)
df_dp = pd.read_csv(DEPENDENT_PAIRS)
df_dp2 = pd.read_csv(DEPENDENT_PAIRS2)
df_ip = pd.read_csv(INDEPENDENT_PAIRS)

# Stack both dependent-pair files into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. As with append, the original row labels of both frames are
# kept, so the combined index contains repeated labels.
df_dp = pd.concat([df_dp, df_dp2])

### Printing out some initial Statistics

In [None]:
# Row counts of the four loaded tables.
n_dependent = len(df_d)
n_independent = len(df_i)
n_dependent_pairs = len(df_dp)
n_independent_pairs = len(df_ip)

# Number of dependent requirements with a non-null "duplicates" entry.
n_duplicates = len(df_d[df_d["duplicates"].notnull()])

print("Total amount of data points for dependent requirements: {}".format(n_dependent))
print("Total amount of data points for indepedent requirements: {}".format(n_independent))
print("Total amount of data points for dependent pairs: {}".format(n_dependent_pairs))
print("Total amount of data points for independent pairs: {}".format(n_independent_pairs))

print("Total amount of \'duplicate data\' points: " + str(n_duplicates))

## Some more preprocessing 

In [None]:
# Keep only the independent requirements whose "type" is "enhancement".
is_enhancement = df_i["type"] == "enhancement"
df_i = df_i[is_enhancement]

# Independent statistics

In [None]:
#american = df['nationality'] == "USA"
#df[df['first_name'].notnull() & (df['nationality'] == "USA")]

### Plot showing the product variation of independent requirements

In [None]:
# Bar chart of the number of independent requirements per product.
# The figure handle was previously bound to the mistyped name `ig`; it is
# now `fig`, and the figure size is set when the figure is created.
fig, ax = plt.subplots(figsize=(20, 10))
df_i["product"].value_counts().plot(ax=ax, kind="bar", title="Product count for Independent Requirements")

### Showing normalized counts of columns "type", "product", "priority", "classification"

In [None]:
# Number of independent requirements.
total_records = len(df_i)

# Relative frequency of each value in the categorical columns.
type_count = df_i["type"].value_counts(normalize=True)
product_count = df_i["product"].value_counts(normalize=True)
priority_count = df_i["priority"].value_counts(normalize=True)
classification_count = df_i["classification"].value_counts(normalize=True)

# type_count is computed but not printed in this section.
sections = (
    ("Product Count", product_count),
    ("Priority Count", priority_count),
    ("Classification Count", classification_count),
)
for heading, shares in sections:
    print(heading)
    print(shares)
    print("---------------------------")
print("---------------------------")

# Dependent statistics

### Plot showing the product variation of dependent requirements

In [None]:
# Bar chart of the number of dependent requirements per product.
# The figure handle was previously bound to the mistyped name `ig`; it is
# now `fig`, and the figure size is set when the figure is created.
fig, ax = plt.subplots(figsize=(20, 10))
df_d["product"].value_counts().plot(ax=ax, kind="bar", title="Product count for Dependent Requirements")

### Showing normalized counts of columns "type", "product", "priority", "classification"

In [None]:
# Number of dependent requirements (also used by the duplicate-ratio cell).
total_records = len(df_d)

# Relative frequency of each value in the categorical columns.
type_count = df_d["type"].value_counts(normalize=True)
product_count = df_d["product"].value_counts(normalize=True)
priority_count = df_d["priority"].value_counts(normalize=True)
classification_count = df_d["classification"].value_counts(normalize=True)

sections = (
    ("Type Count", type_count),
    ("Product Count", product_count),
    ("Priority Count", priority_count),
    ("Classification Count", classification_count),
)
for heading, shares in sections:
    print(heading)
    print(shares)
    print("---------------------------")


### Computing the fraction of duplicates

In [None]:
# Fraction of dependent requirements that have a non-null "duplicates" entry.
# The denominator is taken from df_d itself rather than from the global
# `total_records`, which is reassigned in more than one cell and therefore
# depends on the order in which cells were executed.
len(df_d[df_d["duplicates"].notnull()]) / len(df_d)

### Finding cross-project requirement interdependencies

In [None]:
# Percentage of dependent pairs whose two requirements have different products.
is_cross_product = df_dp["req1Product"] != df_dp["req2Product"]
n_cross_pairs = len(df_dp[is_cross_product])
cross_project = (n_cross_pairs / len(df_dp)) * 100

print("The amount of cross_project dependencies is {:.2f}%".format(cross_project))

### Measure the length (in words) of the requirement summaries

In [None]:
# Number of space-separated tokens in each requirement summary.
# Splitting on a single " " mirrors str.split(" "): consecutive spaces yield
# empty tokens, which are counted as well.
# The computation is vectorised, so the result carries df_d's own index and the
# assignment below cannot misalign rows. Summaries that are missing (or not
# strings) are counted as 0 words instead of raising inside a row loop.
counts_pd = df_d["summary"].str.split(" ").str.len().fillna(0).astype(int)
df_d["summary_values"] = counts_pd

In [None]:
# Products in order of first appearance in df_d.
unique_products = df_d["product"].unique()

# Mean summary word count per product, computed in one grouped pass instead of
# re-scanning every row of df_d once per product. sort=False keeps the groups
# in order of first appearance, i.e. the same order as unique_products.
# Rows with a missing product are left out by groupby's default, so they no
# longer lead to a division by zero.
mean_words_by_product = df_d.groupby("product", sort=False)["summary_values"].mean()

# Same output shape as before: one {product: "mean with 2 decimals"} dict per product.
totals = [
    {prod: "{:.2f}".format(average)}
    for prod, average in mean_words_by_product.items()
]

totals

## Dropping columns so that dataframes are the same when we append

In [None]:
# Columns removed from the independent pairs before the pair sets are combined.
# NOTE(review): "req2Creation_time" is spelled differently from
# "req1CreationTime" — confirm it matches the CSV header.
unimportant_columns = [
    "req1CreationTime",
    "req2Creation_time",
    "req1Keywords",
    "req2Keywords",
    "req1Class",
    "req2Class",
]
df_ip = df_ip.drop(unimportant_columns, axis="columns")

In [None]:
# Similarity columns removed from the dependent pairs before the pair sets are combined.
unimportant_columns = [
    "CosSimilarity",
    "SemSimilarity",
]
df_dp = df_dp.drop(unimportant_columns, axis="columns")

### Only selecting those requirement pairs within the same project

In [None]:
# Keep only the pairs whose two requirements share the same product.
same_product_dp = df_dp["req1Product"] == df_dp["req2Product"]
same_product_ip = df_ip["req1Product"] == df_ip["req2Product"]

df_dp = df_dp[same_product_dp]
df_ip = df_ip[same_product_ip]

### Adding the independent data set to the dependent data set 

In [None]:
# Switch for the next cell: when True, the combined pair set is rebuilt and
# written to "ModelPairs.csv"; when False, that cell does nothing.
create_full_set = False

In [None]:
if create_full_set:
    # Not used below; kept so this cell still defines the same names as before.
    all_ip = df_ip.copy()

    # Draw as many independent pairs as there are dependent pairs, without
    # repetition. random.sample picks distinct positions in [0, len(df_ip)),
    # so the position one past the end (which randint(0, len(df_ip)) could
    # return) is never requested, and a ValueError is raised up front when
    # df_ip has fewer rows than df_dp instead of looping forever.
    # NOTE(review): no seed is set, so each rebuild draws a different sample.
    total = len(df_dp)
    indexes_added = random.sample(range(len(df_ip)), total)

    # One concat instead of appending row by row: DataFrame.append was removed
    # in pandas 2.0 and copied the whole frame on every call.
    all_dp = pd.concat([df_dp, df_ip.iloc[indexes_added]])

    all_dp.to_csv("ModelPairs.csv")

# Feature Selection and Processing - Preparation

In [None]:
# Load the combined pair set from disk. The file is only written by the cell
# guarded by create_full_set, so it must already exist when that flag is False.
# NOTE(review): the file is written with to_csv's default index column, so
# expect an extra unnamed first column after reading it back — confirm.
all_dp = pd.read_csv("ModelPairs.csv", low_memory = False)

In [None]:
# Display the column labels of the combined pair set.
all_dp.columns