## Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval

### Global Variables

In [None]:
# Relative paths of the CSV inputs read by the cells below.
binary_results, multiclass_results = (
    "Results/PredictionScores_BinaryClass.csv",
    "Results/PredictionScores_MultiClass.csv",
)
cross_project_dependencies, dependent_pairs = (
    "Cross-Project-Dependencies.csv",
    "Results/AllDependentPairs.csv",
)

### Import Model Predictions

In [None]:
# Load the binary-class and multi-class prediction scores.
df_predictions = pd.read_csv(filepath_or_buffer=binary_results)
df_multiclass_predictions = pd.read_csv(filepath_or_buffer=multiclass_results)

# Load the per-project cross-project dependency table.
df_cross_project = pd.read_csv(filepath_or_buffer=cross_project_dependencies)

# low_memory=False turns off chunked parsing, so each column's dtype is inferred
# from the whole file rather than chunk by chunk.
df_pairs = pd.read_csv(
    filepath_or_buffer="Teddy_Data/ModelPairs.csv",
    low_memory=False,
)

In [None]:
# Distinct values of the "Train Project" column in the binary-class results.
df_predictions.loc[:, "Train Project"].unique()

### Select the predictions we care about

In [None]:
# Project-name groups; later cells pass them to `isin` filters on "Test Project".
# NOTE(review): the "Test Least" cell filters on `least_projects`, which is not
# defined in any visible cell -- define it here next to the other groups.
most_projects = [
    "Core",
    "Firefox",
    "Thunderbird",
    "Bugzilla",
    # Spelled "SeaMonkey" to match `train_projects` below; `isin` is an exact,
    # case-sensitive match, so "Seamonkey" would never select that project.
    "SeaMonkey",
    "DevTools",
    "MailNews Core",
    "Toolkit",
    "Testing",
    "Infrastructure & Operations",
    "NSS",
]

# Not referenced by any visible cell.
train_projects = [
    "Core",
    "MailNews Core",
    "SeaMonkey",
    "Bugzilla",
    "Firefox",
    "Other Applications",
    "NSS",
    "Calendar",
    "Thunderbird",
    "Toolkit",
    "NSPR",
    "Testing",
    "Firefox Build System",
    "Webtools",
]

# "Developer Documentation" used to be listed twice; one entry is enough because
# the list is only used for membership tests.
middle_projects = [
    "developer.mozilla.org",
    "Conduit",
    "Developer Documentation",
    "Participation Infrastructure",
    "Firefox for iOS",
    "NSPR",
    "mozilla.org",
    "Mozilla Foundation Communications",
    "Data Science",
    "Localization Infrastructure and Tools",
]

# Not referenced by any visible cell; presumably axis tick positions.
x_ticks = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [None]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV).
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df_predictions = df_predictions.drop(columns="Unnamed: 0", errors="ignore")

# Display Heat Map (Binary Class Predictions)

### Test Most

In [None]:
# Keep the rows whose test project is in `most_projects` and drop exact duplicate
# rows (first occurrence kept); pivot raises ValueError on repeated
# (index, column) pairs.
df = df_predictions[df_predictions["Test Project"].isin(most_projects)]
df = df.drop_duplicates()
# Reshape to a train-project x test-project score matrix. The arguments must be
# passed by keyword: DataFrame.pivot is keyword-only from pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
ax = sns.heatmap(df, cmap="YlGnBu")
ax.set_title("Most Prediction Scores (BinaryClass)")
plt.show()

### Test Least

In [None]:
# NOTE(review): `least_projects` is not defined in any visible cell, so this cell
# raises NameError on a fresh kernel. Define it next to `most_projects` and
# `middle_projects` before running.
# Keep the rows whose test project is in `least_projects` and drop exact duplicate
# rows (first occurrence kept); pivot raises ValueError on repeated
# (index, column) pairs.
df = df_predictions[df_predictions["Test Project"].isin(least_projects)]
df = df.drop_duplicates()
# Reshape to a train-project x test-project score matrix. The arguments must be
# passed by keyword: DataFrame.pivot is keyword-only from pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
ax = sns.heatmap(df, cmap="YlGnBu")
ax.set_title("Least Prediction Scores (BinaryClass)")
plt.show()

### Test Middle

In [None]:
# Keep the rows whose test project is in `middle_projects` and drop exact
# duplicate rows (first occurrence kept); pivot raises ValueError on repeated
# (index, column) pairs.
df = df_predictions[df_predictions["Test Project"].isin(middle_projects)]
df = df.drop_duplicates()
# Reshape to a train-project x test-project score matrix. The arguments must be
# passed by keyword: DataFrame.pivot is keyword-only from pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
ax = sns.heatmap(df, cmap="YlGnBu")
ax.set_title("Median Prediction Scores (BinaryClass)")
plt.show()

### Everything

In [None]:
# Heat map of every binary-class train/test pair.
# Exact duplicate rows are dropped first (first occurrence kept) because pivot
# raises ValueError on repeated (index, column) pairs.
df = df_predictions.drop_duplicates()
# The arguments must be passed by keyword: DataFrame.pivot is keyword-only from
# pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
# Large canvas so the full project-by-project matrix stays legible.
fig, ax = plt.subplots(figsize=(25, 15))
ax = sns.heatmap(df, cmap="YlGnBu", ax=ax)
ax.set_title("Prediction Scores (BinaryClass)")
plt.show()

# Display Heat Map (Multi Class Predictions)

### Test most

In [None]:
# Multi-class scores restricted to the test projects in `most_projects`.
df = df_multiclass_predictions[df_multiclass_predictions["Test Project"].isin(most_projects)]
# Drop exact duplicate rows (first occurrence kept); pivot raises ValueError on
# repeated (index, column) pairs.
df = df.drop_duplicates()
# The arguments must be passed by keyword: DataFrame.pivot is keyword-only from
# pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
ax = sns.heatmap(df, cmap="YlGnBu")
ax.set_title("Prediction Scores (MultiClass)")
plt.show()

### Test middle

In [None]:
# Multi-class scores restricted to the test projects in `middle_projects`.
df = df_multiclass_predictions[df_multiclass_predictions["Test Project"].isin(middle_projects)]
# Drop exact duplicate rows (first occurrence kept); pivot raises ValueError on
# repeated (index, column) pairs.
df = df.drop_duplicates()
# The arguments must be passed by keyword: DataFrame.pivot is keyword-only from
# pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
ax = sns.heatmap(df, cmap="YlGnBu")
ax.set_title("Prediction Scores (MultiClass)")
plt.show()

### Everything

In [None]:
# Heat map of every multi-class train/test pair.
# Exact duplicate rows are dropped first, as in the other heat-map cells: pivot
# raises ValueError on repeated (index, column) pairs.
df = df_multiclass_predictions.drop_duplicates()
# The arguments must be passed by keyword: DataFrame.pivot is keyword-only from
# pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
# Large canvas so the full project-by-project matrix stays legible.
fig, ax = plt.subplots(figsize=(25, 15))
ax = sns.heatmap(df, cmap="YlGnBu", ax=ax)
ax.set_title("Prediction Scores (Multi Class)")
plt.show()

# Looking at cross-project dependencies

## Multiclass

In [None]:
# Row 0 of df_cross_project is assumed to be the "Core" entry -- TODO confirm.
s = df_cross_project.iloc[0]["Cross-Project Dependencies"]
df = df_multiclass_predictions[df_multiclass_predictions["Train Project"] == "Core"]
# literal_eval turns the stored string into a Python literal (presumably a list
# of project names) that is used as the `isin` filter.
df = df[df["Test Project"].isin(literal_eval(s))]
# Drop exact duplicate rows (first occurrence kept); pivot raises ValueError on
# repeated (index, column) pairs.
df = df.drop_duplicates()
# The arguments must be passed by keyword: DataFrame.pivot is keyword-only from
# pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
fig, ax = plt.subplots(figsize=(25, 15))
# Draw on the axes created above instead of relying on the implicit current axes.
ax = sns.heatmap(df, cmap="YlGnBu", ax=ax)
ax.set_title("Prediction Scores (MultiClass)")
plt.show()

## Binary Class

In [None]:
# Row 0 of df_cross_project is assumed to be the "Core" entry -- TODO confirm.
s = df_cross_project.iloc[0]["Cross-Project Dependencies"]
df = df_predictions[df_predictions["Train Project"] == "Core"]
# literal_eval turns the stored string into a Python literal (presumably a list
# of project names) that is used as the `isin` filter.
df = df[df["Test Project"].isin(literal_eval(s))]
# Drop exact duplicate rows (first occurrence kept); pivot raises ValueError on
# repeated (index, column) pairs.
df = df.drop_duplicates()
# The arguments must be passed by keyword: DataFrame.pivot is keyword-only from
# pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
fig, ax = plt.subplots(figsize=(25, 15))
# Draw on the axes created above instead of relying on the implicit current axes.
ax = sns.heatmap(df, cmap="YlGnBu", ax=ax)
ax.set_title("Prediction Scores (Binary Class)")
plt.show()

### Firefox Comparisons

In [None]:
# Row 2 of df_cross_project is assumed to be the "Firefox" entry -- TODO confirm.
s = df_cross_project.iloc[2]["Cross-Project Dependencies"]

# Firefox-trained rows whose test project appears in the parsed dependency value,
# with exact duplicate rows removed (first occurrence kept).
df = (
    df_predictions[df_predictions["Train Project"] == "Firefox"]
    .loc[lambda frame: frame["Test Project"].isin(literal_eval(s))]
    .drop_duplicates(subset=None, keep="first", inplace=False)
)

average = np.average(df["Prediction Score"])

print(
    "Firefox has {} cross-project dependencies of these projects, the average score is {:.2f}".format(
        len(df["Prediction Score"]),
        average,
    )
)

In [None]:
# Row 2 of df_cross_project is assumed to be the "Firefox" entry -- TODO confirm.
s = df_cross_project.iloc[2]["Cross-Project Dependencies"]
df = df_predictions[df_predictions["Train Project"] == "Firefox"]
# `~` inverts the mask: keep the test projects that are NOT in the dependency list.
df = df[~df["Test Project"].isin(literal_eval(s))]
df = df.drop_duplicates()

average = np.average(df["Prediction Score"])

# The message previously called these rows "cross-project dependencies", although
# this cell selects the complement of that set.
print("Firefox has {} projects without cross-project dependencies, the average score is {:.2f}".format(len(df["Prediction Score"]), average))

In [None]:
# Row 2 of df_cross_project is assumed to be the "Firefox" entry -- TODO confirm.
s = df_cross_project.iloc[2]["Cross-Project Dependencies"]
df = df_predictions[df_predictions["Train Project"] == "Firefox"]
# literal_eval turns the stored string into a Python literal (presumably a list
# of project names) that is used as the `isin` filter.
df = df[df["Test Project"].isin(literal_eval(s))]
# Drop exact duplicate rows (first occurrence kept); pivot raises ValueError on
# repeated (index, column) pairs.
df = df.drop_duplicates()
# The arguments must be passed by keyword: DataFrame.pivot is keyword-only from
# pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
fig, ax = plt.subplots(figsize=(25, 10))
# Draw on the axes created above instead of relying on the implicit current axes.
ax = sns.heatmap(df, cmap="YlGnBu", ax=ax)
ax.set_title("Firefox Prediction Scores for Cross-Dependent Projects (Binary Class)")
plt.show()

In [None]:
# Row 2 of df_cross_project is assumed to be the "Firefox" entry -- TODO confirm.
s = df_cross_project.iloc[2]["Cross-Project Dependencies"]
df = df_predictions[df_predictions["Train Project"] == "Firefox"]
# `~` inverts the mask: keep the test projects that are NOT in the dependency list.
df = df[~df["Test Project"].isin(literal_eval(s))]
# Drop exact duplicate rows (first occurrence kept); pivot raises ValueError on
# repeated (index, column) pairs.
df = df.drop_duplicates()
# The arguments must be passed by keyword: DataFrame.pivot is keyword-only from
# pandas 2.0 on.
df = df.pivot(index="Train Project", columns="Test Project", values="Prediction Score")
fig, ax = plt.subplots(figsize=(25, 10))
# Draw on the axes created above instead of relying on the implicit current axes.
ax = sns.heatmap(df, cmap="YlGnBu", ax=ax)
ax.set_title("Firefox Prediction Scores for Non Cross-Dependent Projects (Binary Class)")
plt.show()

### MailNews Core

In [None]:
# Row 1 of df_cross_project is assumed to be the "MailNews Core" entry -- TODO confirm.
s = df_cross_project.iloc[1]["Cross-Project Dependencies"]
df = df_predictions[df_predictions["Train Project"] == "MailNews Core"]
# literal_eval turns the stored string into a Python literal (presumably a list
# of project names) that is used as the `isin` filter.
df = df[df["Test Project"].isin(literal_eval(s))]
df = df.drop_duplicates()

average = np.average(df["Prediction Score"])

# The message previously said "Firefox", copied from the Firefox cell; this cell
# reports on MailNews Core.
print("MailNews Core has {} cross-project dependencies of these projects, the average score is {:.2f}".format(len(df["Prediction Score"]), average))

In [None]:
# Row 1 of df_cross_project is assumed to be the "MailNews Core" entry -- TODO confirm.
s = df_cross_project.iloc[1]["Cross-Project Dependencies"]
df = df_predictions[df_predictions["Train Project"] == "MailNews Core"]
# `~` inverts the mask: keep the test projects that are NOT in the dependency list.
df = df[~df["Test Project"].isin(literal_eval(s))]
df = df.drop_duplicates()

average = np.average(df["Prediction Score"])

# The message previously said "Firefox" and called these rows "cross-project
# dependencies"; this cell reports MailNews Core's non-dependent projects.
print("MailNews Core has {} projects without cross-project dependencies, the average score is {:.2f}".format(len(df["Prediction Score"]), average))

# Analyzing Prediction Scores from cross-project dependencies

In [None]:
# Build one summary row per project that appears both in df_cross_project and as
# a "Train Project" in the binary-class results. Each row compares the average
# score on the project's cross-dependent test projects with the average on all
# its other test projects. Only binary-class scores are summarised here.
comparison_columns = [
    "Train Project",
    "Total Projects",
    "Cross-Dependent Projects",
    "Binary Class Cross-Dependent Average Prediction Score",
    "Binary Class Non Cross-Dependent Average Prediction Score",
]

# De-duplicate once up front. Dropping exact duplicate rows before filtering
# gives the same rows as filtering each subset and de-duplicating afterwards.
unique_predictions = df_predictions.drop_duplicates()
known_train_projects = set(unique_predictions["Train Project"].dropna())

# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0.
comparison_rows = []
for df_name, raw_dependencies in zip(
    df_cross_project["Project"], df_cross_project["Cross-Project Dependencies"]
):
    if df_name not in known_train_projects:
        continue

    # The stored string is parsed once per project (presumably a list of names).
    cross_projects = literal_eval(raw_dependencies)

    trained_on_project = unique_predictions[unique_predictions["Train Project"] == df_name]
    is_cross_dependent = trained_on_project["Test Project"].isin(cross_projects)
    cross_scores = trained_on_project.loc[is_cross_dependent, "Prediction Score"]
    non_cross_scores = trained_on_project.loc[~is_cross_dependent, "Prediction Score"]

    cross_count = len(cross_scores)
    non_cross_count = len(non_cross_scores)

    comparison_rows.append(
        {
            "Train Project": df_name,
            "Total Projects": cross_count + non_cross_count,
            "Cross-Dependent Projects": cross_count,
            # Averages are stored as two-decimal strings, so an empty subset shows as "nan".
            "Binary Class Cross-Dependent Average Prediction Score": "{:.2f}".format(np.average(cross_scores)),
            "Binary Class Non Cross-Dependent Average Prediction Score": "{:.2f}".format(np.average(non_cross_scores)),
        }
    )

# Passing `columns` fixes the column order and keeps the headers when no project matched.
comparison_df = pd.DataFrame(comparison_rows, columns=comparison_columns)
    

In [None]:
# Display the per-project comparison table built in the previous cell.
comparison_df

### Dependent Pairs