In [1]:
import pandas as pd
import os
import sys

# Resolve the directory two levels above the current working directory and
# append it to the module search path. Presumably this is what makes the
# project-local imports below resolvable — TODO confirm the expected layout.
# NOTE(review): this depends on os.getcwd(), so the notebook must be started
# from its own directory for the relative '../..' to point at the right place.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '../..'))
sys.path.append(parent_dir)

from visual_genome.local import VisualGenome
from cross_validation import cross_validate
from metrics import results_to_mean_confidence_interval
from transformations import get_transforms

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Instantiate the local Visual Genome driver. Per the captured cell output,
# construction prints the script/data directories and loads the data.
# NOTE(review): `vg` is not referenced by the cells visible here — confirm it
# is still needed before keeping this (potentially slow) load.
vg = VisualGenome()

Script directory: C:\Users\karab\Desktop\Visual Genome Driver\visual_genome
Data directory: C:\Users\karab\Desktop\Visual Genome Driver\data
Loading data...
Data loaded.


In [3]:
# Notebook configuration: input files and the feature columns to keep.
IMAGES = "images.txt"  # sample
FEATURES = "../features.csv"  # all features
COLUMNS = [
    "image_id",
    "# of objects",
    "# of SAM segmentations",
    "# of FC-CLIP classes",
    "avg_region_similarity",
    "avg_rel_similarity",
    "predicted_complexity",
]

In [4]:
# Load the sample of image ids and the matching rows of the feature table.
# A file named by IMAGES ("images.txt") must exist; it is read as one integer
# image id per line.
with open(IMAGES, "r") as f:
    # Skip blank / whitespace-only lines (e.g. a trailing empty line at the end
    # of the file); int() would otherwise raise ValueError on them.
    image_ids = [int(line) for line in f if line.strip()]
with open(FEATURES, "r") as f:
    features = pd.read_csv(f)

features = features[COLUMNS]                                # get only the columns we need
features = features[features["image_id"].isin(image_ids)]   # get only the rows we need

# Rename the raw CSV headers to the identifier-style names used from here on.
features = features.rename(columns={"# of SAM segmentations": "num_seg_64points", "# of FC-CLIP classes": "num_classes", "# of objects": "num_objects"})
features

Unnamed: 0,image_id,num_objects,num_seg_64points,num_classes,avg_region_similarity,avg_rel_similarity,predicted_complexity
25,26,50,290,14,0.199446,0.340414,0.7170
435,436,41,250,12,0.183451,0.308560,0.7294
599,600,41,215,8,0.287671,0.346867,0.6593
617,618,42,263,10,0.303625,0.290254,0.6359
629,630,45,231,13,0.207449,0.253693,0.7101
...,...,...,...,...,...,...,...
106618,2416460,16,17,3,0.427889,0.558816,0.2180
106841,2416698,45,232,9,0.240227,0.269124,0.6267
106961,2416829,48,343,14,0.267573,0.287617,0.6584
107343,2417227,17,42,5,0.373228,0.335595,0.3537


### Cross-Validated Linear Regression

In [5]:
# Collects regression results keyed by dataset name (filled in further down).
dataset_result = {}

def run_regression(data, N, M):
    """Cross-validate on a transformed copy of ``data``.

    ``data`` is copied before anything else, so ``get_transforms`` is applied
    to the copy and not to the caller's object. ``N`` and ``M`` are passed
    through unchanged as keyword arguments to ``cross_validate``, whose result
    is returned as-is.
    """
    working_frame = data.copy()
    # The return value of get_transforms is not used; presumably it adds the
    # transformed columns to the frame in place — TODO confirm.
    get_transforms(working_frame)
    outcome = cross_validate(working_frame, N=N, M=M)
    return outcome

# Run the regression on the filtered feature table and store it under "vg".
# NOTE(review): N=3 and M=1 are forwarded to cross_validate; their meaning is
# not visible in this notebook — confirm against cross_validation.
dataset_result["vg"] = run_regression(features, N=3, M=1)

In [6]:
# Summarise the collected results, then print only the "spearman_test" entry
# for every model of every dataset.
results_stats = results_to_mean_confidence_interval(dataset_result)
print("Reporting Spearman correlations.")

for dataset_name, per_model in results_stats.items():
    print("\nDATASET: {}".format(dataset_name))
    for model_name, per_stat in per_model.items():
        # Each value unpacks into a pair; only the first element is printed.
        # Presumably (mean, confidence interval), going by the helper's name.
        for stat_name, (mean_value, _interval) in per_stat.items():
            if stat_name != "spearman_test":
                continue
            print(model_name, mean_value)

Reporting Spearman correlations.

DATASET: vg
sqrt_seg_64points 0.8386251004668753
sqrt_num_classes 0.7517135851415208
sqrt_seg_64points + sqrt_num_classes 0.8549069060180433
sqrt_seg_64points_x_sqrt_num_classes 0.837368986352011
sqrt_seg_64points_x_sqrt_num_classes + sqrt_seg_64points + sqrt_num_classes 0.853011291625036
avg_region_similarity 0.5194889290519447
avg_rel_similarity 0.5340093025731418
avg_region_similarity + avg_rel_similarity 0.552878299279408
sqrt_seg_64points + avg_region_similarity 0.8492408847035916
sqrt_seg_64points + avg_rel_similarity 0.8486722223375827
sqrt_num_classes + avg_region_similarity 0.7521693716527064
sqrt_num_classes + avg_rel_similarity 0.7532279751608902
sqrt_seg_64points + sqrt_num_classes + avg_region_similarity 0.8584352635535445
sqrt_seg_64points + sqrt_num_classes + avg_rel_similarity 0.8590514979425553
sqrt_seg_64points + sqrt_num_classes + avg_region_similarity + avg_rel_similarity 0.859644432091096
seg_64points_norm 0.8386251004668753
seg_64