In [7]:
import torch
from torch import nn

import torchvision
from torchvision.datasets import ImageFolder

from torchvision import transforms

from torch.utils.data import DataLoader
from pathlib import Path
import pandas as pd

In [2]:
import sys
sys.path.append("..")

In [3]:
from video_classification.datasets import FolderOfFrameFoldersDataset, FrameWindowDataset

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
ROOT = Path("/home/ubuntu/SupervisedVideoClassification")
DATA_ROOT = Path(ROOT/"data")

In [6]:
valid_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]),
])

In [7]:
valid_ds = FolderOfFrameFoldersDataset(DATA_ROOT/'validation', 
                                       transform=valid_transforms, 
                                       base_class=FrameWindowDataset,
                                       window_size=3,
                                       overlapping=True,)

In [8]:
from torch import nn
from torchvision.models import resnet101
from video_classification.models.mlp import MLP


class SingleImageResNetModel(nn.Module):
    def __init__(self, mlp_sizes=[768, 128, 2]):
        super().__init__()
        resnet = resnet101(pretrained=True)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)

        self.clf = MLP(2048, mlp_sizes)
        self.freeze_resnet()

    def forward(self, x):
        x = self.resnet(x).squeeze()
        x = self.clf(x)
        return x

    def freeze_resnet(self):
        for p in self.resnet.parameters():
            p.requires_grad = False

    def unfreeze_resnet(self):
        for p in self.resnet.parameters():
            p.requires_grad = True


In [9]:
import torch
from torch import nn
from video_classification.models.mlp import MLP


class MultiImageModel(nn.Module):
    def __init__(self,
                 window_size=3,
                 single_mlp_sizes=[768, 128],
                 joint_mlp_sizes=[64, 2]):
        super().__init__()
        self.window_size = window_size
        self.single_mlp_sizes = single_mlp_sizes
        self.joint_mlp_sizes = joint_mlp_sizes
        
        self.single_image_model = SingleImageResNetModel(self.single_mlp_sizes)
        self.in_features = self.single_mlp_sizes[-1] * self.window_size
        self.clf = MLP(self.in_features, joint_mlp_sizes)

    def forward(self, x):
        # x is of size [B, T, C, H, W]. In other words, a batch of windows.
        # each img for the same window goes through SingleImageModel
        x = x.transpose(0, 1)  # -> [T, B, C, H, W]
        x = torch.cat([self.single_image_model(window) for window in x], 1)
        # x is now of size [B, T * single_mlp_sizes[-1]]
        
        x = self.clf(x)
        # Now size is [B, joint_mlp_sizes[-1]] which should always be 2

        return x
    
    def freeze_single_image_model(self):
        # Freeze the VGG classifier
        for p in self.single_image_model.parameters():
            p.requires_grad = False
            
    def unfreeze_single_image_model(self):
        # Unfreeze the VGG classifier. Training the whole VGG is a no-go, so we only train the classifier part.
        for p in self.single_image_model.clf.parameters():
            p.requires_grad = True 

In [10]:
best_single_image_model = SingleImageResNetModel(mlp_sizes=[1024, 256, 2])

best_single_image_model.load_state_dict(torch.load(ROOT/"checkpoints/single_frame_resnet_SingleImageResNetModel_39_f1=0.8797445.pth"))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [11]:
best_triple_image_model = MultiImageModel(
                 window_size=3,
                 single_mlp_sizes=[1024, 256],
                 joint_mlp_sizes=[128, 2])
best_triple_image_model.load_state_dict(torch.load(ROOT/"checkpoints/multi_frame_resnet101_from_scratch_MultiImageModel_33_f1=0.8941237.pth"))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [12]:
best_single_image_model = best_single_image_model.to(device)
best_triple_image_model = best_triple_image_model.to(device)

In [13]:
x = torch.stack([valid_ds[0][0], valid_ds[1][0], valid_ds[2][0], valid_ds[3][0]]).to(device)

# Error Analysis

In [51]:
"""
from tqdm import tqdm
import numpy as np

valid_loader = DataLoader(valid_ds, batch_size=128, shuffle=False)
single_img_probs = []
triple_img_probs = []
y_true = []
with torch.no_grad():
    for i, (x, y) in enumerate(tqdm(valid_loader)):
        x = x.to(device) # B, T, C, H, W
        
        single_img_batch_pred = torch.softmax(best_single_image_model(x[:, -1, :, :, :]), dim=-1).cpu().tolist()
        triple_img_batch_pred = torch.softmax(best_triple_image_model(x), dim=-1).cpu().tolist()
        batch_true = y.tolist()
        single_img_probs.extend((y for y in single_img_batch_pred))
        triple_img_probs.extend((y for y in triple_img_batch_pred))
        y_true.extend((y for y in batch_true))
        
single_img_probs = np.array(single_img_probs)
single_img_pred = np.argmax(single_img_probs, 1)

triple_img_probs = np.array(triple_img_probs)
triple_img_pred = np.argmax(triple_img_probs, 1)
y_true = np.array(y_true)

import pandas as pd

df = pd.DataFrame.from_dict({
    'single_prob': single_img_probs[:, 1].tolist(),
    'single_pred': single_img_pred.tolist(),
    'triple_prob': triple_img_probs[:, 1].tolist(),
    'triple_pred': triple_img_pred.tolist(),
    'y_true': y_true,
})

"""

"\nfrom tqdm import tqdm\nimport numpy as np\n\nvalid_loader = DataLoader(valid_ds, batch_size=128, shuffle=False)\nsingle_img_probs = []\ntriple_img_probs = []\ny_true = []\nwith torch.no_grad():\n    for i, (x, y) in enumerate(tqdm(valid_loader)):\n        x = x.to(device) # B, T, C, H, W\n        \n        single_img_batch_pred = torch.softmax(best_single_image_model(x[:, -1, :, :, :]), dim=-1).cpu().tolist()\n        triple_img_batch_pred = torch.softmax(best_triple_image_model(x), dim=-1).cpu().tolist()\n        batch_true = y.tolist()\n        single_img_probs.extend((y for y in single_img_batch_pred))\n        triple_img_probs.extend((y for y in triple_img_batch_pred))\n        y_true.extend((y for y in batch_true))\n        \nsingle_img_probs = np.array(single_img_probs)\nsingle_img_pred = np.argmax(single_img_probs, 1)\n\ntriple_img_probs = np.array(triple_img_probs)\ntriple_img_pred = np.argmax(triple_img_probs, 1)\ny_true = np.array(y_true)\n\nimport pandas as pd\n\ndf = pd.

Already computed, no point in stressing the GPU.

In [8]:
df = pd.read_csv("resnet_error_analysis.csv")

In [9]:
df.head()

Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true
0,0.665277,1,0.728379,1,0
1,0.013133,0,0.065744,0,0
2,0.06027,0,0.050257,0,0
3,0.139477,0,0.03888,0,0
4,0.105147,0,0.013103,0,0


In [10]:
# df.to_csv("resnet_error_analysis.csv", index=False)

In [11]:
triple_better = df.query("triple_pred == y_true and single_pred != y_true")

In [12]:
triple_better.query("y_true == 1")

Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true
165,0.249619,0,0.703304,1,1
166,0.207608,0,0.969881,1,1
1000,0.479315,0,0.896553,1,1
1001,0.375707,0,0.99558,1,1
1002,0.000352,0,0.526612,1,1
1161,0.018975,0,0.955341,1,1
1441,0.22574,0,0.605734,1,1
1446,0.000369,0,0.546458,1,1
1645,0.46654,0,0.958764,1,1
1648,0.133172,0,0.971643,1,1


In [13]:
valid_ds = FolderOfFrameFoldersDataset(DATA_ROOT/'validation', 
                                       transform=transforms.ToTensor(), 
                                       base_class=FrameWindowDataset,
                                       window_size=3,
                                       overlapping=True,)


def to_pil(img_num):
    to_pil_f = transforms.ToPILImage()
    return to_pil_f(valid_ds[img_num][0][-1])

In [15]:
df.iloc[178:198]

Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true
178,1.7e-05,0,0.000158,0,0
179,0.007526,0,0.000491,0,0
180,0.0061,0,0.000296,0,0
181,0.000919,0,0.001303,0,0
182,0.000234,0,0.000536,0,0
183,0.817705,1,0.009901,0,0
184,0.083664,0,0.015741,0,0
185,0.005384,0,0.216451,0,0
186,0.045557,0,0.022155,0,0
187,0.00717,0,0.025665,0,0


In [17]:
df.iloc[2165:2175]

Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true
2165,0.07251,0,0.00224,0,0
2166,0.104041,0,0.000406,0,0
2167,0.220803,0,0.034284,0,0
2168,0.365786,0,0.274351,0,1
2169,0.546352,1,0.304895,0,1
2170,0.34513,0,0.747933,1,1
2171,0.061634,0,0.015091,0,1
2172,0.087571,0,0.001173,0,1
2173,0.004598,0,0.017378,0,1
2174,0.011133,0,0.003423,0,1


This ad-hoc analysis does not seem to yield any visible patterns. Triple does better than Single on seemingly random images.

To reduce our search space, we should filter out to the Triple model making at least 3 successful predictions in a row, and compare against first the rows for which Single makes 0 correct predictions, or just 1 and take it from there.

Maybe `pd.rolling` can be of use for this?

In [18]:
df.head()

Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true
0,0.665277,1,0.728379,1,0
1,0.013133,0,0.065744,0,0
2,0.06027,0,0.050257,0,0
3,0.139477,0,0.03888,0,0
4,0.105147,0,0.013103,0,0


In [22]:
df['triple_win_sum'] = df['triple_pred'].rolling(window=3).sum()

In [23]:
df['single_win_sum'] = df['single_pred'].rolling(window=3).sum()

In [24]:
df.head()

Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true,triple_win_sum,single_win_sum
0,0.665277,1,0.728379,1,0,,
1,0.013133,0,0.065744,0,0,,
2,0.06027,0,0.050257,0,0,1.0,1.0
3,0.139477,0,0.03888,0,0,0.0,0.0
4,0.105147,0,0.013103,0,0,0.0,0.0


In [26]:
triple_better = df.query("triple_pred == y_true and triple_win_sum >=3 and single_pred != y_true")
triple_better.shape

(25, 7)

In [27]:
triple_better

Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true,triple_win_sum,single_win_sum
166,0.207608,0,0.969881,1,1,3.0,1.0
1002,0.000352,0,0.526612,1,1,3.0,0.0
1161,0.018975,0,0.955341,1,1,3.0,2.0
1648,0.133172,0,0.971643,1,1,3.0,2.0
1649,0.01059,0,0.934033,1,1,3.0,1.0
1655,0.091058,0,0.981753,1,1,3.0,2.0
1656,0.058254,0,0.78466,1,1,3.0,1.0
1657,0.097067,0,0.925529,1,1,3.0,0.0
1658,0.143769,0,0.924846,1,1,3.0,0.0
1659,0.157807,0,0.951466,1,1,3.0,0.0


In [43]:
single_better = df.query("single_pred == y_true and single_win_sum >=3 and triple_pred != y_true")
single_better.shape

(1, 7)

In [44]:
single_better

Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true,triple_win_sum,single_win_sum
1166,0.889311,1,0.481416,0,1,2.0,3.0


In [62]:
single_worst = df.query("single_pred != y_true and triple_pred == y_true")
single_worst_sorted = single_worst.sort_values(by='single_prob')
print(single_worst_sorted.shape)
single_worst_sorted.head(10)

(310, 7)


Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true,triple_win_sum,single_win_sum
1660,0.000138,0,0.990266,1,1,3.0,0.0
1002,0.000352,0,0.526612,1,1,3.0,0.0
1446,0.000369,0,0.546458,1,1,1.0,1.0
3266,0.001076,0,0.509779,1,1,3.0,1.0
2269,0.001818,0,0.999777,1,1,3.0,1.0
2630,0.008648,0,0.736205,1,1,2.0,1.0
1649,0.01059,0,0.934033,1,1,3.0,1.0
3264,0.011642,0,0.55521,1,1,3.0,1.0
2270,0.01486,0,0.634572,1,1,3.0,0.0
2977,0.014873,0,0.588127,1,1,2.0,1.0


In [75]:
triple_worst = df.query("triple_pred != y_true and single_pred == y_true")
triple_worst_sorted = triple_worst.sort_values(by='triple_prob')
print(triple_worst_sorted.shape)
triple_worst_sorted.head(10)

(250, 7)


Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true,triple_win_sum,single_win_sum
1685,0.839649,1,0.000751,0,1,0.0,1.0
3979,0.555472,1,0.007083,0,1,0.0,1.0
1748,0.883547,1,0.017706,0,1,0.0,2.0
2616,0.573211,1,0.017986,0,1,0.0,2.0
3375,0.87686,1,0.023251,0,1,0.0,2.0
2613,0.872341,1,0.031472,0,1,0.0,1.0
1692,0.529224,1,0.034641,0,1,0.0,1.0
3373,0.551937,1,0.046067,0,1,0.0,1.0
4126,0.693153,1,0.078751,0,1,0.0,1.0
2627,0.745591,1,0.079431,0,1,0.0,1.0


In [102]:
triple_worst = df.query("triple_pred != y_true and single_pred != y_true")
triple_worst_sorted = triple_worst.sort_values(by='single_prob')
print(triple_worst_sorted.shape)
triple_worst_sorted.head(10)

(438, 7)


Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true,triple_win_sum,single_win_sum
1701,3e-06,0,0.001827,0,1,0.0,0.0
1670,4e-06,0,0.000652,0,1,0.0,0.0
3977,5e-06,0,0.000185,0,1,0.0,0.0
1683,6e-06,0,0.01488,0,1,0.0,0.0
1678,1.1e-05,0,0.000644,0,1,0.0,0.0
1671,1.1e-05,0,0.003562,0,1,0.0,0.0
1751,1.5e-05,0,0.000507,0,1,0.0,0.0
1666,2.6e-05,0,0.042088,0,1,1.0,1.0
1714,3e-05,0,0.067913,0,1,0.0,0.0
1442,3.6e-05,0,0.164584,0,1,1.0,0.0


In [110]:
triple_better = df.query("triple_pred == y_true and triple_win_sum >=3 and single_pred != y_true")
triple_better_sorted = triple_better.sort_values(by='triple_prob', ascending=False)
print(triple_better_sorted.shape)
triple_better_sorted.head(10)

(25, 7)


Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true,triple_win_sum,single_win_sum
2269,0.001818,0,0.999777,1,1,3.0,1.0
3267,0.243609,0,0.994873,1,1,3.0,1.0
1663,0.430688,0,0.99388,1,1,3.0,1.0
3839,0.445508,0,0.992016,1,1,3.0,1.0
1660,0.000138,0,0.990266,1,1,3.0,0.0
1655,0.091058,0,0.981753,1,1,3.0,2.0
1661,0.038954,0,0.973036,1,1,3.0,0.0
1648,0.133172,0,0.971643,1,1,3.0,2.0
166,0.207608,0,0.969881,1,1,3.0,1.0
3848,0.042187,0,0.960272,1,1,3.0,1.0


In [119]:
single_better = df.query("single_pred == y_true and single_win_sum >=3 and triple_pred != y_true")
single_better_sorted = single_better.sort_values(by='single_prob', ascending=False)
print(single_better_sorted.shape)
single_better_sorted.head(10)

(1, 7)


Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true,triple_win_sum,single_win_sum
1166,0.889311,1,0.481416,0,1,2.0,3.0


In [129]:
both_confused = df.query(" 0.4 <= single_prob <= 0.6 and 0.4 <= triple_prob <= 0.6 and (single_pred != y_true and triple_pred != y_true)")
both_confused_sorted = both_confused.sort_values(by='triple_prob')
print(both_confused_sorted.shape)
both_confused_sorted.head(10)

(3, 7)


Unnamed: 0,single_prob,single_pred,triple_prob,triple_pred,y_true,triple_win_sum,single_win_sum
3853,0.581041,1,0.52983,1,0,1.0,2.0
4565,0.539185,1,0.556789,1,0,1.0,1.0
2553,0.578937,1,0.569448,1,0,1.0,2.0
