# Initial Imports

In [None]:

from google.colab import drive
import os
import pandas as pd
import numpy as np

# Mount Google Drive inside the Colab VM and make the assignment folder the
# working directory. The cells visible below use absolute Drive paths, so they
# rely on the mount but not on the chdir.
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/CA684_Assignment/')



Mounted at /content/drive/


# Defining Functions

In [None]:
# define all the functions in this section 

def vname2ID(vnames):
    """Extract the id part of each video file name.

    vnames: iterable of file names such as 'video10.webm'.
    Returns a list of strings: the extension is dropped and then the first
    five characters of what remains are cut off (the 'video' prefix in the
    names used by this notebook).
    """
    vid = []
    for vn in vnames:
        stem, _ext = os.path.splitext(vn)
        vid.append(stem[5:])
    return vid

def read_C3D(fname): # can be used to read in the lbp files as well
    """Read a whitespace-separated vector of floats from a text file.

    fname: path of the feature file.
    Returns a list of floats. Blank lines are ignored; if the file holds
    several non-blank lines the last one is returned (unchanged behaviour).
    An empty file yields an empty list instead of raising UnboundLocalError.
    """
    C3D = []
    with open(fname) as f:
        for line in f:
            items = line.split()  # default separator: any run of whitespace
            if not items:
                # a trailing blank line must not wipe out the parsed vector
                continue
            C3D = [float(item) for item in items]
    return C3D

def read_HMP(fname, n_bins=6075):
    """Scan HMP (Histogram of Motion Patterns) features from file.

    fname:  path of a file made of 'bin:value' pairs separated by whitespace.
    n_bins: length of the dense output vector (6075 by default).
    Returns a numpy array of length n_bins; bins not listed in the file are 0.
    Bin ids are treated as 1-based (bin k is stored at position k-1).
    Blank lines are ignored; if several non-blank lines exist the last one is
    used (unchanged behaviour). An empty file yields an all-zero vector
    instead of raising UnboundLocalError.
    """
    HMP_temp = {}
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            if not pairs:
                # a trailing blank line must not discard the parsed histogram
                continue
            HMP_temp = {}
            for p in pairs:
                fields = p.split(':')
                HMP_temp[int(fields[0])] = float(fields[1])
    # the file is sparse: start from zeros and fill in the listed bins
    HMP = np.zeros(n_bins)
    for idx, value in HMP_temp.items():
        HMP[idx-1] = value
    return HMP

def read_ColorHistogram(fname):
    """Read a colour histogram file into a dense (3, 256) array.

    Each line of the file fills one row of the result, in file order; a line
    is a whitespace-separated list of 'bin:value' pairs and the bin number is
    used directly as the column index. Bins that are not listed stay 0.
    """
    RGB_Hist = np.zeros((3, 256))
    with open(fname) as f:
        for row, line in enumerate(f):
            # parse the whole line first, then write it into the matrix
            parsed = [p.split(':') for p in line.split()]
            entries = [(int(fields[0]), float(fields[1])) for fields in parsed]
            for col, value in entries:
                RGB_Hist[row, col] = value
    return RGB_Hist

def read_caps(fname):
    """Load a captions file into a two-column dataframe.

    Every line is split on whitespace; the first token goes to the 'video'
    column and the second token to the 'caption' column (any further tokens
    on the line are ignored).
    """
    vn, cap = [], []
    df = pd.DataFrame()
    with open(fname) as f:
        for fields in map(str.split, f):
            vn.append(fields[0])
            cap.append(fields[1])
        df['video'] = vn
        df['caption'] = cap
    return df

def Get_score(Y_pred,Y_true):
    '''Print Spearman's rank correlation between predictions and targets.

    Y_pred, Y_true: array-likes of the same shape (after squeezing out
    length-1 axes). For 1-D input one coefficient is printed; for 2-D input
    one coefficient is printed per column, in column order.
    Nothing is returned. A shape mismatch is reported with a printed message.
    '''
    Y_pred = np.squeeze(Y_pred)
    Y_true = np.squeeze(Y_true)
    if Y_pred.shape != Y_true.shape:
        print('Input shapes don\'t match!')
    else:
        if len(Y_pred.shape) == 1:
            Res = pd.DataFrame({'Y_true':Y_true,'Y_pred':Y_pred})
            score_mat = Res[['Y_true','Y_pred']].corr(method='spearman',min_periods=1)
            # .iloc[1, 0] is the (Y_pred, Y_true) cell of the 2x2 matrix; the former
            # .iloc[1][0] used an integer key positionally on a label-indexed Series,
            # which newer pandas deprecates.
            print('The Spearman\'s correlation coefficient is: %.3f' % score_mat.iloc[1, 0])
        else:
            # multi-target case: score every column separately
            for ii in range(Y_pred.shape[1]):
                Get_score(Y_pred[:,ii],Y_true[:,ii])


# Read in Files

## Ground Truth

In [None]:
# Dev-set ground truth: keep the video key and the two memorability scores.
groundTruthDF = pd.read_csv('/content/drive/My Drive/CA684_Assignment/Dev-set/Ground-truth/ground-truth.csv')
# keep only the part of the file name before the first '.', so the key matches the feature tables
groundTruthDF['video'] = groundTruthDF['video'].map(lambda name: name.split('.')[0])
# the annotation counts are not used as targets
groundTruthDF = groundTruthDF.drop(['nb_short-term_annotations','nb_long-term_annotations'], axis=1)
groundTruthDF.head()


Unnamed: 0,video,short-term_memorability,long-term_memorability
0,video3,0.924,0.846
1,video4,0.923,0.667
2,video6,0.863,0.7
3,video8,0.922,0.818
4,video10,0.95,0.9


## C3D & HMP

In [None]:
# C3D: one vector per file, keyed by file name (files visited in sorted order)
C3D_List = {}
for filename in sorted(os.listdir('/content/drive/My Drive/CA684_Assignment/Dev-set/C3D')):
    C3D_List[filename] = read_C3D('/content/drive/My Drive/CA684_Assignment/Dev-set/C3D/' + filename)

C3D_FeatureVector = pd.DataFrame(list(C3D_List.items()), columns=['video', 'C3DValues'])
C3D_FeatureVector.head()  # not rendered: only the last expression of a cell is displayed


# HMP: sparse 'bin:value' files expanded to dense vectors by read_HMP
HMP_List = {}
for filename in sorted(os.listdir('/content/drive/My Drive/CA684_Assignment/Dev-set/HMP')):
    HMP_List[filename] = read_HMP('/content/drive/My Drive/CA684_Assignment/Dev-set/HMP/' + filename)

HMP_FeatureVector = pd.DataFrame(list(HMP_List.items()), columns=['video', 'HMPValues'])
HMP_FeatureVector.head()



Unnamed: 0,video,HMPValues
0,video10.txt,"[0.005026, 0.001356, 5.5e-05, 0.0, 0.000665, 2..."
1,video100.txt,"[0.019473, 0.005004, 9.8e-05, 0.0, 0.001991, 3..."
2,video1001.txt,"[0.00725, 0.002722, 5e-05, 0.0, 0.00107, 5.5e-..."
3,video1003.txt,"[0.071224, 0.011862, 0.00032, 0.0, 0.00718, 0...."
4,video1004.txt,"[0.053318, 0.009491, 0.000454, 0.0, 0.004754, ..."


## LBP

In [None]:
# LBP: several files per video ('video<id>-<n>.txt'); they use the same
# whitespace-separated layout as C3D, so read_C3D is reused to parse them.
LBP_List = {}
for filename in sorted(os.listdir('/content/drive/My Drive/CA684_Assignment/Dev-set/LBP')):
    LBP_List[filename] = read_C3D('/content/drive/My Drive/CA684_Assignment/Dev-set/LBP/' + filename)

LBP_FeatureVector = pd.DataFrame(list(LBP_List.items()), columns=['video', 'LBPValues'])
LBP_FeatureVector.head()

Unnamed: 0,video,LBPValues
0,video10-0.txt,"[0.02525029, 0.00485774, 0.00489198, 0.0038623..."
1,video10-112.txt,"[0.02204427, 0.00510851, 0.00511767, 0.0041420..."
2,video10-56.txt,"[0.02373216, 0.00531973, 0.00533999, 0.0046180..."
3,video100-0.txt,"[0.01105228, 0.00367236, 0.00343943, 0.0023119..."
4,video100-112.txt,"[0.00998071, 0.00322242, 0.00303434, 0.0024368..."


In [None]:
# Collapse the per-file LBP vectors ('video10-0.txt', 'video10-56.txt', ...)
# into a single summed vector per video.
l = LBP_FeatureVector.copy()
l['video'] = l['video'].apply(lambda x:x.split("-")[0]) # keep the video key: 'video10-56.txt' -> 'video10'

l['lenVals'] = l['LBPValues'].apply(len) # length of every vector
maxlen = l['lenVals'].max() # longest vector in this set

# right-pad shorter vectors with zeros so that every vector has length maxlen
l['LBPValues'] = l['LBPValues'].apply(lambda x : np.pad(x,(0, maxlen - len(x))) if len(x) < maxlen else x )


uniqueVidNames = l['video'].unique() # unique video keys, in order of first appearance

# Sum the vectors of each video. groupby(sort=False) keeps the same first-appearance
# order as unique() and avoids re-filtering the whole frame once per video; the
# accumulator is sized from maxlen instead of a hard-coded 122.
GroupedVideosLBP = {}
for name, vectors in l.groupby('video', sort=False)['LBPValues']:
  result = np.zeros(maxlen)
  for values in vectors:
    result += np.array(values)  # add up vals
  GroupedVideosLBP[name] = result


GroupedLBP = pd.DataFrame(GroupedVideosLBP.items(),columns=['video', 'LBPValues'])

GroupedLBP.head()




Unnamed: 0,video,LBPValues
0,video10,"[0.07102672, 0.01528598, 0.015349640000000001,..."
1,video100,"[0.031912140000000006, 0.01021509, 0.00962288,..."
2,video1001,"[0.04570203, 0.014482430000000001, 0.01225585,..."
3,video1003,"[0.02102672, 0.0044164700000000005, 0.0038016,..."
4,video1004,"[0.03755113, 0.00602624, 0.00574605, 0.0051658..."


# Combining features

## Combine C3D, HMP and LBP with the Ground Truth

In [None]:

# Build one table per video: C3D + HMP + summed LBP + ground-truth scores.
C3D_HMP_Combined = C3D_FeatureVector.merge(HMP_FeatureVector,on='video') # join C3D and HMP on file name ('videoN.txt')
# Strip the extension from the merged frame's own keys. Reading the names from
# C3D_FeatureVector instead is only correct when the merge kept every C3D row
# in its original order, because the assignment aligns on the row index.
C3D_HMP_Combined['video'] = C3D_HMP_Combined['video'].apply(lambda x: x.split('.')[0])

CombinedFeatures = C3D_HMP_Combined.merge(GroupedLBP,on='video') # join with LBP

GroundCombined = CombinedFeatures.merge(groundTruthDF,on='video') # attach the ground-truth scores on video name
GroundCombined.head()


Unnamed: 0,video,C3DValues,HMPValues,LBPValues,short-term_memorability,long-term_memorability
0,video10,"[9.006e-05, 0.00061494, 0.00343634, 0.00128092...","[0.005026, 0.001356, 5.5e-05, 0.0, 0.000665, 2...","[0.07102672, 0.01528598, 0.015349640000000001,...",0.95,0.9
1,video100,"[0.00231174, 0.00996551, 0.02743902, 5.85e-06,...","[0.019473, 0.005004, 9.8e-05, 0.0, 0.001991, 3...","[0.031912140000000006, 0.01021509, 0.00962288,...",0.951,0.889
2,video1001,"[0.46803489, 0.4895013, 7.944e-05, 9.68e-06, 2...","[0.00725, 0.002722, 5e-05, 0.0, 0.00107, 5.5e-...","[0.04570203, 0.014482430000000001, 0.01225585,...",0.899,1.0
3,video1003,"[0.01403566, 0.00049223, 0.00023311, 2.793e-05...","[0.071224, 0.011862, 0.00032, 0.0, 0.00718, 0....","[0.02102672, 0.0044164700000000005, 0.0038016,...",0.867,0.846
4,video1004,"[0.04673432, 0.00086782, 5.25e-06, 8.05e-06, 2...","[0.053318, 0.009491, 0.000454, 0.0, 0.004754, ...","[0.03755113, 0.00602624, 0.00574605, 0.0051658...",0.753,0.571


## Merge the C3D, HMP and LBP Vectors into One Feature Vector

In [None]:
featuresCopy = GroundCombined[['C3DValues','HMPValues','LBPValues']].copy() # work on a copy of the three feature columns

# HMP is the longest vector; the other two are right-padded with zeros up to its
# length. The pad width is derived from the data rather than hard-coded
# (previously 5974 and 5953), so it stays correct if a vector length changes.
hmp_len = featuresCopy['HMPValues'].apply(len).max()

featuresCopy['C3DValues'] = featuresCopy['C3DValues'].apply(lambda x: np.pad(x,(0,hmp_len - len(x)),'constant'))
featuresCopy['LBPValues'] = featuresCopy['LBPValues'].apply(lambda x: np.pad(x,(0,hmp_len - len(x)),'constant'))

# NOTE(review): the three vectors are added element-wise, so unrelated bins of
# different descriptors are summed into the same position. Concatenating them is
# the more usual way to combine features; if that is changed, the test-set cell
# must be changed in the same way so both sets have the same feature layout.
featuresCopy['Combined'] = featuresCopy['C3DValues'] + featuresCopy['HMPValues'] + featuresCopy['LBPValues']
featuresCopy['Combined'].head()

0    [0.07614278000000001, 0.01725692, 0.01884098, ...
1    [0.05369688, 0.0251846, 0.0371599, 0.00738432,...
2    [0.5209869199999999, 0.50670573, 0.01238529, 0...
3    [0.10628638, 0.0167707, 0.00435471, 0.00289734...
4    [0.13760345000000002, 0.01638506, 0.0062052999...
Name: Combined, dtype: object

# Split Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import utils

# Features and targets come from row-aligned frames (featuresCopy was copied
# from GroundCombined), so position i in X matches row i in y.
X = list(featuresCopy['Combined'])
y = GroundCombined[['short-term_memorability','long-term_memorability']].values

# hold out 20% of the dev set; fixed seed for a repeatable split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)


# Create and Train model

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor


# Random forest with a small, fixed configuration (seeded for repeatability);
# verbose=2 logs the construction of every tree.
rf = RandomForestRegressor(
    n_estimators=20,
    max_depth=10,
    random_state=50,
    verbose=2,
)

# fit on the training part of the dev set (both targets at once)
rf.fit(X_train, Y_train)

# predict the held-out part
predictions = rf.predict(X_test)

# prints one Spearman coefficient per target column (short-term, then long-term)
Get_score(predictions, Y_test)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 20


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.0s remaining:    0.0s


building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20
The Spearman's correlation coefficient is: 0.282
The Spearman's correlation coefficient is: 0.131


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  3.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.0s finished


## Save model

In [None]:
import pickle
# Persist the fitted forest to Drive as a pickle file.
with open("/content/drive/My Drive/rf_model.pkl", "wb") as f:
 pickle.dump(rf, f)

## Load Model

In [None]:
# Reload the forest saved above into `model`.
# NOTE(review): pickle.load can execute arbitrary code; only load files you wrote yourself.
with open("/content/drive/My Drive/rf_model.pkl", "rb") as f:
 model = pickle.load(f)

# Test Set 

## Process Test-set data


In [None]:
# Load the test-set template; its two memorability columns are filled in with
# predictions further down (they appear empty in the preview below).
TestGroundTruthDF = pd.read_csv('/content/drive/My Drive/CA684_Assignment/Test-set/Ground-truth_test/ground_truth_template.csv')
TestGroundTruthDF.head()


Unnamed: 0,video,short-term_memorability,nb_short-term_annotations,long-term_memorability,nb_long-term_annotations
0,7494,,33,,12
1,7495,,34,,10
2,7496,,32,,13
3,7497,,33,,10
4,7498,,33,,10


In [None]:
# Test-set C3D: one vector per file, keyed by file name (sorted order)
Test_C3D_List = {}
for filename in sorted(os.listdir('/content/drive/My Drive/CA684_Assignment/Test-set/C3D_test')):
    Test_C3D_List[filename] = read_C3D('/content/drive/My Drive/CA684_Assignment/Test-set/C3D_test/' + filename)

Test_C3D_FeatureVector = pd.DataFrame(list(Test_C3D_List.items()), columns=['video', 'C3DValues'])
Test_C3D_FeatureVector.head()  # not rendered: only the last expression of a cell is displayed


# Test-set HMP: sparse 'bin:value' files expanded to dense vectors by read_HMP
Test_HMP_List = {}
for filename in sorted(os.listdir('/content/drive/My Drive/CA684_Assignment/Test-set/HMP_test')):
    Test_HMP_List[filename] = read_HMP('/content/drive/My Drive/CA684_Assignment/Test-set/HMP_test/' + filename)

Test_HMP_FeatureVector = pd.DataFrame(list(Test_HMP_List.items()), columns=['video', 'HMPValues'])
Test_HMP_FeatureVector.head()


Unnamed: 0,video,HMPValues
0,video10000.txt,"[0.127454, 0.020965, 0.000221, 0.0, 0.009964, ..."
1,video10001.txt,"[0.015911, 0.003141, 0.000124, 0.0, 0.001083, ..."
2,video10002.txt,"[0.041081, 0.013914, 0.000198, 0.0, 0.002648, ..."
3,video10003.txt,"[0.047053, 0.009497, 0.000137, 0.0, 0.00867, 9..."
4,video10004.txt,"[0.003161, 0.00033, 0.0, 0.0, 0.000189, 0.0, 0..."


In [None]:
# Test-set LBP: several files per video ('video<id>-<n>.txt'), parsed with
# read_C3D because they use the same whitespace-separated layout.
Test_LBP_List = {}
for filename in sorted(os.listdir('/content/drive/My Drive/CA684_Assignment/Test-set/LBP_test')):
    Test_LBP_List[filename] = read_C3D('/content/drive/My Drive/CA684_Assignment/Test-set/LBP_test/' + filename)

Test_LBP_FeatureVector = pd.DataFrame(list(Test_LBP_List.items()), columns=['video', 'LBPValues'])
Test_LBP_FeatureVector.head()

Unnamed: 0,video,LBPValues
0,video10000-0.txt,"[0.01514323, 0.00311343, 0.00233989, 0.0014096..."
1,video10000-112.txt,"[0.01459828, 0.00366271, 0.00257813, 0.0016859..."
2,video10000-56.txt,"[0.0160055, 0.00362847, 0.00271557, 0.00198544..."
3,video10001-0.txt,"[0.01513503, 0.00442515, 0.00394483, 0.0028785..."
4,video10001-112.txt,"[0.0158941, 0.00302421, 0.0033912, 0.0032065, ..."


In [None]:
# Collapse the per-file test LBP vectors ('video10000-0.txt', ...) into a single
# summed vector per video.
l_Test = Test_LBP_FeatureVector.copy()
l_Test['video'] = l_Test['video'].apply(lambda x:x.split("-")[0]) # keep the video key: 'video10000-56.txt' -> 'video10000'

l_Test['lenVals'] = l_Test['LBPValues'].apply(len) # length of every vector
maxlen = l_Test['lenVals'].max() # longest vector in this set

# right-pad shorter vectors with zeros so that every vector has length maxlen
l_Test['LBPValues'] = l_Test['LBPValues'].apply(lambda x : np.pad(x,(0, maxlen - len(x))) if len(x) < maxlen else x )


uniqueVidNamesTest = l_Test['video'].unique() # unique video keys, in order of first appearance

# Sum the vectors of each video. groupby(sort=False) keeps the same first-appearance
# order as unique() and avoids re-filtering the whole frame once per video; the
# accumulator is sized from maxlen instead of a hard-coded 122.
GroupedVideosLBPTest = {}
for name, vectors in l_Test.groupby('video', sort=False)['LBPValues']:
  result = np.zeros(maxlen)
  for values in vectors:
    result += np.array(values)  # add up vals
  GroupedVideosLBPTest[name] = result


GroupedLBPTest = pd.DataFrame(GroupedVideosLBPTest.items(),columns=['video', 'LBPValues'])

GroupedLBPTest.head()


Unnamed: 0,video,LBPValues
0,video10000,"[0.04574701, 0.01040461, 0.007633590000000001,..."
1,video10001,"[0.04634163, 0.0107364, 0.010868530000000001, ..."
2,video10002,"[0.03803337, 0.00661507, 0.00577016, 0.0042202..."
3,video10003,"[0.04255836, 0.015097899999999997, 0.010598469..."
4,video10004,"[0.03731337, 0.0065181300000000005, 0.00695457..."


In [None]:
# Build one test-set table per video: C3D + HMP + summed LBP.
C3D_HMP_CombinedTest = Test_C3D_FeatureVector.merge(Test_HMP_FeatureVector,on='video') # join C3D and HMP on file name ('videoN.txt')
# Strip the extension from the merged frame's own keys. Reading the names from
# Test_C3D_FeatureVector instead is only correct when the merge kept every C3D
# row in its original order, because the assignment aligns on the row index.
C3D_HMP_CombinedTest['video'] = C3D_HMP_CombinedTest['video'].apply(lambda x: x.split('.')[0])

TestCombinedFeatures = C3D_HMP_CombinedTest.merge(GroupedLBPTest,on='video') # join with LBP
TestCombinedFeatures.head()

Unnamed: 0,video,C3DValues,HMPValues,LBPValues
0,video10000,"[0.01793277, 0.0177311, 0.0032704, 3.46e-06, 1...","[0.127454, 0.020965, 0.000221, 0.0, 0.009964, ...","[0.04574701, 0.01040461, 0.007633590000000001,..."
1,video10001,"[0.02396697, 0.00180796, 1.783e-05, 6e-08, 1e-...","[0.015911, 0.003141, 0.000124, 0.0, 0.001083, ...","[0.04634163, 0.0107364, 0.010868530000000001, ..."
2,video10002,"[0.01869615, 0.86258429, 5e-07, 1.12e-06, 1.46...","[0.041081, 0.013914, 0.000198, 0.0, 0.002648, ...","[0.03803337, 0.00661507, 0.00577016, 0.0042202..."
3,video10003,"[0.00055588, 0.00024811, 0.24000312, 3.271e-05...","[0.047053, 0.009497, 0.000137, 0.0, 0.00867, 9...","[0.04255836, 0.015097899999999997, 0.010598469..."
4,video10004,"[0.05058656, 0.00623968, 0.11702564, 0.0005364...","[0.003161, 0.00033, 0.0, 0.0, 0.000189, 0.0, 0...","[0.03731337, 0.0065181300000000005, 0.00695457..."


In [None]:
TestfeaturesCopy = TestCombinedFeatures[['C3DValues','HMPValues','LBPValues']].copy() # work on a copy of the three feature columns

# HMP is the longest vector; the other two are right-padded with zeros up to its
# length. The pad width is derived from the data rather than hard-coded
# (previously 5974 and 5953), so it stays correct if a vector length changes.
hmp_len_test = TestfeaturesCopy['HMPValues'].apply(len).max()

TestfeaturesCopy['C3DValues'] = TestfeaturesCopy['C3DValues'].apply(lambda x: np.pad(x,(0,hmp_len_test - len(x)),'constant'))
TestfeaturesCopy['LBPValues'] = TestfeaturesCopy['LBPValues'].apply(lambda x: np.pad(x,(0,hmp_len_test - len(x)),'constant'))

# NOTE(review): the three vectors are added element-wise, exactly as for the
# dev set. The layout of this vector must stay identical to the one the model
# was trained on, so change both cells together or not at all.
TestfeaturesCopy['Combined'] = TestfeaturesCopy['C3DValues'] + TestfeaturesCopy['HMPValues'] + TestfeaturesCopy['LBPValues']
TestfeaturesCopy['Combined'].head()

0    [0.19113378, 0.04910071, 0.011124990000000001,...
1    [0.08621960000000001, 0.015685360000000002, 0....
2    [0.09781052, 0.88311336, 0.00596866, 0.0042213...
3    [0.09016724000000001, 0.02484301, 0.25073859, ...
4    [0.09106093, 0.01308781, 0.12398021000000001, ...
Name: Combined, dtype: object

## Generate Predictions for Test-set


In [None]:

# Final training data = the whole dev set; prediction input = the test set.
Test_X_train = featuresCopy['Combined'].tolist() # Dev-set features, in GroundCombined row order
Test_X_test  = TestfeaturesCopy['Combined'].tolist() # Test-set features, in TestCombinedFeatures row order
# The targets must come from GroundCombined, the frame featuresCopy was copied
# from. groundTruthDF is in CSV order (video3, video4, ...) while the features
# follow the merged order (video10, video100, ...), so taking the labels from
# groundTruthDF pairs every feature vector with another video's scores.
Test_Y_train =  GroundCombined[['short-term_memorability','long-term_memorability']].values
Test_Y_test  = [] # no ground truth is available for the test set


In [None]:
# Refit the reloaded forest, this time on the training matrices built in the previous cell.
model.fit(Test_X_train,Test_Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 20


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.8s remaining:    0.0s


building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  4.2min finished


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=50, verbose=2, warm_start=False)

In [None]:
# Predict both targets for every test-set row; each output row holds two
# values, in the column order of the training targets (short-term, long-term).
Predictions = model.predict(Test_X_test)
print(Predictions)

[[0.85743188 0.76046372]
 [0.86306091 0.79709461]
 [0.86303327 0.79175308]
 ...
 [0.84016385 0.74860854]
 [0.8097717  0.70696757]
 [0.86662967 0.78026018]]


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.0s finished


In [None]:
# Wrap the raw prediction array in a dataframe with named columns.
# Rows are in the same order as Test_X_test.
PredictionsDf = pd.DataFrame(Predictions,columns=['Short Term scores','Long Term scores'])
PredictionsDf.head()

Unnamed: 0,Short Term scores,Long Term scores
0,0.857432,0.760464
1,0.863061,0.797095
2,0.863033,0.791753
3,0.855796,0.785355
4,0.861224,0.759943


In [None]:
# Write the predictions into the template BY VIDEO ID, not by row position.
# Prediction rows follow TestCombinedFeatures (video10000, video10001, ...)
# whereas the template lists its videos in its own order (7494, 7495, ...), so
# a positional assignment gives most videos another video's scores.
pred_video_ids = TestCombinedFeatures['video'].str[len('video'):].to_numpy() # 'video10000' -> '10000'
short_by_id = pd.Series(PredictionsDf['Short Term scores'].to_numpy(), index=pred_video_ids)
long_by_id = pd.Series(PredictionsDf['Long Term scores'].to_numpy(), index=pred_video_ids)

# compare ids as strings so the lookup works whether the template stores them as int or str
template_ids = TestGroundTruthDF['video'].astype(str)
TestGroundTruthDF['short-term_memorability'] = template_ids.map(short_by_id)
TestGroundTruthDF['long-term_memorability'] = template_ids.map(long_by_id)

# every template video must have received a prediction
assert TestGroundTruthDF[['short-term_memorability','long-term_memorability']].notna().all().all(), \
    'some template videos have no matching prediction - check the video id format'

In [None]:
# Preview the template now that the two memorability columns are filled in.
TestGroundTruthDF.head()

Unnamed: 0,video,short-term_memorability,nb_short-term_annotations,long-term_memorability,nb_long-term_annotations
0,7494,0.857432,33,0.760464,12
1,7495,0.863061,34,0.797095,10
2,7496,0.863033,32,0.791753,13
3,7497,0.855796,33,0.785355,10
4,7498,0.861224,33,0.759943,10


In [None]:
# Export the filled-in template to Drive.
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default; pass index=False if the output must have exactly the template's columns - confirm.
TestGroundTruthDF.to_csv('/content/drive/My Drive/Matthew_Nolan_16425716_predictions.csv')