## Connect to file system

To run the scrpt as it is, you need to create following folders in the home directory of google drive. 


*   /content/drive/My Drive/CA684_Assignment - shortcut to resource folderr
*   /content/drive/My Drive/upwork/
*   /content/drive/My Drive/upwork/test







In [1]:
from google.colab import drive
import os
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Import packages

In [2]:
!pip install pyprind



In [3]:
!pip install tqdm



In [0]:
import pandas as pd

from tensorflow.python.keras import Sequential
from tensorflow.python.keras import layers
from tensorflow.python.keras import regularizers
from tensorflow.python.keras.preprocessing.text import Tokenizer

import numpy as np
from string import punctuation
import pyprind
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
import lightgbm as lgb
from sklearn import preprocessing


# 1. Loading ground truth

In [0]:
# load ground truth labels
label_path = './CA684_Assignment/Dev-set/Ground-truth/'
labels=pd.read_csv(label_path+'ground-truth.csv')

# 2. Loading data

This section contains all the functions to extract features from the goodle drive as pandas dataframes. To reduce the loading time, dataframes are saved seperately on the gdrive. 

### 2.1 Captions

In [0]:
# load labels and captions
def read_caps(fname):
    """Load the captions into a dataframe"""
    vn = []
    cap = []
    df = pd.DataFrame();
    with open(fname) as f:
        for line in f:
            pairs = line.split()
            vn.append(pairs[0])
            cap.append(pairs[1])
        df['video']=vn
        df['caption']=cap
    return df

In [0]:
# Common tokenizer,  is trained from the text in training set
# tokenizer used for validation set and test set
cap_path = './CA684_Assignment/Dev-set/Captions/dev-set_video-captions.txt'
df_cap=read_caps(cap_path)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(df_cap.caption.values))

In [0]:

def get_features_captions(pkl_dir, raw_dir):
  '''
  If caption features are saved before return the dataframe
  else read text file, save as a dataframe and return it
  '''
  try:
    features_captions = pd.read_pickle(pkl_dir)
    return features_captions
  except:
    cap_path = raw_dir
    df_cap=read_caps(cap_path)

    captions_one_hot_res = tokenizer.texts_to_matrix(list(df_cap.caption.values),mode='binary')
    features_captions = pd.DataFrame(np.concatenate((df_cap.video.values.reshape((-1,1)), captions_one_hot_res), axis=1))
    features_captions.columns = ['video'] + ['caption_token_{}'.format(i) for i in range(captions_one_hot_res.shape[1])]
    features_captions.to_pickle(pkl_dir)
    return features_captions

  


### 2.2 Inception features

In [0]:
def parse_inception_feature(s):
    pairs = s.strip().split(' ')
    pairs = [i.split(':') for i in pairs]
    return {int(k): float(v) for k, v in pairs}

def expand_inception_feature(d):
  feature = np.zeros(1000)
  for k, v in d.items():
    feature[k] = v
  return feature


In [0]:
def get_inception_features_layer(layer_num, pkl_dir, raw_dir):
  '''
  If inception features are saved before return the dataframe
  else read text file, save as a dataframe and return it
  layer num - the frame number features needed
  '''
  try:
    features_inception_0 = pd.read_pickle(pkl_dir.format(layer_num))
    return features_inception_0
  except:
    inception_features_dict = []
    inception_path = Path(raw_dir)
    for file in tqdm(list(inception_path.glob('*-{}.txt'.format(layer_num)))):
      # only process first frame of each video
      key = file.with_suffix('.webm').name.replace('-{}'.format(layer_num), '')
      inception_features_dict.append([key] + expand_inception_feature(
        parse_inception_feature(
          file.open().read())).tolist())

    features_inception_0 = pd.DataFrame(inception_features_dict)
    features_inception_0.columns = ['video'] + ['inception_{}_{}'.format(layer_num, i) for i in range(1000)]
    features_inception_0.to_pickle(pkl_dir.format(layer_num))

### 2.3 C3D features

In [0]:
def read_C3D(fname):
    """Scan vectors from file"""
    with open(fname) as f:
        for line in f:
            C3D =[float(item) for item in line.split()] # convert to float type, using default separator
    return C3D

In [0]:
def get_c3d_features(pkl_dir, raw_dir):
  '''
  If c3d features are saved before return the dataframe
  else read text file, save as a dataframe and return it
  '''
  try:
    features_c3d = pd.read_pickle(pkl_dir)
    return features_c3d
  except:
    c3d_features_dict = []
    c3d_path = Path(raw_dir)
    for file in tqdm(list(c3d_path.glob('*.txt'))):
      # only process first frame of each video
      key = file.with_suffix('.webm').name
      c3d_features_dict.append([key] + read_C3D(file))

    features_c3d = pd.DataFrame(c3d_features_dict)
    features_c3d.columns = ['video'] + ['c3d_{}'.format(i) for i in range(101)]
    features_c3d.to_pickle(pkl_dir)
    return features_c3d


### 2.4 HMP features

In [0]:
def read_HMP(fname):
    """Scan HMP(Histogram of Motion Patterns) features from file"""
    with open(fname) as f:
        for line in f:
            pairs=line.split()
            HMP_temp = { int(p.split(':')[0]) : float(p.split(':')[1]) for p in pairs}
    # there are 6075 bins, fill zeros
    HMP = np.zeros(6075)
    for idx in HMP_temp.keys():
        HMP[idx-1] = HMP_temp[idx]            
    return HMP

In [0]:
def get_hmp_features(pkl_dir, raw_dir):
  '''
  If hmp features are saved before return the dataframe
  else read text file, save as a dataframe and return it
  '''
  try:
    features_hmp = pd.read_pickle(pkl_dir)
    return features_hmp
  except:
    hmp_features_dict = []
    hmp_path = Path(raw_dir)
    for file in tqdm(list(hmp_path.glob('*.txt'))):
      # only process first frame of each video
      key = file.with_suffix('.webm').name
      hmp_features_dict.append([key] + read_HMP(file).tolist())

    features_hmp = pd.DataFrame(hmp_features_dict)
    features_hmp.columns = ['video'] + ['hmp_{}'.format(i) for i in range(6075)]
    features_hmp.to_pickle(pkl_dir)
    return features_hmp


### 2.5 Color historgram features

In [0]:
def read_ColorHistogram(fname):
    """Scan Color Histogram from file
    Input file contains RGB histogram,
    Return a matrix of (3,256)"""
    RGB_Hist = np.zeros((3,256))
    with open(fname) as f:
        i_l = 0 # line index
        for line in f:
            pairs = line.split()
            hist_dict = {int(p.split(':')[0]):float(p.split(':')[1]) for p in pairs}
            for idx in hist_dict.keys():
                RGB_Hist[i_l,idx] = hist_dict[idx]
            i_l += 1
    return RGB_Hist

In [0]:
def hist_features_layer(layer_num, pkl_dir, raw_dir):
  '''
  If color histogram features are saved before return the dataframe
  else read text file, save as a dataframe and return it
  '''
  try:
    features_hist_0 = pd.read_pickle(pkl_dir.format(layer_num))
    return features_hist_0
  except:
    hist_features_dict = []
    hist_path = Path(raw_dir)
    for file in tqdm(list(hist_path.glob('*-{}.txt'.format(layer_num)))):
      # only process first frame of each video
      key = file.with_suffix('.webm').name.replace('-{}'.format(layer_num), '')
      hist_features_dict.append([key] + read_ColorHistogram(file).flatten().tolist())

    features_hist = pd.DataFrame(hist_features_dict)
    features_hist.columns = ['video'] + ['hist_{}_{}'.format(layer_num, i) for i in range(768)]
    features_hist.to_pickle(pkl_dir.format(layer_num))
    return features_hist

### 2.6 Load features

In this section data is loaded fromt the functions implemented in previous section. Data form test and training folders are loaded and saved seperrately.

In [0]:
features_captions = get_features_captions('./upwork/features_captions.pkl', './CA684_Assignment/Dev-set/Captions/dev-set_video-captions.txt')
test_features_captions = get_features_captions('./upwork/test/features_captions.pkl', './CA684_Assignment/Test-set/Captions_test/test-set-1_video-captions.txt')

features_inception_0 = get_inception_features_layer(0, './upwork/features_inception_{}.pkl', './CA684_Assignment/Dev-set/InceptionV3')
test_features_inception_0 = get_inception_features_layer(0, './upwork/test/features_inception_{}.pkl', './CA684_Assignment/Test-set/Inception_test')

features_inception_56 = get_inception_features_layer(56, './upwork/features_inception_{}.pkl', './CA684_Assignment/Dev-set/InceptionV3')
test_features_inception_56 = get_inception_features_layer(56, './upwork/test/features_inception_{}.pkl', './CA684_Assignment/Test-set/Inception_test')

features_inception_112 = get_inception_features_layer(112, './upwork/features_inception_{}.pkl', './CA684_Assignment/Dev-set/InceptionV3')
test_features_inception_112 = get_inception_features_layer(112, './upwork/test/features_inception_{}.pkl', './CA684_Assignment/Test-set/Inception_test')

features_c3d = get_c3d_features('./upwork/features_c3d.pkl', './CA684_Assignment/Dev-set/C3D')
test_features_c3d = get_c3d_features('./upwork/test/features_c3d.pkl', './CA684_Assignment/Test-set/C3D_test')

features_hmp = get_hmp_features('./upwork/features_hmp.pkl', './CA684_Assignment/Dev-set/HMP')
test_features_hmp = get_hmp_features('./upwork/test/features_hmp.pkl', './CA684_Assignment/Test-set/HMP_test')

features_hist_0 = hist_features_layer(0, './upwork/features_hist_{}.pkl', './CA684_Assignment/Dev-set/ColorHistogram')
test_features_hist_0 = hist_features_layer(0, './upwork/test/features_hist_{}.pkl', './CA684_Assignment/Test-set/ColorHistogram_test')

features_hist_56 = hist_features_layer(56, './upwork/features_hist_{}.pkl', './CA684_Assignment/Dev-set/ColorHistogram')
test_features_hist_56 = hist_features_layer(56, './upwork/test/features_hist_{}.pkl', './CA684_Assignment/Test-set/ColorHistogram_test')

features_hist_112 = hist_features_layer(112, './upwork/features_hist_{}.pkl', './CA684_Assignment/Dev-set/ColorHistogram')
test_features_hist_112 = hist_features_layer(112, './upwork/test/features_hist_{}.pkl', './CA684_Assignment/Test-set/ColorHistogram_test')



# 3. Training data

Split and preprocess the data in this section. Before preprocessing all the features are merged into a single pandas dataframe. Then Split it to 0.2 ratio and finally min max scaler used to scale between 0 and 1

In [0]:
# merge all the feature dataframes using 'video' name index
all_features = [features_captions, features_c3d, features_hmp, features_inception_0, features_inception_56, features_inception_112, features_hist_0, features_hist_112]
all_features_columns = []
for df in all_features:
  df.set_index('video', inplace=True)
  all_features_columns += df.columns.to_list()

In [0]:
# index ground truth by video name
labels.set_index('video', inplace=True)

In [0]:
# merge datasets
full_dataset_pd = labels.copy()
for df in all_features:
  full_dataset_pd = pd.merge(full_dataset_pd, df, left_index=True, right_index=True)

In [0]:
# split dataset into validation and training sets with 0.2 rato
Y = full_dataset_pd[['short-term_memorability','long-term_memorability']].values # targets
X = full_dataset_pd[all_features_columns].values.astype(np.float32)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=42) # random state for reproducability
len_token = X_train.shape[1]

In [0]:
# scale dataset to range [0,1]
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

# 4. Evaluate

Script to evaluate the predictions

In [0]:
def Get_score(Y_pred,Y_true):
    '''Calculate the Spearmann"s correlation coefficient'''
    Y_pred = np.squeeze(Y_pred)
    Y_true = np.squeeze(Y_true)
    if Y_pred.shape != Y_true.shape:
        print('Input shapes don\'t match!')
    else:
        if len(Y_pred.shape) == 1:
            Res = pd.DataFrame({'Y_true':Y_true,'Y_pred':Y_pred})
            score_mat = Res[['Y_true','Y_pred']].corr(method='spearman',min_periods=1)
            print('The Spearman\'s correlation coefficient is: %.3f' % score_mat.iloc[1][0])
        else:
            for ii in range(Y_pred.shape[1]):
                Get_score(Y_pred[:,ii],Y_true[:,ii])

# 5. Gradient boosting model

Two LGBM models are trained seperately for long term and short term memorability prediction

In [47]:
# Gradient boosting regressor 
# gbm_0 is for the short term memorability
gbm_0 = lgb.LGBMRegressor(num_leaves=55,
                        learning_rate=0.05,
                        early_stopping_rounds=5,
                        n_estimators=100)
gbm_0.fit(X_train, Y_train[:, 0],
        eval_set=[(X_test, Y_test[:, 0])],
        eval_metric='l2',
        early_stopping_rounds=5)
pred = gbm_0.predict(X_test)
Get_score(pred, Y_test[:, 0])



[1]	valid_0's l2: 0.00611871	valid_0's l2: 0.00611871
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.00603924	valid_0's l2: 0.00603924
[3]	valid_0's l2: 0.00598137	valid_0's l2: 0.00598137
[4]	valid_0's l2: 0.00593721	valid_0's l2: 0.00593721
[5]	valid_0's l2: 0.00587094	valid_0's l2: 0.00587094
[6]	valid_0's l2: 0.00582673	valid_0's l2: 0.00582673
[7]	valid_0's l2: 0.00578061	valid_0's l2: 0.00578061
[8]	valid_0's l2: 0.00574117	valid_0's l2: 0.00574117
[9]	valid_0's l2: 0.00570729	valid_0's l2: 0.00570729
[10]	valid_0's l2: 0.00566665	valid_0's l2: 0.00566665
[11]	valid_0's l2: 0.0056238	valid_0's l2: 0.0056238
[12]	valid_0's l2: 0.00560763	valid_0's l2: 0.00560763
[13]	valid_0's l2: 0.00557312	valid_0's l2: 0.00557312
[14]	valid_0's l2: 0.00554553	valid_0's l2: 0.00554553
[15]	valid_0's l2: 0.00550382	valid_0's l2: 0.00550382
[16]	valid_0's l2: 0.00548003	valid_0's l2: 0.00548003
[17]	valid_0's l2: 0.00545648	valid_0's l2: 0.00545648
[18]	valid_0's 

In [48]:
# Gradient boosting regressor 
# gbm_0 is for the long term memorability

gbm_1 = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
gbm_1.fit(X_train, Y_train[:, 1],
        eval_set=[(X_test, Y_test[:, 1])],
        eval_metric='l1',
        early_stopping_rounds=5)
pred = gbm_0.predict(X_test)
Get_score(pred, Y_test[:, 1])

[1]	valid_0's l2: 0.0218202	valid_0's l1: 0.118063
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.0217785	valid_0's l1: 0.117918
[3]	valid_0's l2: 0.0217243	valid_0's l1: 0.117757
[4]	valid_0's l2: 0.0216927	valid_0's l1: 0.117605
[5]	valid_0's l2: 0.0216572	valid_0's l1: 0.117474
[6]	valid_0's l2: 0.021621	valid_0's l1: 0.117436
[7]	valid_0's l2: 0.0215804	valid_0's l1: 0.11729
[8]	valid_0's l2: 0.021567	valid_0's l1: 0.117227
[9]	valid_0's l2: 0.0215521	valid_0's l1: 0.117231
[10]	valid_0's l2: 0.0215815	valid_0's l1: 0.117317
[11]	valid_0's l2: 0.0216079	valid_0's l1: 0.117358
[12]	valid_0's l2: 0.0216016	valid_0's l1: 0.117292
[13]	valid_0's l2: 0.021612	valid_0's l1: 0.117321
Early stopping, best iteration is:
[8]	valid_0's l2: 0.021567	valid_0's l1: 0.117227
The Spearman's correlation coefficient is: 0.188


# 6. Regression models

In [106]:
from sklearn import linear_model
reg = linear_model.Ridge(alpha=120).fit(X_train, Y_train[:,0])
pred = reg.predict(X_test)
Get_score(pred, Y_test_new[:, 0])

The Spearman's correlation coefficient is: 0.409


In [111]:
from sklearn import linear_model
reg = linear_model.RidgeCV(alphas=[ 1, 10, 100, 1000, 10000]).fit(X_train, Y_train[:,0])
pred = reg.predict(X_test)
Get_score(pred, Y_test_new[:, 0])

The Spearman's correlation coefficient is: 0.408


# 7. Predict

In this section data from the test folder of the dataset is processed. Then feed them to the previously trained LGBM models and result saved to a CSV file. 

In [0]:
# Merge all the test dataset features into variable 'test_full_dataset_pd'
test_all_features = [test_features_captions, test_features_c3d, test_features_hmp, 
                test_features_inception_0, test_features_inception_56, 
                test_features_inception_112, test_features_hist_0, 
                test_features_hist_112]

# set the index for all the features
test_all_features_columns = []
for df in test_all_features:
  df.set_index('video', inplace=True)
  test_all_features_columns += df.columns.to_list()

# merging features
test_full_dataset_pd = test_all_features[0].copy()
for df in test_all_features[1:]:
  test_full_dataset_pd = pd.merge(test_full_dataset_pd, df, left_index=True, right_index=True)

test_X = test_full_dataset_pd[test_all_features_columns].values.astype(np.float32)
# scaling the dataset
test_X = min_max_scaler.transform(test_X)


In [0]:
short_term_pred = gbm_0.predict(test_X)
long_term_pred = gbm_1.predict(test_X)

In [0]:
pd.DataFrame({'short-term_memorability':short_term_pred, 'long-term_memorability':long_term_pred}).to_csv('./upwork/prediction.csv')