* word based representations
* model based representations
* variable length representations (Bi-LSTM)
* ULMFit
* BERT..

# Data

In [1]:
import time
import string 
import pickle
import itertools
import numpy as np
import pandas as pd
from pprint import pprint

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# from nltk.corpus import stopwords
from sklearn.linear_model import SGDClassifier, SGDRegressor, LogisticRegression, LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

from tqdm.notebook import tqdm
from nltk.tokenize import RegexpTokenizer
from gensim.models import KeyedVectors

In [2]:
import random

print ("Random number with seed 42")
random.seed(42)

Random number with seed 42


In [3]:
# df_st = pd.read_csv('me_2020/short_term_annotations_v2.csv', )
# df_lt = pd.read_csv('me_2020/long_term_annotations_v2.csv', )
df_text = pd.read_csv('me_2020/official_video_descriptions.csv', )
df_scores = pd.read_csv('me_2020/scores_v2.csv', )

In [4]:
len(df_text)

2191

In [5]:
# df_text.tail(8)

In [6]:
# df_scores.tail()

In [7]:
text_concat = df_text[['video_id','description']].groupby(['video_id'])['description'].transform(lambda x: '  '.join(x)).drop_duplicates()

In [8]:
len(text_concat)

590

In [9]:
deep_captions = [ l[8:].strip() for l in open('me_2020/deep_captions.txt')]
deep_captions[588]

'a man is sitting on a toilet and drinking from a water bottle'

In [10]:
df_data = df_scores.copy()
df_data['text'] = text_concat.values
df_data['short_term'] = df_data['part_1_scores']
df_data['long_term'] = df_data['part_2_scores']
df_data['deep_caption'] = deep_captions[:590]
df_data = df_data[['video_id', 'text', 'deep_caption', 'short_term', 'long_term']]

In [11]:
df_data.head(30)

Unnamed: 0,video_id,text,deep_caption,short_term,long_term
0,8,2 men on a stage hug and walk away two young ...,a man is dancing on stage with a band,0.75,0.57
1,26,deadpool drinking from a cup persons in costu...,a woman is sitting on a couch and talking,0.87,0.43
2,33,a soccer player shoots a ball into a tiny goal...,a soccer player is kicking a ball into the net,0.69,0.75
3,46,multiple chinese persons dance onstage on a p...,a band is performing on stage and the singer s...,0.87,0.57
4,64,an asian man in a clothing store embraces an o...,a man in a suit is dancing and singing,0.84,0.56
5,70,"at night, two women walks next to the children...",a group of men are dancing and singing,0.74,1.0
6,74,3 women spinning in circles 3 women change th...,a group of girls are dancing in a room,0.91,1.0
7,117,person throws a chain of lights indoors a per...,a man is singing and dancing in a room,0.89,0.9
8,139,2 asian men in a park at daytime hold each oth...,a man is dancing with a group of other men,0.82,0.8
9,143,a man plays a violin indoors inside a room a ...,a man is playing a guitar and singing into a m...,0.81,0.86


In [12]:
df_data.to_csv('me2020_data.csv')

# Feature extraction

In [13]:
from nltk.stem import WordNetLemmatizer 

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

In [14]:
lemmatizer.lemmatize('throws')

'throw'

In [15]:
# w2v = KeyedVectors.load_word2vec_format('../conceptnet/glove.6B/glove.w2v.6B.300d.txt')
# pickle.dump(w2v, open('../conceptnet/glove.6B/glove.w2v.6B.300d.pickle', 'wb'))

In [16]:
# text_concat = df_text[['video_id','description']].groupby(['video_id'])['description'].transform(lambda x: '  '.join(x)).drop_duplicates()

In [17]:
w2v = pickle.load(open('../conceptnet/glove.6B/glove.w2v.6B.300d.pickle', 'rb'))

In [18]:
stopwords = ["a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can", "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't", "have", "haven", "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "ll", "m", "ma", "me", "mightn", "mightn't", "more", "most", "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not", "now", "o", "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s", "same", "shan", "shan't", "she", "she's", "should", "should've", "shouldn", "shouldn't", "so", "some", "such", "t", "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we", "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won", "won't", "wouldn", "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought", "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're", "we've", "what's", "when's", "where's", "who's", "why's", "would"]

def tokenize(s):
    numbers = {'2': 'two', '3': 'three', '4': 'four'}
    s = ''.join(c for c in s if c not in string.punctuation or c == ' ').lower()
    t = RegexpTokenizer(r'\w+').tokenize(s)
    t = [lemmatizer.lemmatize(w) if w not in numbers else numbers[w] for w in t if w not in stopwords]
    return ' '.join(t)

def tokenize2(s): # keep stopwords and don't lemmatize
    numbers = {'2': 'two', '3': 'three', '4': 'four'}
    s = ''.join(c for c in s if c not in string.punctuation or c == ' ').lower()
    t = RegexpTokenizer(r'\w+').tokenize(s)
    t = [w if w not in numbers else numbers[w] for w in t]
    return ' '.join(t)

def tokenize3(s): # remove duplicates
    numbers = {'2': 'two', '3': 'three', '4': 'four'}
    s = ''.join(c for c in s if c not in string.punctuation or c == ' ').lower()
    t = RegexpTokenizer(r'\w+').tokenize(s)
    t = [w if w not in numbers else numbers[w] for w in t if w not in stopwords]
    return ' '.join(set(t))


In [19]:
df_data = pd.read_csv('me2020_data.csv')

In [20]:
df_data.head()

Unnamed: 0.1,Unnamed: 0,video_id,text,deep_caption,short_term,long_term
0,0,8,2 men on a stage hug and walk away two young ...,a man is dancing on stage with a band,0.75,0.57
1,1,26,deadpool drinking from a cup persons in costu...,a woman is sitting on a couch and talking,0.87,0.43
2,2,33,a soccer player shoots a ball into a tiny goal...,a soccer player is kicking a ball into the net,0.69,0.75
3,3,46,multiple chinese persons dance onstage on a p...,a band is performing on stage and the singer s...,0.87,0.57
4,4,64,an asian man in a clothing store embraces an o...,a man in a suit is dancing and singing,0.84,0.56


In [21]:
df_data['content'] = df_data['text'] + '  ' + df_data['deep_caption']

In [29]:
corpus = df_data.text.values
corpus_tokenized = [tokenize3(s) for s in corpus]
corpus_tokenized2 = [tokenize2(s) for s in corpus]
# corpus_tokenized3 = [tokenize3(s) for s in corpus]

In [23]:
vectorizer = TfidfVectorizer(min_df=4, stop_words='english', ngram_range=(1, 2))
vectorizer.fit(corpus_tokenized)

X_train = vectorizer.transform(corpus)

print(X_train.shape)
print(X_train.toarray())

(590, 909)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [24]:
X_tfidf = []
X_w2v   = []
Y = []

for i, entry in tqdm(df_data.iterrows(), total=len(df_data)):
    text = tokenize(entry['text'])
    y = (entry['short_term'], entry['long_term'])
    
    x_tfidf = vectorizer.transform([text]).toarray()[0]
    words = [word for word in text.split(' ') if word in w2v]
    x_w2v = np.zeros([300]) if not words else np.mean([w2v[word] for word in words], axis=0)
    
    X_tfidf.append(x_tfidf)
    X_w2v.append(x_w2v)
    Y.append(y)

HBox(children=(FloatProgress(value=0.0, max=590.0), HTML(value='')))




In [25]:
X_tfidf = np.array(X_tfidf)
X_w2v = np.array(X_w2v)
Y = np.array(Y)

X_tfidf.shape, X_w2v.shape, Y.shape

((590, 909), (590, 300), (590, 2))

In [26]:
# !git clone git@github.com:UKPLab/sentence-transformers.git
# !cd sentence-transformers
# !pip install .

In [27]:
import scipy
import numpy as np
from sentence_transformers import models, SentenceTransformer

# model = SentenceTransformer('distiluse-base-multilingual-cased')
sbert1 = SentenceTransformer('distiluse-base-multilingual-cased')
sbert2 = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')




  return torch._C._cuda_getDeviceCount() > 0


In [30]:
corpus_tokenized2[:3]

['two men on a stage hug and walk away two young man are standing on a stage embracing each other and one claps the other ones ass',
 'deadpool drinking from a cup persons in costumes dance',
 'a soccer player shoots a ball into a tiny goal on a soccer field at daytime soccer ball rolling into a small goal on a soccer field at daytime']

In [31]:
%%time
bert1_embeddings = sbert1.encode(corpus_tokenized2)
bert2_embeddings = sbert2.encode(corpus_tokenized2)

CPU times: user 3min 35s, sys: 1.05 s, total: 3min 36s
Wall time: 58.5 s


In [32]:
bert1_embeddings.shape, bert2_embeddings.shape

((590, 512), (590, 768))

# Models

In [33]:
from sklearn.model_selection import KFold
from scipy.stats import spearmanr
spearman = lambda x,y: spearmanr(x, y).correlation

In [34]:
def enumerate_models(models):
    instances = []
    for model_name, (model, hyperparameters) in models.items():
        configs = {}
        if len(hyperparameters) > 0:
            params, vals = list(hyperparameters.keys()), list(hyperparameters.values())
            configs = [dict(zip(params, vv)) for vv in list(itertools.product(*vals))]
            for config in configs:
                m = model(**config)
                instances.append(m)
        else:
            instances.append(model())
    return instances

In [35]:
regression_models = {
    # 'LogisticRegression': (LogisticRegression, {"C": [1e3, 1, 1e-3], "penalty": ['l1', 'l2', 'elasticnet']}),
    # 'LinearRegression': (LinearRegression, {}),
    #'MLPRegressor': (MLPRegressor, {'alpha': [1e-3,  1e-7], 'hidden_layer_sizes': [(10,), (100,)]}), # 1e-5,, (50,), 
    # 'SGDRegressor': (SGDRegressor, {'alpha': [0.0001, 0.1,]}),
    'SVR': (SVR, {'kernel': ['linear', 'rbf'], "C": [1e-3, 1e-4, 1e-5, 1e-7], "gamma": ["scale"]})
}
len(enumerate_models(regression_models))

8

In [36]:
X = {'w2v':X_w2v}# 'tfidf': X_tfidf, 'w2v':X_w2v, 'bert1': bert1_embeddings, 'bert2': bert2_embeddings}
Y_st = Y[:, 0]
Y_lt = Y[:, 1]

In [37]:
folds = {}
print('Short term memorability prediction:'.upper())

for k in X:
    folds[k] = {}
    print('\nFeatures:', k.upper(), '\n')
    for regressor in enumerate_models(regression_models):
        model_name = str(regressor)
        folds[k][model_name] = []
        kf = KFold(n_splits=6, random_state=42)
        print('Training', model_name, '..')
        for i, (train_index, test_index) in enumerate(kf.split(X[k])):
            print('Fold #'+ str(i), end='.. ')
            t = time.time()
            X_train, X_test = X[k][train_index], X[k][test_index]
            y_train, y_test = Y_st[train_index], Y_st[test_index]
            regressor.fit(X_train, y_train)
            y_pred = regressor.predict(X_test)
            folds[k][model_name].append((y_pred, y_test))
            print(f'done! ({(time.time() - t):.2} secs). Spearman: {spearman(y_pred, y_test):.2}')
            
            t = time.time()

SHORT TERM MEMORABILITY PREDICTION:

Features: W2V 

Training SVR(C=0.001, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False) ..
Fold #0.. done! (0.059 secs). Spearman: 0.19
Fold #1.. done! (0.12 secs). Spearman: 0.32
Fold #2.. done! (0.056 secs). Spearman: 0.31
Fold #3.. done! (0.052 secs). Spearman: 0.17
Fold #4.. done! (0.05 secs). Spearman: 0.24
Fold #5.. done! (0.044 secs). Spearman: 0.22
Training SVR(C=0.0001, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False) ..
Fold #0.. done! (0.044 secs). Spearman: 0.23
Fold #1.. done! (0.042 secs). Spearman: 0.3
Fold #2.. done! (0.043 secs). Spearman: 0.29
Fold #3.. done! (0.043 secs). Spearman: 0.17
Fold #4.. done! (0.041 secs). Spearman: 0.25
Fold #5.. done! (0.042 secs). Spearman: 0.24
Training SVR(C=1e-05, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma

In [38]:
folds_lt = {}
print('Long term memorability prediction:'.upper())

for k in X:
    folds_lt[k] = {}
    print('\nFeatures:', k.upper(), '\n')
    for regressor in enumerate_models(regression_models):
        model_name = str(regressor)
        folds_lt[k][model_name] = []
        kf = KFold(n_splits=6, random_state=42)
        print('Training', model_name, '..')
        for i, (train_index, test_index) in enumerate(kf.split(X[k])):
            print('Fold #'+ str(i), end='.. ')
            t = time.time()
            X_train, X_test = X[k][train_index], X[k][test_index]
            y_train, y_test = Y_lt[train_index], Y_lt[test_index]
            regressor.fit(X_train, y_train)
            y_pred = regressor.predict(X_test)
            folds_lt[k][model_name].append((y_pred, y_test))
            print(f'done! ({(time.time() - t):.2} secs). Spearman: {spearman(y_pred, y_test):.2}')
            
            t = time.time()

LONG TERM MEMORABILITY PREDICTION:

Features: W2V 

Training SVR(C=0.001, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False) ..
Fold #0.. done! (0.07 secs). Spearman: 0.1
Fold #1.. done! (0.058 secs). Spearman: 0.24
Fold #2.. done! (0.059 secs). Spearman: 0.029
Fold #3.. done! (0.071 secs). Spearman: 0.11
Fold #4.. done! (0.061 secs). Spearman: 0.23
Fold #5.. done! (0.067 secs). Spearman: 0.09
Training SVR(C=0.0001, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False) ..
Fold #0.. done! (0.072 secs). Spearman: 0.11
Fold #1.. done! (0.061 secs). Spearman: 0.23
Fold #2.. done! (0.062 secs). Spearman: 0.05
Fold #3.. done! (0.064 secs). Spearman: 0.11
Fold #4.. done! (0.063 secs). Spearman: 0.2
Fold #5.. done! (0.059 secs). Spearman: 0.12
Training SVR(C=1e-05, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma

In [63]:
X_train[0]

array([-2.19063044e-01,  1.45368934e-01, -8.50737542e-02, -2.10372999e-01,
       -1.34135514e-01,  1.24508686e-01, -6.43252209e-02, -5.17396107e-02,
        9.57071558e-02, -8.63436818e-01,  1.36617497e-01, -5.75112402e-02,
       -2.63032019e-01, -8.97753164e-02,  1.77866668e-01,  6.64938092e-02,
       -2.22639516e-01, -7.07390085e-02,  7.00964555e-02,  8.10312852e-03,
       -1.54150411e-01,  1.43767998e-01,  1.09692194e-01,  1.03234716e-01,
       -5.83458841e-02, -6.36675432e-02,  6.08831942e-02, -9.46811866e-03,
       -3.16653326e-02,  6.22601882e-02,  8.41747504e-04,  6.20156787e-02,
        3.21646854e-02, -5.21318987e-04, -7.99848020e-01,  1.57144621e-01,
        9.71353203e-02,  1.96526255e-02,  1.27128670e-02,  4.96831834e-02,
        1.73308149e-01, -1.20428286e-01, -1.51762381e-01,  1.49401249e-02,
        3.21771875e-02,  2.18385071e-01,  3.08026493e-01,  3.70095596e-02,
       -4.69664782e-02,  2.11752623e-01, -1.14842892e-01, -2.38708079e-01,
       -4.24128771e-03,  

In [64]:
X_train = X["w2v"]
y_train = Y_st
y_lt_train = Y_lt
model = SVR(kernel= 'linear', C=1e-5)
model.fit(X_train, y_lt_train)#y_train)
y_pred = model.predict(X_test)

In [65]:
pickle.dump(model, open('me20_svr_w2v_lt_model.pickle', 'wb'))

In [66]:
y_x_train = model.predict(X_train)

In [67]:
y_pred

array([0.77997079, 0.7797642 , 0.78008795, 0.78022108, 0.77986659,
       0.77965182, 0.77994218, 0.77976779, 0.77944793, 0.78015572,
       0.78000288, 0.77973616, 0.78004828, 0.77997102, 0.78017335,
       0.77995378, 0.77985305, 0.77986139, 0.7799766 , 0.78014961,
       0.77986225, 0.77998676, 0.7801218 , 0.78008001, 0.78000766,
       0.78001682, 0.77994058, 0.77955221, 0.78009712, 0.77996546,
       0.77988701, 0.77984274, 0.77991277, 0.77988426, 0.78013506,
       0.77994658, 0.77991734, 0.77978485, 0.78012153, 0.77977709,
       0.78007222, 0.77995545, 0.77974254, 0.77996164, 0.77982887,
       0.77985802, 0.77998548, 0.77997278, 0.78002488, 0.77980092,
       0.78007699, 0.78000446, 0.77981268, 0.77994634, 0.77991927,
       0.7796855 , 0.77984967, 0.77995579, 0.77998625, 0.77990014,
       0.78004567, 0.77997777, 0.77992922, 0.77985333, 0.7800225 ,
       0.78004387, 0.78000878, 0.7800946 , 0.77989137, 0.78009467,
       0.77979589, 0.77929193, 0.77971318, 0.77941065, 0.77985

In [53]:
scaler  = MinMaxScaler()
y_range = scaler.fit_transform(y_x_train.reshape(-1, 1)) # 

In [54]:
y_range

array([[0.31803164],
       [0.44820742],
       [0.20585157],
       [0.36644582],
       [0.4579202 ],
       [0.31014022],
       [0.51386409],
       [0.64925758],
       [0.39145064],
       [0.38564609],
       [0.46151101],
       [0.44995207],
       [0.43926785],
       [0.55292696],
       [0.47636936],
       [0.4786676 ],
       [0.39763412],
       [0.41138989],
       [0.26707103],
       [0.7882852 ],
       [0.64806453],
       [0.58608948],
       [0.46933737],
       [0.61456129],
       [0.45628901],
       [0.82027662],
       [0.65242422],
       [0.04640779],
       [0.5684666 ],
       [0.52909859],
       [0.46269163],
       [0.40234362],
       [0.67198139],
       [0.80897008],
       [0.43794534],
       [0.19377595],
       [0.39657238],
       [0.25891477],
       [0.43207796],
       [0.48542031],
       [0.07088518],
       [0.18776335],
       [0.55021202],
       [0.61777229],
       [0.83209362],
       [0.8479839 ],
       [0.43676525],
       [0.338

In [60]:
scaler.transform(np.array([0.83]).reshape(1, -1))

array([[0.49061436]])

In [None]:
X_w2v   = []

for i, entry in tqdm(df_data.iterrows(), total=len(df_data)):
    text = tokenize(entry['text'])
    
    words = [word for word in text.split(' ') if word in w2v]
    x_w2v = np.zeros([300]) if not words else np.mean([w2v[word] for word in words], axis=0)
    
    X_w2v.append(x_w2v)

In [40]:
for term, all_folds in [('Short term', folds), ('Long term', folds_lt), ('Long Short term', folds_lt)]:
    print(term.upper())
    for embedding in all_folds:
        print('  USING', embedding)
        for i, model_name in enumerate(all_folds[embedding]):
            # print('    ', (model_name.split('(')[0] + ' ' + str(i+1)).ljust(18), '\t', end=' ')
            print('    ', model_name, '\t', end=' ')
            # print(', '.join([str(spearman(y_p, y_t)) for y_p, y_t in all_folds[embedding][model_name]]))
            if term == 'Long Short term':
                sps = [spearman(ys_p, yl_t) for (ys_p, ys_t), (yl_p, yl_t)
                       in zip(folds[embedding][model_name], folds_lt[embedding][model_name])]
            else:
                sps = [spearman(y_p, y_t) for y_p, y_t in all_folds[embedding][model_name]]
            print(round(sum(sps)/len(sps), 4))

SHORT TERM
  USING w2v
     SVR(C=0.001, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 	 0.2424
     SVR(C=0.0001, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 	 0.246
     SVR(C=1e-05, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 	 0.2506
     SVR(C=1e-07, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 	 0.2506
     SVR(C=0.001, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False) 	 0.2451
     SVR(C=0.0001, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001

In [43]:
ismail_st = folds['w2v']["SVR(C=1e-05, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',\n    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)"]
ismail_lt = folds_lt['w2v']["SVR(C=1e-05, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',\n    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)"]

# Jorma's results

In [None]:
viz_st = pd.read_csv('me_2020/me20in_memad_shorterm_a.csv', header=None)[[1]].values
viz_lt = pd.read_csv('me_2020/me20in_memad_longterm_a.csv', header=None)[[1]].values
scores_st = df_scores[['part_1_scores']].values
scores_lt = df_scores[['part_2_scores']].values

In [None]:
viz_lt.shape

In [None]:
scores_st = df_scores[['part_1_scores']].values
scores_lt = df_scores[['part_2_scores']].values

In [None]:
spearman(viz_st, scores_st)

In [None]:
spearman(scores_lt, viz_lt)

# Training on ME2019 data

In [None]:
me19 = pd.read_csv('me_2020/me19_training_data.csv')
me19.head()

In [None]:
X19_tfidf = []
X19_w2v   = []
Y19 = []

for i, entry in tqdm(me19.iterrows(), total=len(me19)):
    text = tokenize(entry['text'])
    y = (entry['short_term'], entry['long_term'])
    
    x_tfidf = vectorizer.transform([text]).toarray()[0]
    words = [word for word in text.split(' ') if word in w2v]
    x_w2v = np.zeros([300]) if not words else np.mean([w2v[word] for word in words], axis=0)
    
    assert(x_tfidf.shape == (991,))
    assert(x_w2v.shape == (300,))
    
    X19_tfidf.append(x_tfidf)
    X19_w2v.append(x_w2v)
    Y19.append(y)

In [None]:
X19_tfidf = np.array(X19_tfidf)
X19_w2v = np.array(X19_w2v)
Y19 = np.array(Y19)

X19_tfidf.shape, X19_w2v.shape, Y19.shape

In [None]:
corpus19 = [l.lower() for l in  me19['text'].values]

In [None]:
bert19_embeddings = []
for i in tqdm(range(10)):
    embeddings = model.encode(corpus19[i*800: (i+1)*800])
    bert19_embeddings.append(embeddings)

bert19_embeddings = np.concatenate(bert19_embeddings)

In [None]:
X19 = {'tfidf': X19_tfidf, 'w2b':X19_w2v, 'bert': bert19_embeddings}
Y19_st = Y19[:, 0]
Y19_lt = Y19[:, 1]

In [None]:
"""
X = {'tfidf': X_tfidf, 'w2v':X_w2v, 'bert1': bert1_embeddings, 'bert2': bert2_embeddings}
Y_st = Y[:, 0]
Y_lt = Y[:, 1]me19_st = {}
for k in X19:
    # if k == 'tfidf': continue
    print(k)
    me19_st[k] = {}
    for regressor in enumerate_models(regression_models):
        model_name = str(regressor)
        me19_st[model_name] = []
        print('Training', model_name.split('(')[0], '..')
                
        t = time.time()
        X_train, X_test = X19[k], X[k]
        y_train, y_test = Y19_st, Y_st
        
        if model_name.startswith('SVR'):
            X_train, y_train = X_train[:600], y_train[:600]
        
        # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        me19_st[k][model_name] = (y_pred, y_test)
        print()
        print(f'done! ({(time.time() - t):2} secs)')
        print('Spearman:', round(spearman(y_pred, y_test), 3), '\n')
        t = time.time()
        
me19_lt = {}
for k in X19:
    # if k == 'tfidf': continue
    print(k)
    me19_lt[k] = {}
    for regressor in enumerate_models(regression_models):
        model_name = str(regressor)
        me19_st[model_name] = []
        print('Training', model_name.split('(')[0], '..')
        t = time.time()
        X_train, X_test = X19[k], X[k]
        y_train, y_test = Y19_lt, Y_lt
        
        # print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        me19_lt[k][model_name] = (y_pred, y_test)
        print()
        print(f'done! ({(time.time() - t):2} secs)')
        print('Spearman:', round(spearman(y_pred, y_test), 3), '\n')
        t = time.time()
"""

# Using the new devset

In [51]:
df_text_devdf_text_dev = pd.read_csv('me_2020/dev_text_descriptions.csv', )
df_scores_dev = pd.read_csv('me_2020/dev_scores.csv', )

In [52]:
df_text_dev.head()

Unnamed: 0,video_id,video_url,description
0,34,https://mtc.cdn.vine.co/r/videos_h264high/ABE6...,a girl is screaming and punching a man in a ha...
1,34,https://mtc.cdn.vine.co/r/videos_h264high/ABE6...,two girls getting frightened by big man in a s...
2,57,https://mtc.cdn.vine.co/r/videos_h264high/BEE8...,a person takes a cap of another person away
3,57,https://mtc.cdn.vine.co/r/videos_h264high/BEE8...,a man kicks and boxes another one slightly in ...
4,68,https://mtc.cdn.vine.co/r/videos_h264high/C65D...,a group of people playing volleyball in a spor...


In [53]:
df_scores_dev.head()

Unnamed: 0,video_id,video_url,ann_1,ann_2,part_1_scores,part_2_scores
0,34,https://mtc.cdn.vine.co/r/videos_h264high/ABE6...,13,7,0.92,0.86
1,57,https://mtc.cdn.vine.co/r/videos_h264high/BEE8...,12,9,1.0,0.78
2,68,https://mtc.cdn.vine.co/r/videos_h264high/C65D...,11,6,0.91,0.33
3,79,https://mtc.cdn.vine.co/r/videos_h264high/CF30...,12,8,0.92,0.88
4,81,https://mtc.cdn.vine.co/r/videos_h264high/CFC2...,16,6,0.94,1.0


In [56]:
dev_text_concat = df_text_dev[['video_id','description']].groupby(['video_id'])['description'].transform(lambda x: '  '.join(x)).drop_duplicates()

In [58]:
len(dev_text_concat)

410

In [63]:
df_dev_data = df_scores_dev[['video_id', 'part_1_scores', 'part_2_scores']].rename(columns={'part_1_scores':'short_term', 'part_2_scores':'long_term'})
df_dev_data['text'] = dev_text_concat.values
df_dev_data.head()

Unnamed: 0,video_id,short_term,long_term,text
0,34,0.92,0.86,a girl is screaming and punching a man in a ha...
1,57,1.00,0.78,a person takes a cap of another person away a...
2,68,0.91,0.33,a group of people playing volleyball in a spor...
3,79,0.92,0.88,a black woman punches a black man after a dia...
4,81,0.94,1.00,a woman plays with a hula hoop outside at dayt...
5,92,0.83,1.00,a man in a park lights a bunch of leaves a ma...
6,95,0.86,0.67,a woman is shown in different scenes woman si...
7,107,1.00,0.60,a person smashing a tv in a living room an ol...
8,111,0.91,1.00,a man wearing glasses a man says something in...
9,114,0.91,0.71,a child gets tackled by a goat at daytime sma...


In [64]:
X_dev_tfidf = []
X_dev_w2v   = []
Y_dev = []

for i, entry in tqdm(df_dev_data.iterrows(), total=len(df_dev_data)):
    text = tokenize(entry['text'])
    y = (entry['short_term'], entry['long_term'])
    
    x_tfidf = vectorizer.transform([text]).toarray()[0]
    words = [word for word in text.split(' ') if word in w2v]
    x_w2v = np.zeros([300]) if not words else np.mean([w2v[word] for word in words], axis=0)
    
    X_dev_tfidf.append(x_tfidf)
    X_dev_w2v.append(x_w2v)
    Y_dev.append(y)

HBox(children=(FloatProgress(value=0.0, max=410.0), HTML(value='')))




In [49]:
X.keys(), Y_st.shape, Y_lt.shape

(dict_keys(['tfidf', 'w2v', 'bert1', 'bert2']), (590,), (590,))

In [65]:
dev_corpus_tokenized2 = [tokenize2(s) for s in df_dev_data.text.values]

In [66]:
bert1_dev_embeddings = sbert1.encode(dev_corpus_tokenized2)
bert2_dev_embeddings = sbert2.encode(dev_corpus_tokenized2)

In [68]:
X_dev_tfidf = np.array(X_dev_tfidf)
X_dev_w2v   = np.array(X_dev_w2v)
Y_dev = np.array(Y_dev)

In [69]:
X_dev = {'tfidf':X_dev_tfidf , 'w2v':X_dev_w2v, 'bert1':bert1_dev_embeddings, 'bert2':bert2_dev_embeddings}
Y_dev_st = Y_dev[:, 0]
Y_dev_lt = Y_dev[:, 1]

In [75]:
dev_preds = {}
dev_results = {}
for term in ['Short term', 'Long term']:
    dev_preds[term] = {}
    dev_results[term] = {}
    
    for embedding in X.keys():
        dev_preds[term][embedding] = {}
        dev_results[term][embedding] = {}
        for regressor in enumerate_models(regression_models):
            model_name = str(regressor)
            print('Training', model_name.split(',')[0] + ')' , 'on', embedding, 'for', term, '..')
            
            t = time.time()
            
            X_train, X_test = X[embedding], X_dev[embedding]
            y_train = Y_st if 'Short' in term else Y_lt
            regressor.fit(X_train, y_train)
            y_pred = regressor.predict(X_test)
            dev_preds[term][embedding][model_name] = (y_pred)
            dev_results[term][embedding][model_name] = {
                'on_st': spearman(y_pred, Y_dev_st),
                'on_lt': spearman(y_pred, Y_dev_lt)
            }
            # print(f'done! ({(time.time() - t):.2} secs)')
            print(f'Spearman: ST {spearman(y_pred, Y_dev_st):.3}, / LT {spearman(y_pred, Y_dev_lt):.3}')

            t = time.time()

Training SVR(C=1.0) on tfidf for Short term ..
Spearman: ST -0.0134, / LT 0.0564
Training SVR(C=0.001) on tfidf for Short term ..
Spearman: ST -0.007, / LT 0.0911
Training SVR(C=1e-05) on tfidf for Short term ..
Spearman: ST -0.00393, / LT 0.0937
Training SVR(C=1.0) on tfidf for Short term ..
Spearman: ST -0.0279, / LT 0.0957
Training SVR(C=0.001) on tfidf for Short term ..
Spearman: ST -0.00668, / LT 0.0974
Training SVR(C=1e-05) on tfidf for Short term ..
Spearman: ST -0.00668, / LT 0.0974
Training SVR(C=1.0) on w2v for Short term ..
Spearman: ST -0.0355, / LT 0.0353
Training SVR(C=0.001) on w2v for Short term ..
Spearman: ST -0.032, / LT 0.0639
Training SVR(C=1e-05) on w2v for Short term ..
Spearman: ST -0.0333, / LT 0.0672
Training SVR(C=1.0) on w2v for Short term ..
Spearman: ST -0.0305, / LT 0.0937
Training SVR(C=0.001) on w2v for Short term ..
Spearman: ST -0.0275, / LT 0.0595
Training SVR(C=1e-05) on w2v for Short term ..
Spearman: ST -0.0271, / LT 0.0637
Training SVR(C=1.0) on 

In [None]:
for term, all_folds in ['Short term', 'Long term', 'Long Short term']:
    print(term.upper())
    for embedding in all_folds:
        print('  USING', embedding)
        for i, model_name in enumerate(all_folds[embedding]):
            print('    ', (model_name.split('(')[0] + ' ' + str(i+1)).ljust(18), '\t', end=' ')
            # print(', '.join([str(spearman(y_p, y_t)) for y_p, y_t in all_folds[embedding][model_name]]))
            if term == 'Long Short term':
                sps = [spearman(ys_p, yl_t) for (ys_p, ys_t), (yl_p, yl_t)
                       in zip(folds[embedding][model_name], folds_lt[embedding][model_name])]
            else:
                sps = [spearman(y_p, y_t) for y_p, y_t in all_folds[embedding][model_name]]
            print(round(sum(sps)/len(sps), 4))

# Combining everything

In [51]:
# Ground Truth
scores_st = df_scores['part_1_scores'].values.tolist()
scores_lt = df_scores['part_2_scores'].values.tolist()

NameError: name 'df_scores' is not defined

In [52]:
# Ismail's results
best_model = "SVR(C=1e-05, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',\n    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)"
ismail_st = [v for fold in folds['w2v'][best_model] for v in fold[0]]
ismail_lt = [v for fold in folds_lt['w2v'][best_model] for v in fold[0]]

NameError: name 'folds' is not defined

In [53]:
# Alison's results
alison_st_pkl = pickle.load(open('me_2020/6folds_st.pkl', 'rb'))
alison_lt_pkl = pickle.load(open('me_2020/6folds_lt.pkl', 'rb'))
alison_st = [v for fold in alison_st_pkl for v in fold[0]]
alison_lt = [v for fold in alison_lt_pkl for v in fold[0]]

In [54]:
# Jorma's results
jorma_st_df = pd.read_csv('me_2020/short_i3d+audio_80_750.csv', header=None)
jorma_lt_df = pd.read_csv('me_2020/long_i3d+audio_260_160.csv', header=None)
jorma_st = jorma_st_df[1].values.tolist()[:590]
jorma_lt = jorma_lt_df[1].values.tolist()[:590]

In [280]:
len(jorma_st_df)

1090

In [281]:
jorma_st_df

Unnamed: 0,0,1
0,8,0.823957
1,26,0.830174
2,33,0.754513
3,46,0.795868
4,64,0.812515
5,70,0.817742
6,74,0.863829
7,117,0.858145
8,139,0.859625
9,143,0.845034


In [182]:
combined_df = pd.DataFrame({'gt_st': scores_st,
                            'gt_lt': scores_lt,
                            'ismail_st': ismail_st,
                            'ismail_lt': ismail_lt,
                            'alison_st': alison_st,
                            'alison_lt': alison_lt,
                            'jorma_st': jorma_st,
                            'jorma_lt': jorma_lt,
                           })

In [184]:
combined_df.head()

Unnamed: 0,gt_st,gt_lt,ismail_st,ismail_lt,alison_st,alison_lt,jorma_st,jorma_lt
0,0.75,0.57,0.829869,0.779799,0.828981,0.775661,0.823957,0.737312
1,0.87,0.43,0.829948,0.780015,0.828585,0.776517,0.830174,0.779736
2,0.69,0.75,0.829665,0.779347,0.828457,0.774645,0.754513,0.735032
3,0.87,0.57,0.829922,0.779882,0.828381,0.776055,0.795868,0.753985
4,0.84,0.56,0.829948,0.780026,0.82911,0.775538,0.812515,0.7424


In [326]:
combined_df['video_id'] = df_scores['video_id']
combined_df.head()

Unnamed: 0,gt_st,gt_lt,ismail_st,ismail_lt,alison_st,alison_lt,jorma_st,jorma_lt,video_id
0,0.75,0.57,0.829869,0.779799,0.828981,0.775661,0.823957,0.737312,8
1,0.87,0.43,0.829948,0.780015,0.828585,0.776517,0.830174,0.779736,26
2,0.69,0.75,0.829665,0.779347,0.828457,0.774645,0.754513,0.735032,33
3,0.87,0.57,0.829922,0.779882,0.828381,0.776055,0.795868,0.753985,46
4,0.84,0.56,0.829948,0.780026,0.82911,0.775538,0.812515,0.7424,64


In [72]:
# combined_df.to_csv('me_2020/all_predictions_trainset.csv')
combined_df = pd.read_csv('me_2020/all_predictions_trainset.csv')

In [51]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [74]:
n = 8
ast =  combined_df['alison_st'].round(n)
ist =  combined_df['ismail_st'].round(n)
jst =  combined_df['jorma_st'].round(n)
alt =  combined_df['alison_lt'].round(n)
ilt =  combined_df['ismail_lt'].round(n)
jlt =  combined_df['jorma_lt'].round(n)

Unnamed: 0.1,Unnamed: 0,gt_st,gt_lt,ismail_st,ismail_lt,alison_st,alison_lt,jorma_st,jorma_lt,video_id
Unnamed: 0,1.0,-0.018153,0.050582,0.178238,-0.130942,0.097963,-0.074953,0.101225,0.02988,0.999292
gt_st,-0.018153,1.0,0.175425,0.196415,0.164939,0.181346,0.164563,0.265874,0.208154,-0.017932
gt_lt,0.050582,0.175425,1.0,0.124925,0.080175,0.100459,0.081669,0.176748,0.187687,0.050864
ismail_st,0.178238,0.196415,0.124925,1.0,0.46938,0.371026,0.092414,0.310438,0.323661,0.17641
ismail_lt,-0.130942,0.164939,0.080175,0.46938,1.0,0.215378,0.414216,0.348257,0.338621,-0.12827
alison_st,0.097963,0.181346,0.100459,0.371026,0.215378,1.0,0.39011,0.310084,0.249478,0.098228
alison_lt,-0.074953,0.164563,0.081669,0.092414,0.414216,0.39011,1.0,0.172048,0.198851,-0.067357
jorma_st,0.101225,0.265874,0.176748,0.310438,0.348257,0.310084,0.172048,1.0,0.668734,0.099026
jorma_lt,0.02988,0.208154,0.187687,0.323661,0.338621,0.249478,0.198851,0.668734,1.0,0.026183
video_id,0.999292,-0.017932,0.050864,0.17641,-0.12827,0.098228,-0.067357,0.099026,0.026183,1.0


In [125]:
ast_scaler = MinMaxScaler()
ist_scaler = MinMaxScaler()
jst_scaler = MinMaxScaler()
alt_scaler = MinMaxScaler()
ilt_scaler = MinMaxScaler()
jlt_scaler = MinMaxScaler()

In [126]:
ast = ast_scaler.fit_transform(combined_df['alison_st'].values.reshape(-1, 1))
ist = ist_scaler.fit_transform(combined_df['ismail_st'].values.reshape(-1, 1))
jst = jst_scaler.fit_transform(combined_df['jorma_st'].values.reshape(-1, 1))
alt = alt_scaler.fit_transform(combined_df['alison_lt'].values.reshape(-1, 1))
ilt = ilt_scaler.fit_transform(combined_df['ismail_lt'].values.reshape(-1, 1))
jlt = jlt_scaler.fit_transform(combined_df['jorma_lt'].values.reshape(-1, 1))

In [127]:
increment = 0.01
steps = int(1 / increment)
print("Steps:", steps)

best_combo = None
best_score = 0

for a in tqdm(range(steps+1)):
    for b in range(steps+1 - a):
        c = steps - (a + b)
        al = a / steps
        bl = b / steps
        cl = c / steps
        
        # print(al, bl, cl)
        score = spearman(al * ast + bl * ist + cl * jst, combined_df['gt_st'])
        if score > best_score:
            best_score = score
            best_combo = al, bl, cl
            # print('New best combo', best_combo, ':\t', best_score)
            print('Best combo', best_combo, ':\t', best_score)

Steps: 100


HBox(children=(FloatProgress(value=0.0, max=101.0), HTML(value='')))

Best combo (0.0, 0.0, 1.0) :	 0.2750845878773936
Best combo (0.0, 0.01, 0.99) :	 0.2763279119733419
Best combo (0.0, 0.02, 0.98) :	 0.2777200746105133
Best combo (0.0, 0.03, 0.97) :	 0.2793903189571508
Best combo (0.0, 0.04, 0.96) :	 0.28071831114097084
Best combo (0.0, 0.05, 0.95) :	 0.28194129181297406
Best combo (0.0, 0.06, 0.94) :	 0.28338581403190044
Best combo (0.0, 0.07, 0.93) :	 0.28508690112490953
Best combo (0.0, 0.08, 0.92) :	 0.2858367745319537
Best combo (0.0, 0.09, 0.91) :	 0.28637399378060274
Best combo (0.0, 0.1, 0.9) :	 0.2875367507013969
Best combo (0.0, 0.11, 0.89) :	 0.2894406105800301
Best combo (0.0, 0.12, 0.88) :	 0.29046023376979546
Best combo (0.0, 0.13, 0.87) :	 0.29102575233454653
Best combo (0.0, 0.14, 0.86) :	 0.2926837180522967
Best combo (0.0, 0.15, 0.85) :	 0.2930227543837389
Best combo (0.0, 0.16, 0.84) :	 0.2939182172475894
Best combo (0.0, 0.17, 0.83) :	 0.2948366002186961
Best combo (0.0, 0.18, 0.82) :	 0.2956165270316835
Best combo (0.0, 0.19, 0.81)

In [128]:
increment = 0.01
steps = int(1 / increment)
print("Steps:", steps)

best_combo = None
best_score = 0

for a in tqdm(range(steps+1)):
    for b in range(steps+1 - a):
        c = steps - (a + b)
        al = a / steps
        bl = b / steps
        cl = c / steps
        
        # print(al, bl, cl)
        score = spearman(al * alt + bl * ilt + cl * jlt, combined_df['gt_lt'])
        if score > best_score:
            best_score = score
            best_combo = al, bl, cl
            # print('New best combo', best_combo, ':\t', best_score)
            print('Best combo', best_combo, ':\t', best_score)

Steps: 100


HBox(children=(FloatProgress(value=0.0, max=101.0), HTML(value='')))

Best combo (0.0, 0.0, 1.0) :	 0.16402776554706433
Best combo (0.0, 0.01, 0.99) :	 0.16437446235122308
Best combo (0.0, 0.02, 0.98) :	 0.16456660898130665
Best combo (0.0, 0.03, 0.97) :	 0.16464781380711577
Best combo (0.0, 0.04, 0.96) :	 0.1649420749924422
Best combo (0.0, 0.05, 0.95) :	 0.16527047211754534
Best combo (0.0, 0.06, 0.94) :	 0.16570400441799185
Best combo (0.0, 0.07, 0.93) :	 0.16624765738326802
Best combo (0.0, 0.08, 0.92) :	 0.16653103847095085
Best combo (0.0, 0.09, 0.91) :	 0.166899818060893
Best combo (0.0, 0.1, 0.9) :	 0.1672761638642908
Best combo (0.0, 0.11, 0.89) :	 0.16803970624232137
Best combo (0.0, 0.12, 0.88) :	 0.16878025671589747
Best combo (0.0, 0.13, 0.87) :	 0.16922314414073297
Best combo (0.0, 0.14, 0.86) :	 0.16975714871753272
Best combo (0.01, 0.13, 0.86) :	 0.16984002514864022
Best combo (0.01, 0.14, 0.85) :	 0.16996768300589812
Best combo (0.01, 0.16, 0.83) :	 0.17002536805189541



0.07825951592626414

In [130]:
increment = 0.01
steps = int(1 / increment)
print("Steps:", steps)

best_combo = None
best_score = 0

for a in tqdm(range(steps+1)):
    for b in range(steps+1 - a):
        c = steps - (a + b)
        al = a / steps
        bl = b / steps
        cl = c / steps
        
        # print(al, bl, cl)
        score = spearman(al * ast + bl * ist + cl * jst, combined_df['gt_lt'])
        if score > best_score:
            best_score = score
            best_combo = al, bl, cl
            # print('New best combo', best_combo, ':\t', best_score)
            print('Best combo', best_combo, ':\t', best_score)

Steps: 100


HBox(children=(FloatProgress(value=0.0, max=101.0), HTML(value='')))

Best combo (0.0, 0.0, 1.0) :	 0.17828660533583537
Best combo (0.0, 0.01, 0.99) :	 0.17915869681156274
Best combo (0.0, 0.02, 0.98) :	 0.17993253275906204
Best combo (0.0, 0.03, 0.97) :	 0.18120535755132205
Best combo (0.0, 0.04, 0.96) :	 0.1817569872997763
Best combo (0.0, 0.05, 0.95) :	 0.1830555606789126
Best combo (0.0, 0.06, 0.94) :	 0.18352921150651944
Best combo (0.0, 0.07, 0.93) :	 0.18450214444297436
Best combo (0.0, 0.08, 0.92) :	 0.18515914397804192
Best combo (0.0, 0.09, 0.91) :	 0.18596846488012014
Best combo (0.0, 0.1, 0.9) :	 0.18686168863761185
Best combo (0.0, 0.11, 0.89) :	 0.18810809033559453
Best combo (0.0, 0.12, 0.88) :	 0.1894265470198585
Best combo (0.0, 0.13, 0.87) :	 0.19017346132413182
Best combo (0.0, 0.14, 0.86) :	 0.19047910115605046
Best combo (0.0, 0.15, 0.85) :	 0.19134942098831398
Best combo (0.0, 0.16, 0.84) :	 0.1917955636445199
Best combo (0.0, 0.17, 0.83) :	 0.19265253996080542
Best combo (0.0, 0.18, 0.82) :	 0.19319267375703245
Best combo (0.0, 0.1

# Testset results

In [289]:
df_text_test = pd.read_csv('me_2020/test_text_descriptions.csv', )

In [290]:
df_text_test.head()

Unnamed: 0,video_id,video_url,description
0,4,https://mtc.cdn.vine.co/r/videos_h264high/9D3F...,people are going downstairs on a stage
1,4,https://mtc.cdn.vine.co/r/videos_h264high/9D3F...,several people walking down a staircase on a s...
2,7,https://mtc.cdn.vine.co/r/videos_h264high/9E23...,a man is scared by a picture of a boy on a mir...
3,7,https://mtc.cdn.vine.co/r/videos_h264high/9E23...,white male taking a selfie in a bathroom mirror
4,15,https://mtc.cdn.vine.co/r/videos_h264high/A20C...,two chinese men speaks and laugh at home


In [292]:
test_text_concat = df_text_test[['video_id','description']].groupby(['video_id'])['description'].transform(lambda x: '  '.join(x)).drop_duplicates()

print(len(test_text_concat))

df_test = pd.DataFrame({"video_id": df_text_test.video_id.unique(),
                        "text": test_text_concat,})
df_test.head()

500


Unnamed: 0,video_id,text
0,4,people are going downstairs on a stage severa...
2,7,a man is scared by a picture of a boy on a mir...
4,15,two chinese men speaks and laugh at home in a...
6,41,a dog gets whipped cream into his face indoors...
8,47,a women takes a piece of cloth from a man and ...


In [293]:
len(df_test)

500

In [288]:
len(df_text_test)

1790

In [295]:
X_test_tfidf = []
X_test_w2v   = []

for i, entry in tqdm(df_test.iterrows(), total=len(df_test)):
    text = tokenize(entry['text'])
    
    x_tfidf = vectorizer.transform([text]).toarray()[0]
    words = [word for word in text.split(' ') if word in w2v]
    x_w2v = np.zeros([300]) if not words else np.mean([w2v[word] for word in words], axis=0)
    
    X_test_tfidf.append(x_tfidf)
    X_test_w2v.append(x_w2v)

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [304]:
X_train = X['w2v']
y_train = Y_st
svr_model = SVR(C=1e-05, gamma='scale', kernel='linear')
svr_model.fit(X_train, y_train)
ismail_st_test = svr_model.predict(X_test_w2v)

In [305]:
ismail_st_test.shape

(500,)

In [308]:
X_train = X['w2v']
y_train = Y_lt
svr_model_lt = SVR(C=1e-05, gamma='scale', kernel='linear')
svr_model_lt.fit(X_train, y_train)
ismail_lt_test = svr_model_lt.predict(X_test_w2v)

In [309]:
ismail_preds = pd.DataFrame({'video_id': df_text_test.video_id.unique(),
                             'short_term': ismail_st_test,
                             'long_term': ismail_lt_test})

In [310]:
ismail_preds.head()

Unnamed: 0,video_id,short_term,long_term
0,4,0.829919,0.779846
1,7,0.830095,0.780101
2,15,0.829826,0.779656
3,41,0.830275,0.780135
4,47,0.830024,0.780146


In [48]:
# ismail_preds.to_csv('me_2020/ismail_testset_preds_svr_1e-05_scale_lin.csv')
ismail_preds = pd.read_csv('me_2020/ismail_testset_preds_svr_1e-05_scale_lin.csv')

In [50]:
alison_st_test = pd.read_csv('me_2020/st_test_pred_vilbert.csv', names=['id', 'preds'])['preds'].values[1:]
alison_lt_test = pd.read_csv('me_2020/lt_test_pred_vilbert.csv', names=['id', 'preds'])['preds'].values[1:]
len(alison_st_test), len(alison_lt_test)

(500, 500)

In [329]:
jorma_st_test = jorma_st_df[1].values.tolist()[590:]
jorma_lt_test = jorma_lt_df[1].values.tolist()[590:]

In [102]:
all_test.head()

Unnamed: 0.1,Unnamed: 0,video_id,alison_st,alison_lt,ismail_st,ismail_lt,jorma_st,jorma_lt,confidence
0,0,4,0.828293,0.775897,0.829919,0.779846,0.829443,0.750538,1.0
1,1,7,0.829522,0.777468,0.830095,0.780101,0.816488,0.731488,1.0
2,2,15,0.830905,0.782404,0.829826,0.779656,0.812489,0.775952,1.0
3,3,41,0.829813,0.777769,0.830275,0.780135,0.832075,0.774321,1.0
4,4,47,0.828998,0.782688,0.830024,0.780146,0.816765,0.722495,1.0


In [338]:
all_test['confidence'] = 1.

In [339]:
# all_test.to_csv('me_2020/all_predictions_testset.csv')

In [56]:
all_train = pd.read_csv('me_2020/all_predictions_trainset.csv')
all_test = pd.read_csv('me_2020/all_predictions_testset.csv')

In [57]:
all_test.head()

Unnamed: 0.1,Unnamed: 0,video_id,alison_st,alison_lt,ismail_st,ismail_lt,jorma_st,jorma_lt,confidence
0,0,4,0.828293,0.775897,0.829919,0.779846,0.829443,0.750538,1.0
1,1,7,0.829522,0.777468,0.830095,0.780101,0.816488,0.731488,1.0
2,2,15,0.830905,0.782404,0.829826,0.779656,0.812489,0.775952,1.0
3,3,41,0.829813,0.777769,0.830275,0.780135,0.832075,0.774321,1.0
4,4,47,0.828998,0.782688,0.830024,0.780146,0.816765,0.722495,1.0


In [58]:
all_test[['alison_st', 'ismail_st', 'jorma_st']].describe()

Unnamed: 0,alison_st,ismail_st,jorma_st
count,500.0,500.0,500.0
mean,0.829356,0.829968,0.836334
std,0.001511,0.000196,0.038389
min,0.825885,0.829234,0.676143
25%,0.828123,0.829872,0.813927
50%,0.829482,0.82998,0.8395
75%,0.830617,0.830091,0.862124
max,0.832782,0.830522,0.948258


In [91]:
x = jlt_scaler.transform(all_test['jorma_st'].values.reshape(-1, 1)).reshape(500,)
x.shape

(500,)

In [123]:
all_test['alison_st'].values[:10]

array([0.82829345, 0.82952207, 0.83090526, 0.82981295, 0.82899799,
       0.82786299, 0.82796961, 0.82849147, 0.8295096 , 0.82921436])

In [124]:
ast_scaler.transform(all_test['alison_st'].values.reshape(-1, 1)).reshape(500,)[:10]

array([-0.85288662,  0.08311114,  1.13687325,  0.30471364, -0.31614122,
       -1.18082536, -1.09959335, -0.7020289 ,  0.07361582, -0.15130871])

In [131]:
all_test_std = pd.DataFrame({'video_id': all_test['video_id'].values,
                             'alison_st': ast_scaler.transform(all_test['alison_st'].values.reshape(-1, 1)).reshape(500,),
                             'alison_lt': alt_scaler.transform(all_test['alison_lt'].values.reshape(-1, 1)).reshape(500,),
                             'ismail_st': ist_scaler.transform(all_test['ismail_st'].values.reshape(-1, 1)).reshape(500,),
                             'ismail_lt': ilt_scaler.transform(all_test['ismail_lt'].values.reshape(-1, 1)).reshape(500,),
                             'jorma_st': jst_scaler.transform(all_test['jorma_st'].values.reshape(-1, 1)).reshape(500,),
                             'jorma_lt': jlt_scaler.transform(all_test['jorma_lt'].values.reshape(-1, 1)).reshape(500,),
                            })

In [132]:
all_test_std.head(20)

Unnamed: 0,video_id,alison_st,alison_lt,ismail_st,ismail_lt,jorma_st,jorma_lt
0,4,0.383458,0.272435,0.409486,0.537078,0.593201,0.397143
1,7,0.563437,0.371407,0.559014,0.707226,0.543162,0.325171
2,15,0.76606,0.682299,0.329921,0.410464,0.527715,0.493156
3,41,0.606048,0.390359,0.71206,0.730097,0.603371,0.486996
4,47,0.486667,0.700191,0.498399,0.737386,0.544231,0.291194
5,63,0.3204,0.184297,0.282404,0.504865,0.556138,0.4806
6,67,0.33602,0.298659,0.498977,0.558013,0.716893,0.602573
7,84,0.412466,0.493821,0.538602,0.729125,0.53328,0.441799
8,89,0.561611,0.320613,0.464004,0.708996,0.724692,0.488793
9,100,0.518362,0.304305,0.326833,0.507233,0.74191,0.628844


In [133]:
all_test_std[['alison_st', 'alison_lt', 'ismail_st', 'ismail_lt', 'jorma_st', 'jorma_lt']].describe()

Unnamed: 0,alison_st,alison_lt,ismail_st,ismail_lt,jorma_st,jorma_lt
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.539118,0.517795,0.451211,0.568938,0.619819,0.48232
std,0.221405,0.194048,0.166582,0.160481,0.148281,0.153383
min,0.030583,0.169402,-0.17342,-0.076967,0.001066,-0.051344
25%,0.358507,0.37068,0.369494,0.512454,0.53327,0.382342
50%,0.557606,0.48738,0.461313,0.60192,0.63205,0.482947
75%,0.723819,0.653656,0.555243,0.667008,0.719435,0.590275
max,1.040987,1.048169,0.922267,0.924241,1.052138,0.939776


In [134]:
new_df_train = pd.DataFrame({'alison_st': ast.reshape(590,), 
                             'alison_lt': alt.reshape(590,), 
                             'ismail_st': ist.reshape(590,), 
                             'ismail_lt': ilt.reshape(590,), 
                             'jorma_st': jst.reshape(590,), 
                             'jorma_lt': jlt.reshape(590,)}, index=range(590))

In [135]:
new_df_train.describe()

Unnamed: 0,alison_st,alison_lt,ismail_st,ismail_lt,jorma_st,jorma_lt
count,590.0,590.0,590.0,590.0,590.0,590.0
mean,0.547456,0.467092,0.494136,0.584043,0.614335,0.465426
std,0.192449,0.198578,0.164735,0.142067,0.14656,0.145031
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.410402,0.310987,0.395368,0.517451,0.527969,0.370498
50%,0.551185,0.441225,0.484876,0.59811,0.619112,0.46339
75%,0.692146,0.612593,0.59496,0.668851,0.718559,0.566693
max,1.0,1.0,1.0,1.0,1.0,1.0


In [136]:
new_df_train.head()

Unnamed: 0,alison_st,alison_lt,ismail_st,ismail_lt,jorma_st,jorma_lt
0,0.484128,0.257557,0.366443,0.505485,0.572011,0.347173
1,0.426101,0.31147,0.43344,0.649638,0.596026,0.507453
2,0.407488,0.193549,0.193398,0.204572,0.303779,0.338558
3,0.396339,0.282407,0.411729,0.561456,0.463517,0.410164
4,0.503129,0.249826,0.433398,0.65686,0.527815,0.366396


In [137]:
all_test_std.head()

Unnamed: 0,video_id,alison_st,alison_lt,ismail_st,ismail_lt,jorma_st,jorma_lt
0,4,0.383458,0.272435,0.409486,0.537078,0.593201,0.397143
1,7,0.563437,0.371407,0.559014,0.707226,0.543162,0.325171
2,15,0.76606,0.682299,0.329921,0.410464,0.527715,0.493156
3,41,0.606048,0.390359,0.71206,0.730097,0.603371,0.486996
4,47,0.486667,0.700191,0.498399,0.737386,0.544231,0.291194


In [173]:
all_test['run_1_st'] = all_test['jorma_st']
all_test[['video_id', 'run_1_st', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_shorterm_run1-audiovisual.csv', header=False,index=False)
all_test['run_1_lt'] = all_test['jorma_st']
all_test[['video_id', 'run_1_lt', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_longterm_run1-audiovisual.csv', header=False,index=False)

In [174]:
all_test['run_2_st'] = all_test['alison_st']
all_test[['video_id', 'run_2_st', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_shorterm_run2-vilbert.csv', header=False,index=False)
all_test['run_2_lt'] = all_test['alison_st']
all_test[['video_id', 'run_2_lt', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_longterm_run2-vilbert.csv', header=False,index=False)

In [175]:
all_test['run_3_st'] = all_test['ismail_st']
all_test[['video_id', 'run_3_st', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_shorterm_run3-text.csv', header=False,index=False)
all_test['run_3_lt'] = all_test['ismail_st']
all_test[['video_id', 'run_3_lt', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_longterm_run3-text.csv', header=False,index=False)

In [176]:
all_test['run_4_st'] = 0.15 * all_test['alison_st'] + 0.3 * all_test['ismail_st'] + 0.55 * all_test['jorma_st']
all_test[['video_id', 'run_4_st', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_shorterm_run4-all-SLT.csv', header=False,index=False)
all_test['run_4_lt'] = 0.0 * all_test['alison_st'] + 0.4 * all_test['ismail_st'] + 0.6 * all_test['jorma_st']
all_test[['video_id', 'run_4_lt', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_longterm_run4-all-SLT.csv', header=False,index=False)

In [178]:
all_test['run_5_st'] = 0.15 * all_test['alison_st'] + 0.3 * all_test['ismail_st'] + 0.55 * all_test['jorma_st']
all_test[['video_id', 'run_5_st', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_shorterm_run5-all.csv', header=False,index=False)
all_test['run_5_lt'] = 0.0 * all_test['alison_lt'] + 0.15 * all_test['ismail_lt'] + 0.85 * all_test['jorma_lt']
all_test[['video_id', 'run_5_lt', 'confidence']].to_csv('me_2020/submissions/me20mem_memad_longterm_run5-all.csv', header=False,index=False)

In [170]:
spearman(0.0 * ast + 0.4 * ist + 0.6 * jst, combined_df['gt_lt'])

0.20383164966796977