# Importing libraries

In [2]:
import pandas as pd
import nltk
import numpy as np
import joblib

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
''''from nltk.stem.lancaster import LancasterStemmer'''
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
''''from sklearn.feature_extraction.text import CountVectorizer'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt

In [25]:
from numpy.random import seed
import tensorflow as tf

# Resolve TensorFlow's global seeding function across major versions:
# TF 2.x exposes it as tf.random.set_seed, whereas TF 1.x only offers the
# top-level tf.set_random_seed (importing that name directly fails on TF 2.x).
try:
    set_random_seed = tf.random.set_seed
except AttributeError:
    set_random_seed = tf.set_random_seed

# Pin both NumPy's and TensorFlow's global seeds to the same fixed value.
seed(1)
set_random_seed(1)

# Reading the ground-truth template and the caption dataset for the test set

In [26]:
# Load the two test-set inputs from the working directory:
# the ground-truth template and the per-video captions.
grouth_truth_temp = pd.read_csv(filepath_or_buffer='ground_truth_template.csv')
read_caption = pd.read_csv(filepath_or_buffer='test_set_video-captions.csv')


# Merging both datasets

In [27]:
# Attach each caption to its template row through the shared 'video' column.
# No `how` is given, so pandas' default (inner) join is used.
test_ground_truth_caption = grouth_truth_temp.merge(read_caption, on='video')
test_ground_truth_caption.head(10)

Unnamed: 0,video,short-term_memorability,nb_short-term_annotations,long-term_memorability,nb_long-term_annotations,captions
0,7494,,33,,12,green-jeep-struggling-to-drive-over-huge-rocks
1,7495,,34,,10,hiking-woman-tourist-is-walking-forward-in-mou...
2,7496,,32,,13,close-up-of-african-american-doctors-hands-usi...
3,7497,,33,,10,slow-motion-of-a-man-using-treadmill-in-the-gy...
4,7498,,33,,10,slow-motion-of-photographer-in-national-park
5,7499,,33,,13,group-of-mixed-race-american-patriotic-peoples...
6,7500,,33,,10,business-people-train-and-draw-diagrams-on-boa...
7,7502,,43,,15,father-and-daughters-smiling
8,7503,,33,,11,mechanic-using-a-rotary-polisher-on-a-the-pain...
9,7504,,33,,15,young-couple-having-a-conversation-in-hotel-co...


# Function for part-of-speech tagging

In [28]:
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the tagger's verdict for `word`.

    The word is tagged on its own (as a one-element sentence) and only the
    first letter of the resulting tag is inspected: J -> adjective,
    N -> noun, V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    _, full_tag = nltk.pos_tag([word])[0]
    initial = full_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun.
    return wordnet.NOUN

foot


# Cleaning captions through tokenization, stop-word removal, and POS-aware lemmatization

In [30]:
# The English stop-word set never changes between rows, so build it once
# instead of re-reading the corpus for every caption.
stopWords = set(stopwords.words('english'))


def clean_caption(caption):
    """Turn one hyphen-separated caption into a space-separated lemma string.

    Steps: split on '-', drop empty tokens and English stop words, then
    lemmatize each remaining token with the POS returned by
    get_wordnet_pos().
    """
    kept_tokens = [tok for tok in caption.split('-') if tok and tok not in stopWords]
    return ' '.join(
        lemmatizer.lemmatize(tok, get_wordnet_pos(tok)) for tok in kept_tokens
    )


cleaned_test_caption = [
    clean_caption(caption) for caption in test_ground_truth_caption['captions']
]

# Assign the list positionally; this does not rely on the frame's index
# labels lining up with a freshly built 0..n-1 Series.
test_ground_truth_caption['cleaned_test_caption'] = cleaned_test_caption
test_ground_truth_caption.head(10)

Unnamed: 0,video,short-term_memorability,nb_short-term_annotations,long-term_memorability,nb_long-term_annotations,captions,cleaned_test_caption
0,7494,,33,,12,green-jeep-struggling-to-drive-over-huge-rocks,green jeep struggle drive huge rock
1,7495,,34,,10,hiking-woman-tourist-is-walking-forward-in-mou...,hike woman tourist walk forward mountain sunse...
2,7496,,32,,13,close-up-of-african-american-doctors-hands-usi...,close african american doctor hand use sphygmo...
3,7497,,33,,10,slow-motion-of-a-man-using-treadmill-in-the-gy...,slow motion man use treadmill gym regular phys...
4,7498,,33,,10,slow-motion-of-photographer-in-national-park,slow motion photographer national park
5,7499,,33,,13,group-of-mixed-race-american-patriotic-peoples...,group mixed race american patriotic people ame...
6,7500,,33,,10,business-people-train-and-draw-diagrams-on-boa...,business people train draw diagram board learn...
7,7502,,43,,15,father-and-daughters-smiling,father daughter smile
8,7503,,33,,11,mechanic-using-a-rotary-polisher-on-a-the-pain...,mechanic use rotary polisher paintwork black c...
9,7504,,33,,15,young-couple-having-a-conversation-in-hotel-co...,young couple conversation hotel corridor


# Applying TF-IDF Vectorizer

In [31]:
# NOTE(review): this fits a brand-new TF-IDF vocabulary on the *test* captions.
# The models loaded later were presumably trained on features produced by a
# vectorizer fitted on the training captions; if so, column i here need not
# correspond to the same term. Confirm, and load/transform with the training
# vectorizer instead if one was persisted.
vectorizer = TfidfVectorizer(max_features=1000)
response = vectorizer.fit_transform(
    test_ground_truth_caption.cleaned_test_caption
).toarray()

# vocabulary_ maps every retained term to its column, so its length is the
# feature count. It replaces get_feature_names(), which was removed in
# scikit-learn 1.2, and it is available in older releases as well.
print(len(vectorizer.vocabulary_))
print(response)

1000
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Merging the vectorized captions into the ground-truth frame

In [32]:
# Store each caption's TF-IDF row as a plain Python list in a single column.
vectorized_rows = pd.Series(response.tolist())
test_ground_truth_caption['Vectorized_Caption'] = vectorized_rows
print(test_ground_truth_caption.head(10))

   video  short-term_memorability  nb_short-term_annotations  \
0   7494                      NaN                         33   
1   7495                      NaN                         34   
2   7496                      NaN                         32   
3   7497                      NaN                         33   
4   7498                      NaN                         33   
5   7499                      NaN                         33   
6   7500                      NaN                         33   
7   7502                      NaN                         43   
8   7503                      NaN                         33   
9   7504                      NaN                         33   

   long-term_memorability  nb_long-term_annotations  \
0                     NaN                        12   
1                     NaN                        10   
2                     NaN                        13   
3                     NaN                        10   
4                   

# Assigning X for short-term memorability

In [38]:
# Expand the per-row TF-IDF lists back into one numeric column per feature,
# then append the short-term annotation count as the last feature column.
caption_vectors = test_ground_truth_caption['Vectorized_Caption'].tolist()
X_short = pd.DataFrame(caption_vectors)
X_short['nb_short-term_annotations'] = test_ground_truth_caption['nb_short-term_annotations']
print(X_short.head(10))


     0         1    2    3    4    5    6    7    8         9  \
0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
1  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
2  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.378263   
3  0.0  0.407439  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
4  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
5  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
6  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
7  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
8  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
9  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   

             ...              991  992  993  994  995      996  997  998  999  \
0            ...              0.0  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0   
1            ...              0.0  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0   
2            ...              0.0  0.0  0

# Loading Trained Model for short-term memorability

In [39]:
# Restore the previously trained short-term model from disk.
# joblib.load unpickles the file, which can execute arbitrary code, so only
# point this at model files from a trusted source.
xgb_short=joblib.load('Model/xgb_short.xgb')

# Predicting Short-term Memorability for the test data

In [40]:
# Score every test row with the loaded short-term model and write the
# predictions into the 'short-term_memorability' column.
y_pred_short = xgb_short.predict(X_short)
short_term_scores = pd.Series(y_pred_short)
test_ground_truth_caption['short-term_memorability'] = short_term_scores
print(test_ground_truth_caption.head(10))

   video  short-term_memorability  nb_short-term_annotations  \
0   7494                 0.898937                         33   
1   7495                 0.904481                         34   
2   7496                 0.889021                         32   
3   7497                 0.822171                         33   
4   7498                 0.862761                         33   
5   7499                 0.904292                         33   
6   7500                 0.862761                         33   
7   7502                 0.837407                         43   
8   7503                 0.845138                         33   
9   7504                 0.840225                         33   

   long-term_memorability  nb_long-term_annotations  \
0                     NaN                        12   
1                     NaN                        10   
2                     NaN                        13   
3                     NaN                        10   
4                   

# Assigning X for long-term memorability

In [47]:
# Expand the per-row TF-IDF lists back into one numeric column per feature,
# then append the long-term annotation count as the last feature column.
caption_vectors = test_ground_truth_caption['Vectorized_Caption'].tolist()
X_long = pd.DataFrame(caption_vectors)
X_long['nb_long-term_annotations'] = test_ground_truth_caption['nb_long-term_annotations']
print(X_long.head(10))


     0         1    2    3    4    5    6    7    8         9  \
0  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
1  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
2  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.378263   
3  0.0  0.407439  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
4  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
5  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
6  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
7  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
8  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   
9  0.0  0.000000  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000   

             ...             991  992  993  994  995      996  997  998  999  \
0            ...             0.0  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0   
1            ...             0.0  0.0  0.0  0.0  0.0  0.00000  0.0  0.0  0.0   
2            ...             0.0  0.0  0.0  

# Loading the long-term memorability model and predicting long-term memorability for the test data

In [48]:
# Restore the trained long-term model, score every test row with it, and
# write the predictions into the 'long-term_memorability' column.
xgb_long = joblib.load('Model/xgb_long.xgb')
y_pred_long = xgb_long.predict(X_long)
long_term_scores = pd.Series(y_pred_long)
test_ground_truth_caption['long-term_memorability'] = long_term_scores
print(test_ground_truth_caption.head(10))


   video  short-term_memorability  nb_short-term_annotations  \
0   7494                 0.898937                         33   
1   7495                 0.904481                         34   
2   7496                 0.889021                         32   
3   7497                 0.822171                         33   
4   7498                 0.862761                         33   
5   7499                 0.904292                         33   
6   7500                 0.862761                         33   
7   7502                 0.837407                         43   
8   7503                 0.845138                         33   
9   7504                 0.840225                         33   

   long-term_memorability  nb_long-term_annotations  \
0                0.776987                        12   
1                0.812661                        10   
2                0.796576                        13   
3                0.775866                        10   
4                0.7

# Converting test_ground_truth_caption to CSV file

In [49]:
# Persist the full frame, including both prediction columns, as CSV.
# to_csv writes the row index as an unnamed first column by default.
# NOTE(review): the helper columns added in this notebook
# ('cleaned_test_caption', 'Vectorized_Caption') are exported too, and the
# file name says "Predication" rather than "Prediction" - confirm both are intended.
test_ground_truth_caption.to_csv('Memorability_Predication.csv')