In [1]:
#Importing all Necessary libraries and packages

import nltk
import string
import json
import pandas
import numpy as np

from nltk import word_tokenize
from nltk.corpus import stopwords
from unidecode import unidecode
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
"""
Function pre_process()
Parameters- 
    1. corpus: Data that needs to be preprocessed

Preprocessing data to convert all the sentences into lower case, to remove the stop words, non-ascii characters
and punctuation
"""

def pre_process(corpus):
    # convert input corpus to lower case.
    corpus = corpus.lower()
    # collecting a list of stop words from nltk and punctuation form
    # string class and create single array.
    stopset = stopwords.words('english') + list(string.punctuation)
    # remove stop words and punctuations from string.
    # word_tokenize is used to tokenize the input corpus in word tokens.
    corpus = " ".join([i for i in word_tokenize(corpus) if i not in stopset])
    # remove non-ascii characters
    corpus = unidecode(corpus)
    return corpus

In [3]:
#Lemmatizing the data input

lemmatizer = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
  if nltk_tag.startswith('J'):
    return wordnet.ADJ
  elif nltk_tag.startswith('V'):
    return wordnet.VERB
  elif nltk_tag.startswith('N'):
    return wordnet.NOUN
  elif nltk_tag.startswith('R'):
    return wordnet.ADV
  else:                    
    return None

def lemmatize_sentence(sentence):
  nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))    
  wn_tagged = map(lambda x: (x[0], nltk2wn_tag(x[1])), nltk_tagged)

  res_words = []
  for word, tag in wn_tagged:
    if tag is None:                        
      res_words.append(word)
    else:
      res_words.append(lemmatizer.lemmatize(word, tag))

  return " ".join(res_words)

In [4]:
#Import the data file
with open("TestDf.json", 'r') as f:
    datastore = json.load(f)
    
corpus = []
video_id = []

#Collection of all the captions
for item in datastore:
    corpus.append(item['caption'])

In [5]:
# sentence pair

#for c in range(len(corpus)):
#    corpus[c] = pre_process(corpus[c])
#    corpus[c] = lemmatize_sentence(corpus[c])
#    print(corpus[c])

In [5]:
# creating vocabulary using uni-gram and bi-gram
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf_vectorizer.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [6]:
# Importing the two csv files as dataframes (the original and the modified one)
df1 = pandas.read_csv('LongestDf.csv')
df2 = pandas.read_csv('Frames_caption.csv')

In [7]:
df1 = df1.rename(columns = {"video_id": "vid1"}) 
df1 = df1.rename(columns = {"caption": "cap1"}) 

df2 = df2.rename(columns = {"VideoID": "vid2"}) 
df2 = df2.rename(columns = {"Caption": "cap2"}) 

In [8]:
print(df1.head())

                  vid1                                               cap1
0    mv89psg6zh4_33_46  A bird is standing in a sink drinking water th...
1     ZbzDGXEwtGc_6_15  A clip showing an south african airline plane ...
2     g36ho6UrBz0_5_20  A man playing his electric guitar at a home pa...
3  04Gt01vatkk_248_265  A woman is chopping an onion slice into fine p...
4  buJ5HDCinrM_150_166  A woman applies a concealer to the lower porti...


In [9]:
print(df2.head())

                vid2  FStart  FEnd  \
0  -_aaMGK6GGw_57_61       0     3   
1  -_aaMGK6GGw_57_61       3     4   
2    -_hbPLsZvvo_5_8       0     1   
3    -_hbPLsZvvo_5_8       1     3   
4  -_hbPLsZvvo_18_25       0     5   

                                                cap2  Video_Duration  COS  
0  A man grabbed a young boy from his collar and ...               4    2  
1  A woman is waking up a man sleeping on a chair...               4    2  
2  A dog is barking and cooking with dog is written.               3    2  
3  A lady is standing in a kitchen with a dog bes...               3    2  
4  A person cuts a zucchini into small pieces and...               7    2  


In [10]:
#Using the Left-Join function to merge the two dataframes to create a new dataframe named 'merged_left'

merged = pandas.merge(left=df2, right=df1, how='left', left_on='vid2', right_on='vid1')
merged

Unnamed: 0,vid2,FStart,FEnd,cap2,Video_Duration,COS,vid1,cap1
0,-_aaMGK6GGw_57_61,0,3,A man grabbed a young boy from his collar and ...,4,2,-_aaMGK6GGw_57_61,A man and woman are yelling at a young boy and...
1,-_aaMGK6GGw_57_61,3,4,A woman is waking up a man sleeping on a chair...,4,2,-_aaMGK6GGw_57_61,A man and woman are yelling at a young boy and...
2,-_hbPLsZvvo_5_8,0,1,A dog is barking and cooking with dog is written.,3,2,-_hbPLsZvvo_5_8,a dog barking and cooking with her master in t...
3,-_hbPLsZvvo_5_8,1,3,A lady is standing in a kitchen with a dog bes...,3,2,-_hbPLsZvvo_5_8,a dog barking and cooking with her master in t...
4,-_hbPLsZvvo_18_25,0,5,A person cuts a zucchini into small pieces and...,7,2,-_hbPLsZvvo_18_25,A woman cuts a piece of zucchini in half and t...
...,...,...,...,...,...,...,...,...
379,6Lzq1HN33lE_5_15,0,10,A man is playing flute.,10,1,6Lzq1HN33lE_5_15,A man is playing the title song of 'The Titani...
380,6mYnZbIwcNo_10_16,0,6,A man is typing using an external keyboard att...,6,1,6mYnZbIwcNo_10_16,Someone types on a black keyboard placed in fr...
381,6njscWrdnM0_8_18,0,5,A man practicing kicks on a toy torso dummy fa...,10,2,6njscWrdnM0_8_18,Each time a man karate kicks a mannequin he fa...
382,6njscWrdnM0_8_18,5,10,The man is punching the dummy.,10,2,6njscWrdnM0_8_18,Each time a man karate kicks a mannequin he fa...


In [48]:
#To check if there is any row 

merged[ pandas.isnull(merged.vid1)]

Unnamed: 0,vid2,FStart,FEnd,cap2,Video_Duration,COS,vid1,cap1


In [11]:
def get_cosine_similarity(feature_vec_1, feature_vec_2):
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]

In [12]:
merged["fsim"] = 0
for i, row in merged.iterrows():
    captions = []
    captions.append(row['cap1'])
    captions.append(row['cap2'])
    
    for c in range(len(captions)):
        captions[c] = pre_process(captions[c])
        captions[c] = lemmatize_sentence(captions[c])
    
    feature_vectors = tfidf_vectorizer.transform(captions)

    fsims = get_cosine_similarity(feature_vectors[0], feature_vectors[1])
    
    merged['FWeight'] = (merged['FEnd'] - merged['FStart'])/merged['Video_Duration']
    merged['fsim'].iloc[i] = fsims

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

In [13]:
#print(merged[370:])
df = merged[['vid2', 'COS', 'fsim', 'FWeight']]

print(df)

                    vid2  COS      fsim   FWeight
0      -_aaMGK6GGw_57_61    2  0.253792  0.750000
1      -_aaMGK6GGw_57_61    2  0.192565  0.250000
2        -_hbPLsZvvo_5_8    2  0.562707  0.333333
3        -_hbPLsZvvo_5_8    2  0.179969  0.666667
4      -_hbPLsZvvo_18_25    2  0.275962  0.714286
..                   ...  ...       ...       ...
379     6Lzq1HN33lE_5_15    1  0.363582  1.000000
380    6mYnZbIwcNo_10_16    1  0.365895  1.000000
381     6njscWrdnM0_8_18    2  0.187248  0.500000
382     6njscWrdnM0_8_18    2  0.234586  0.500000
383  6owu8Mow0_g_275_280    1  0.054464  1.000000

[384 rows x 4 columns]


In [15]:
df2 = df.groupby('vid2')
similarity=[]

for group in df2:
    similarity.append(np.sum(group[0]['fsim']*group[1]['FWeight'])/ np.sum(group[0]['FWeight']))

TypeError: string indices must be integers

In [73]:
similarity

[0.25367833660690065,
 0.23848527400984962,
 0.1552365504098244,
 0.19095812593746767,
 0.08536556078423259,
 0.2686418774855528,
 0.07459927353355118,
 0.12411623366167682,
 0.04686713609029747,
 0.2310768921727494,
 0.0815634429516726,
 0.3075485857496027,
 0.0,
 0.3969145681286697,
 0.05205375983965845,
 0.0,
 0.11233857185897762,
 0.050873340417418526,
 0.2979650330066005,
 0.109913255987297,
 0.0,
 0.12691791988607173,
 0.0,
 0.11161322231806252,
 0.10848830178041337,
 0.09476925250715658,
 0.01805436778379081,
 0.045253259953134416,
 0.23395563400156777,
 0.03557984208083792,
 0.48826596395432864,
 0.25450915412991987,
 0.16171646623434255,
 0.011959947277015817,
 0.07694396326654541,
 0.14620979973435544,
 0.10818518553416015,
 0.061655129406735315,
 0.0,
 0.009582896073745684,
 0.24131619886213532,
 0.0,
 0.07009330349581189,
 0.08773591319248965,
 0.15955502909167135,
 0.5202534284386159,
 0.10069341827671938,
 0.23548987537388122,
 0.05811976851891663,
 0.0,
 0.0,
 0.20184949

In [16]:
vems_score = 0.0

video_scores = []

for i, row in merged.iterrows():
    vid_score = 0.0
    total_weight = 0.0

    for j in range(row['COS']):
        total_weight = total_weight + row['FWeight']
        vid_score = vid_score + (row['FWeight'] * row['fsim'])
    
    i = i + row['COS']
    
    vid_score = vid_score/total_weight
    video_scores.append(vid_score)
    
print(video_scores[:3])

[0.2537920175770068, 0.19256504330837795, 0.5627073057701637]
