### 大作业：二分类问题
#### 问题描述：
给定video 和 sentence，判断二者的相关度
$$
f(v, s)=\begin{cases}
relevant,&res>0 \\ irrelevant,&otherwise\cr
\end{cases}
$$
#### 问题思路：
找到一个公共的空间将二者联系起来<br>
__从sentence中提取keywords，依据名词，动词等形式分类，产生正负样本__
#### 定义
$V={w_0,w_1,...,w_{m-1}}$是自定义的一个字典

对于一个视频样本$x$，$bow(x,i)$表示的是词$w_i$与该视频内容的相关度，$i=0,1,2,...,m-1$

对于一个句子$s$，根据字典可以得到它的$bow(s)$

视频与句子的相似度可通过计算对应的$bow$向量，即$bow(x)$和$bow(s)$，再由欧氏距离或余弦距离得到

给定文档msrvtt10k.parser.txt和作业四中的msrvtt10k.caption.txt基本一致，不同的是video对应的句子里所有的单词都标注了词性。

__因此在作业四中的一些处理也可以用于该问题(如计算video的特征向量)__

In [5]:
msrvtt10k = "E:\\msrvtt10k\\TextData\\msrvtt10k.caption.txt"
msrvtt10k_feature = "E:\\msrvtt10k\\FeatureData\\resnext101"
tv2016train_A = "E:\\tv2016train\\TextData\\tv2016train.setA.txt"
tv2016train_B = "E:\\tv2016train\\TextData\\tv2016train.setB.txt"
tv2016train_feature = "E:\\tv2016train\\FeatureData\\resnext101"
tv2016test_A = "E:\\tv2016test\\TextData\\tv2016test.setA.txt"
tv2016test_B = "E:\\tv2016test\\TextData\\tv2016test.setB.txt"
tv2016test_feature = "E:\\tv2016test\\FeatureData\\resnext101"

In [10]:
import re
from sklearn.feature_extraction.text import CountVectorizer

#sklearn 自带的词频统计工具countVectorizer
vectorizer = CountVectorizer()
corpus = []
a = []
#将txt中每行句子做分隔，只保留第一个空格的后半部分
def pick_key(x):
    tmp = x.split(" ",1)
    corpus.append(tmp[1])
    #a =[i[:i.find(":")] for i in tmp[1].split()]
    #list = [' '.join(a)]
    #print(list)
    #corpus.append(list[0])
    
with open(msrvtt10k ) as p1:
    for caption in p1.readlines():
        caption = caption.strip('\n')
        pick_key(caption)
#print(corpus)

In [3]:
X = vectorizer.fit_transform(corpus)

In [4]:
vocal = vectorizer.vocabulary_

In [5]:
def count_word(vocal, corpus):
    des = vocal
    for key in des:
        des[key] = 0
    for sentence in corpus:
        for word in sentence.split():
            if word in des:
                des[word] += 1
    return des

#得到初始字典V1
V1 = count_word(vocal, corpus)
#print(V1)
#去除停用词
from nltk.corpus import stopwords
filter_V = [word for word in V1 if word not in stopwords.words('english')]
#print(filter_V)

In [6]:
for key in list(V1.keys()):
    if key in filter_V:
        continue
    else:
        del V1[key]
#print(V1)
#筛选频数在前5000的，得到最终字典V
V={}
ls = sorted(V1.items(), key=lambda d: d[1],reverse=True)
V = dict(ls[:5000])
#print(V)

### 字典建立完毕

接下来使用SVM进行模型训练和预测

需要的输入有 X, y

__X为样本，y为对应的label__

在这个问题里，__X具体对应2048维的特征向量__，而y 则是每个视频的句子相对于字典的bow向量。<br>给出的集合里，一个video对应一个句子，所以__直接求句子的bow(s)即可。__

In [7]:
#找label
import numpy as np
from sklearn import svm

train_len = 200
test_len = 1915
feature_size = 2048

#使用一个空的list sen_tmp， 用来存储每个句子的单词
sen_tmp = []
#label_y用来存储每个句子中是否出现字典中的词，出现为1，不出现为0
label_y = np.zeros((train_len, len(V)),dtype=np.int)

tmp_y = []
def select_key(x):
    tmp = x.split(" ",1)
    tmp_y.append(tmp[1])
    #a =[i[:i.find(":")] for i in tmp[1].split()]
    #print(a)
    #list = [' '.join(a)]
    #print(list)
    #tmp_y.append(list[0])
    
with open(tv2016train_A ) as p1:
    for i in p1.readlines():
        i = i.strip('\n')
        select_key(i)
        
dict_keys = list(V.keys())
#存字典中的key元素，为下面的匹配做准备
#print(dict_keys)
print(len(tmp_y))

200


In [8]:
#对每个video找到其对应的标注：判断是否有字典V中的Key出现
for i in range(len(tmp_y)):
    sen_tmp = tmp_y[i].split()
    #print(sen_tmp)
    for j in range(0,len(V)):
        if dict_keys[j] in sen_tmp:   
            #print(dict_keys[j])
            label_y[i][j] = 1
            #句子有词出现在字典中
        else:
            label_y[i][j] = 0

In [9]:
#找fit X 
#维数应该是200*2048
#创建一个空的二维list，用来存储视频的特征向量(2048维)

feature_train = np.zeros((train_len, feature_size), dtype=np.double)
feature_test = np.zeros((test_len, feature_size), dtype=np.double)
# (1915, 2048)

video_train_id = []
video_test_id = []

for i in range(1, train_len+1):
    video_train_id.append(tv2016train_A+str(i)) 
for i in range(1, test_len+1):
    video_test_id.append(tv2016test_A+str(i)) 

In [10]:
# 从Bigfile.py 直接复制而来
# 用于计算train 和 test集里各个video 的2048维特征向量

import os, sys, array

class BigFile:

    def __init__(self, datadir):
        self.nr_of_images, self.ndims = map(int, open(os.path.join(datadir,'shape.txt')).readline().split())
        id_file = os.path.join(datadir, "id.txt")
        self.names = open(id_file).read().strip().split()
        assert(len(self.names) == self.nr_of_images)
        self.name2index = dict(zip(self.names, range(self.nr_of_images)))
        self.binary_file = os.path.join(datadir, "feature.bin")
        print ("[%s] %dx%d instances loaded from %s" % (self.__class__.__name__, self.nr_of_images, self.ndims, datadir))


    def read(self, requested, isname=True):
        requested = set(requested)
        if isname:
            index_name_array = [(self.name2index[x], x) for x in requested if x in self.name2index]
        else:
            assert(min(requested)>=0)
            assert(max(requested)<len(self.names))
            index_name_array = [(x, self.names[x]) for x in requested]
        if len(index_name_array) == 0:
            return [], []
       
        index_name_array.sort(key=lambda v:v[0])
        sorted_index = [x[0] for x in index_name_array]

        nr_of_images = len(index_name_array)
        vecs = [None] * nr_of_images
        offset = np.float32(1).nbytes * self.ndims
        
        res = array.array('f')
        fr = open(self.binary_file, 'rb')
        fr.seek(index_name_array[0][0] * offset)
        res.fromfile(fr, self.ndims)
        previous = index_name_array[0][0]
 
        for next in sorted_index[1:]:
            move = (next-1-previous) * offset
            #print next, move
            fr.seek(move, 1)
            res.fromfile(fr, self.ndims)
            previous = next

        fr.close()

        return [x[1] for x in index_name_array], [ res[i*self.ndims:(i+1)*self.ndims].tolist() for i in range(nr_of_images) ]


    def read_one(self, name):
        renamed, vectors = self.read([name])
        return vectors[0]    

    def shape(self):
        return [self.nr_of_images, self.ndims]


class StreamFile:

    def __init__(self, datadir):
        self.feat_dir = datadir
        self.nr_of_images, self.ndims = map(int, open(os.path.join(datadir,'shape.txt')).readline().split())
        id_file = os.path.join(datadir, "id.txt")
        self.names = open(id_file).read().strip().split()
        assert(len(self.names) == self.nr_of_images)
        self.name2index = dict(zip(self.names, range(self.nr_of_images)))
        self.binary_file = os.path.join(datadir, "feature.bin")
        print ("[%s] %dx%d instances loaded from %s" % (self.__class__.__name__, self.nr_of_images, self.ndims, datadir))
        self.fr = None
        self.current = 0
    
    def open(self):
        self.fr = open(os.path.join(self.feat_dir,'feature.bin'), 'rb')
        self.current = 0

    def close(self):
        if self.fr:
            self.fr.close()
            self.fr = None
        
    def __iter__(self):
        return self
        
    def next(self):
        if self.current >= self.nr_of_images:
            self.close()
            raise StopIteration
        else:
            res = array.array('f')
            res.fromfile(self.fr, self.ndims)
            _id = self.names[self.current]
            self.current += 1
            return _id, res.tolist() 

In [11]:
if __name__ == '__main__':
    bigfile = BigFile(tv2016train_feature)
    for i in range(0, train_len):
        imset = [video_train_id[i]]
        renamed, vectors = bigfile.read(imset)
        j = 0
        for name,vec in zip(renamed, vectors):
            feature_train[i] = vec
            j = j+1
    #print(feature_train)
    #训练集的特征向量
    
    bigfile = BigFile(tv2016test_feature)
    for i in range(0, test_len):
        imset = [video_test_id[i]]
        renamed, vectors = bigfile.read(imset)
        j = 0
        for name,vec in zip(renamed, vectors):
            feature_test[i] = vec
            j = j+1
   #print(feature_test)
  # 测试集的特征向量

[BigFile] 200x2048 instances loaded from E:\tv2016train\FeatureData\resnext101
[BigFile] 1915x2048 instances loaded from E:\tv2016test\FeatureData\resnext101


In [12]:
 from sklearn import svm
train_x = np.array(feature_train) #放入所有样本
train_y = np.zeros((train_len, len(V)) , dtype=np.int)

test_x = np.array(feature_test)
# (1915, 2048)
test_y = np.zeros((test_len,len(V)), dtype=np.int)
predict_y = np.zeros((len(V),test_len), dtype=np.int)


train_y = label_y.T
print(train_x.shape)
print(train_y.shape)
print(predict_y.shape)

import random

for i in range(0, len(V)):
    k = 0
    for j in range(0, train_len):
        if train_y[i][j] == 0:
            k += 1
    if k == train_len:
        train_y[i][random.randint(0,train_len-1)]=1
       # print(train_y[i])

(200, 2048)
(5000, 200)
(5000, 1915)


In [13]:
for i in range(0, train_len):
    y = np.array(train_y[i])
    clf = svm.SVC()
    clf.fit(train_x, y)
    #预测test的bow值
    predict_y[i] = clf.predict(test_x) #基于SVM对验证集做出预测
    #print (dict_keys[i],predict_y[i])
    
print(predict_y.shape)

(5000, 1915)


#### MIR思路：
1.对于test_set中的一个视频video_i，其有唯一对应的句子Sentence_i，计算video_i与test_set中所有句子的相似度

2.排序，其本身对应的句子Sentence_i的正序排名的倒数就是该视频 i 的MIR值

3.求出所有视频的MIR，取平均值

备注：此处的相似度用Jaccard 距离衡量

In [14]:
# test_label 是 test_setA 中1915个video的bow(s)，每个vieo_i对应一个bow(s_i)
# label_y 是 train_setA 中1915个video的bow(s)
from sklearn.metrics import jaccard_similarity_score as jac

MIR = np.zeros(test_len, dtype=np.double)
tmp_mir = {}
res_mir = 0.0
new_sort = []
k = 0.0

yy = predict_y.T

for i in range(0, test_len):
    v1 = np.array(yy[i])
    for j in range(0,test_len):
        v2 = np.array(yy[j])
        tmp_mir[j] = jac(v1,v2)
        #计算video_i与test_set中所有句子的相似度
    new_sort = sorted(tmp_mir.items(), key=lambda d: d[1],reverse=True)
    #对相似度排序
    for x in range(0, len(new_sort)):
        if i == new_sort[x][0]:
            x = x+1
            k += 1/x
            #取本身对应的句子的排名的倒数，得到video_i 的MIR
            break

In [15]:
res_mir = k/1915
print(res_mir)

0.004248015478014735
