In [1]:
const stemmer = require('stemmer')
const invertedIndex = require('./index/invertedIndex.json')
const wordListCollection = require('./index/wordList.json')
const invertedIndex_all = require('./index/invertedIndex_all.json')
const wordListCollection_all = require('./index/wordList_all.json')
const docList = require('./index/docList.json')
const parser = require('subtitles-parser')

In [2]:
function parseQuery (queryString) {
	let querylist = queryString.split(' ')
	querylist = querylist.map(query => stemmer(query))
	return querylist
}

In [3]:
function getDocList (queryA, ii) {
	if (ii[queryA]){
		return Object.keys(ii[queryA]['d'])
	}
	return []
}

In [4]:
function zipper (listA, listB) {
	if (listA.length === 0){
		return listB
	}
	if (listB.length === 0){
		return listA
	}
	let i = 0
	let j = 0
	let resultDoc = []
	while (i < listA.length && j < listB.length){
		if (listA[i] == listB[j]){
			resultDoc.push(listA[i])
			i++
			j++
		} else if (listA[i] > listB[j]){
			j++
		} else {
			i++
		}
	} 
	return resultDoc
}

In [102]:
function positionalIntersect (p1, p2, k)  {
	if (p1.length === 0){
		return p2
	}
	if (p2.length === 0){
		return p1
	}
        
	let i = 0
	let j = 0
	let resultDoc = []
	let oldB = 0
	while (i < p1.length){
		let p1ID = parseInt(p1[i])
		let p2ID = parseInt(p2[j])
		if (p1ID < oldB + k){
			i++
		} else if (p2ID < p1ID) {
			j++
		} else if (p2ID <= p1ID + k){
			resultDoc.push(p1ID)
			i++
			oldB = p2ID
		} else {
			i++
		}
	} 
	return resultDoc
}

In [5]:
function query (queryString)  {
	let querylist = parseQuery(queryString)
	let returnList = {}
	let returnDoc = []
	if (querylist.length === 1) {
		returnDoc = getDocList(querylist[0])
	}
	else {
		let docLists = querylist.map(query => getDocList(query))
		returnDoc = []
		docLists.forEach(docList => returnDoc = zipper(returnDoc, docList))
	}
	returnDoc.forEach(docID =>{
		let docInfo = docList[docID]
		if (!returnList[docInfo.c]){
			returnList[docInfo.c] = {}
		}
		returnList[docInfo.c][docInfo.n.replace('.srt','')] = {id:docID}
	})
	Object.keys(returnList).forEach(category=>{
		Object.keys(returnList[category]).forEach(docName=>{
			let srt = fs.readFileSync(`./public/srt/${category}/${docName}.srt`, 'utf8')
			let data = parser.fromSrt(srt, true)
			let description = ''
			for(let i=0; i < 5; i++){
				description = description + ' ' + clearHtml(data[i].text)
			}
			returnList[category][docName]['d'] = description 
		})
	})
	// console.log(returnList)
	return returnList 
}

In [6]:
function clearHtml (text) {
	return text.replace(/<(?:.|\n)*?>/gm, '')
}

In [87]:
queryString = 'persistance data structure'
querylist = parseQuery(queryString)
returnList = {}
returnDoc = []
if (querylist.length === 1) {
    returnDoc = getDocList(querylist[0], invertedIndex)
}
else {
    let docLists = querylist.map(query => getDocList(query, invertedIndex))
    returnDoc = []
    docLists.forEach(docList => returnDoc = zipper(returnDoc, docList))
}
returnDoc.forEach(docID =>{
    let docInfo = docList[docID]
    console.log(docInfo,docID)
    if (!returnList[docInfo.c]){
        returnList[docInfo.c] = {}
    }
    returnList[docInfo.c][docInfo.n.replace('.srt','')] = {id:docID}
})
Object.keys(returnList).forEach(category=>{
    Object.keys(returnList[category]).forEach(docName=>{
        let srt = fs.readFileSync(`./public/vtt/${category}/${docName}.srt`, 'utf8')
        let data = parser.fromSrt(srt, true)
        let description = ''
        for(let i=0; i < 5; i++){
            description = description + ' ' + clearHtml(data[i].text)
        }
        returnList[category][docName]['d'] = description 
    })
})
console.log(returnList)

{ n: 'MIT6_851S12_lec01_300k.srt',
  d: 1560592391320,
  c: 'Data-Structure',
  h: 'B8OLyWk' } '3'
{ n: 'NMxLL3D5qd8.srt.txt',
  d: 1560592391336,
  c: 'Data-Structure',
  h: '4WEjM77' } '4'
{ n: 'WqCWghETNDc.srt.txt',
  d: 1560592391350,
  c: 'Data-Structure',
  h: 'rYgZJl2' } '5'


Error: ENOENT: no such file or directory, open './public/vtt/Data-Structure/MIT6_851S12_lec01_300k.srt'

In [88]:
returnList = {}
returnDoc.forEach(docid => {
    returnList[docid] = {s:0, ...docList[docid]}
    querylist.forEach(stemmed => {
        if (invertedIndex[stemmed] && invertedIndex[stemmed]['d'][docid])
        returnList[docid]['s'] += invertedIndex[stemmed]['d'][docid].tfidf
    })
})

In [89]:
returnList

{ '3':
   { s: 38.03571195602715,
     n: 'MIT6_851S12_lec01_300k.srt',
     d: 1560592391320,
     c: 'Data-Structure',
     h: 'B8OLyWk' },
  '4':
   { s: 14.263391983510179,
     n: 'NMxLL3D5qd8.srt.txt',
     d: 1560592391336,
     c: 'Data-Structure',
     h: '4WEjM77' },
  '5':
   { s: 18.48958220084653,
     n: 'WqCWghETNDc.srt.txt',
     d: 1560592391350,
     c: 'Data-Structure',
     h: 'rYgZJl2' } }

In [90]:
docList

{ '0':
   { n: 'leXa7EKUPFk.srt.txt',
     d: 1560592391292,
     c: 'AI',
     h: '0LNA4oL' },
  '1':
   { n: 'MIT6_034F10_lec01_300k.srt',
     d: 1560592391310,
     c: 'AI',
     h: 'Yy22kjW' },
  '2':
   { n: 'PNKj529yY5c.srt.txt',
     d: 1560592391316,
     c: 'AI',
     h: 'vlVwMYV' },
  '3':
   { n: 'MIT6_851S12_lec01_300k.srt',
     d: 1560592391320,
     c: 'Data-Structure',
     h: 'B8OLyWk' },
  '4':
   { n: 'NMxLL3D5qd8.srt.txt',
     d: 1560592391336,
     c: 'Data-Structure',
     h: '4WEjM77' },
  '5':
   { n: 'WqCWghETNDc.srt.txt',
     d: 1560592391350,
     c: 'Data-Structure',
     h: 'rYgZJl2' } }

In [100]:
listA = invertedIndex[stemmer('persistance')]['d'][3].s
listB = (invertedIndex[stemmer('data')]['d'][3].s)
listC = (invertedIndex[stemmer('structure')]['d'][3].s)
console.log()




In [106]:
listAB = positionalIntersect(listA, listB, 5)
listBC = positionalIntersect(listB, listC, 5)
console.log()




In [107]:
positionalIntersect(listAB, listBC, 5)

[ 223, 310, 339, 392, 409, 424, 1412, 1673, 1685 ]

In [121]:
function getDescription(docid){
    let h = docList[docid]['h']
    let srt = fs.readFileSync(`./public/vtt/${h}.vtt`, 'utf8')
    let data = parser.fromSrt(srt, true)
    console.log(data)
    let description = ''
    for(let i=0; i < 5; i++){
        description = description + ' ' + clearHtml(data[i].text)
    }
    console.log(description)
}
getDescription('3')


[]


TypeError: Cannot read property 'text' of undefined

In [93]:
let docLists = querylist.map(query => getDocList(query, invertedIndex))

SyntaxError: Identifier 'docLists' has already been declared

In [94]:
returnDoc

[ '3', '4', '5' ]

In [95]:
docList

{ '0':
   { n: 'leXa7EKUPFk.srt.txt',
     d: 1560592391292,
     c: 'AI',
     h: '0LNA4oL' },
  '1':
   { n: 'MIT6_034F10_lec01_300k.srt',
     d: 1560592391310,
     c: 'AI',
     h: 'Yy22kjW' },
  '2':
   { n: 'PNKj529yY5c.srt.txt',
     d: 1560592391316,
     c: 'AI',
     h: 'vlVwMYV' },
  '3':
   { n: 'MIT6_851S12_lec01_300k.srt',
     d: 1560592391320,
     c: 'Data-Structure',
     h: 'B8OLyWk' },
  '4':
   { n: 'NMxLL3D5qd8.srt.txt',
     d: 1560592391336,
     c: 'Data-Structure',
     h: '4WEjM77' },
  '5':
   { n: 'WqCWghETNDc.srt.txt',
     d: 1560592391350,
     c: 'Data-Structure',
     h: 'rYgZJl2' } }

In [122]:
srt

ReferenceError: srt is not defined

In [134]:
fs.readFileSync(`./public/vtt/${docList[3]['h']}.vtt`, 'utf8').split("\r\n")

[ 'WEBVTT FILE',
  '',
  '1',
  '00:00:00.090 --> 00:00:02.490',
  'The following content is',
  'provided under a Creative',
  '',
  '2',
  '00:00:02.490 --> 00:00:04.030',
  'Commons license.',
  '',
  '3',
  '00:00:04.030 --> 00:00:06.360',
  'Your support will help',
  'MIT OpenCourseWare',
  '',
  '4',
  '00:00:06.360 --> 00:00:10.720',
  'continue to offer high quality',
  'educational resources for free.',
  '',
  '5',
  '00:00:10.720 --> 00:00:13.320',
  'To make a donation or',
  'view additional materials',
  '',
  '6',
  '00:00:13.320 --> 00:00:17.280',
  'from hundreds of MIT courses,',
  'visit MIT OpenCourseWare',
  '',
  '7',
  '00:00:17.280 --> 00:00:18.450',
  'at ocw.mit.edu.',
  '',
  '8',
  '00:00:21.056 --> 00:00:25.280',
  'ERIK DEMAINE: Welcome to 6.851',
  'Advanced Data Structures.',
  '',
  '9',
  '00:00:25.280 --> 00:00:26.420',
  'I am Erik Demaine.',
  '',
  '10',
  '00:00:26.420 --> 00:00:28.170',
  'You can call me Erik.',
  '',
  '11',
  '00:00:28.170 --

In [131]:
docList[3]['h']

'B8OLyWk'