# Text 1: Vector space models
**Internet Analytics - Lab 4**

---

**Group:** *P*

**Names:**

* *Pierre Fouche*
* *Matthias Leroy*


---

#### Instructions

*This is a template for part 1 of the lab. Clearly write your answers, comments and interpretations in Markdown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of code in the notebook, you can also embed long Python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [None]:
import pickle
import numpy as np
from utils import save_json
from scipy.sparse import csr_matrix
from utils import load_json, load_pkl
import string
import math
import collections
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer

# Load the raw course records (each is later read through its 'name',
# 'courseId' and 'description' keys) and the stopword collection that the
# pre-processing step filters against.
courses = load_json('data/courses.txt')
stopwords = load_pkl('data/stopwords.pkl')

## Exercise 4.1: Pre-processing

In [None]:
def bitrigrams(l):
    """Return the items of ``l`` followed by its bigrams and trigrams.

    Each n-gram is the space-separated string form of consecutive items.
    The result is a new list: the original items first, then every bigram
    in order, then every trigram in order. ``l`` must be a list, because
    the n-gram lists are concatenated onto it.
    """
    bigrams = [' '.join(map(str, pair)) for pair in zip(l, l[1:])]
    trigrams = [' '.join(map(str, triple)) for triple in zip(l, l[1:], l[2:])]
    return l + bigrams + trigrams

# Quick sanity check of bitrigrams on a small list of integers.
l1=[1,2,3,4,5,6,7,8]
print(bitrigrams(l1))

In [None]:
def removeInfrequentWords(wordsList):
    """Drop every word that occurs exactly once in ``wordsList``.

    The list is filtered in place, so callers holding a reference to it see
    the change, and the same list object is returned. The relative order of
    the remaining words is preserved.

    Args:
        wordsList: list of hashable tokens.

    Returns:
        ``wordsList`` itself, now containing only tokens that occur at
        least twice.
    """
    count = collections.Counter(wordsList)
    # Rebuild the contents in one pass rather than calling list.remove()
    # once per rare word, which rescans the list every time.
    wordsList[:] = [word for word in wordsList if count[word] > 1]
    return wordsList

In [None]:
# Stemmer / lemmatizer instances; only the Porter stemmer is applied below.
ps = PorterStemmer()
wnl = WordNetLemmatizer()
ls = LancasterStemmer()
# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans('', '', string.punctuation)
newCourses =[]

for course in courses:
    # Normalise case and strip punctuation before tokenising.
    temp = course['description'].lower()
    temp = temp.translate(translator)
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, tabs and newlines never produce empty or multi-word tokens
    # (split(' ') would keep them and they would end up in the vocabulary).
    temp = temp.split()
    temp = [word for word in temp if word not in stopwords]
    temp = [ps.stem(word) for word in temp]
    # Drop stems that occur only once in this description, add bigrams and
    # trigrams, then drop the n-grams that occur only once as well.
    temp = removeInfrequentWords(temp)
    temp = bitrigrams(temp)
    temp = removeInfrequentWords(temp)
    newCourses.append({'name':course['name'],'listDescription':temp,'courseId':course['courseId'], 'description':course['description']})

# Persist the pre-processed courses.
save_json(newCourses, 'courses.txt')


In [None]:
# Flatten every course's token list into one list of term occurrences.
terms = [token for entry in newCourses for token in entry['listDescription']]
countTerms = collections.Counter(terms)
# Row index -> term, numbered in order of first appearance in the corpus.
termsDict = dict(enumerate(countTerms))
nb_terms = len(termsDict)
print(nb_terms)

# Column index -> course record, in the order of newCourses.
newCoursesDict = dict(enumerate(newCourses))
nb_courses = len(newCoursesDict)
print(nb_courses)

def countDocWithTerm(term,docs):
    """Return how many entries of ``docs`` list ``term`` in 'listDescription'.

    Each document is counted at most once, however many times the term
    occurs in it.
    """
    return sum(1 for entry in docs if term in entry['listDescription'])

In [None]:
# COO-style triplets for the sparse term-document matrix:
# values[k] is the tf-idf of term rows[k] in course columns[k].
values =[]
rows=[]
columns=[]

# Inverted index built in a single pass over the courses:
# term -> [(course index, occurrences in that course), ...].
# It gives both the term frequency and the document frequency without
# rescanning every description for every term.
postings = collections.defaultdict(list)
docLengths = {}
for index,doc in newCoursesDict.items():
    tokens = doc['listDescription']
    docLengths[index] = len(tokens)
    for term,occurrences in collections.Counter(tokens).items():
        postings[term].append((index, occurrences))

for i,term in termsDict.items():
    docsWithTerm = postings.get(term, [])
    if not docsWithTerm:
        continue
    # Inverse document frequency: log(number of courses / courses with term).
    idf = math.log(nb_courses/len(docsWithTerm))
    for index,occurrences in docsWithTerm:
        # Term frequency normalised by the length of the course's token list.
        tf = occurrences/docLengths[index]
        tf_idf = tf*idf

        values.append(tf_idf)
        rows.append(i)
        columns.append(index)

# One row per distinct term and one column per course. (len(terms) would
# count every term occurrence, duplicates included, and create a matrix
# with many all-zero rows.)
X = csr_matrix((values, (rows, columns)), shape=(nb_terms, nb_courses))

with open("matrix.pickle", "wb") as f:
    pickle.dump(X, f)
        

## Exercise 4.2: Term-document matrix

In [None]:
print(X.count_nonzero())
Xarray = X.toarray()

# Column of the course named 'Internet analytics' (0 when no course matches).
a = next(
    (key for key, value in newCoursesDict.items() if value['name'] == 'Internet analytics'),
    0,
)

# Row indices of the 15 largest scores in that column, highest first.
idxIX = np.argsort(Xarray[:,a])[::-1][:15]

# Row of the term 'system' (0 when the term is not in the vocabulary).
b = next((key for key, value in termsDict.items() if value == 'system'), 0)
print(Xarray[b][a])

for i in idxIX:
    print(termsDict[i])

## Exercise 4.3: Document similarity search

In [None]:
'''
fb = 0
mc = 0
for key,value in termsDict.items():
    if value==ps.stem('facebook'):
        fb = key
    elif value==ps.stem('markov chains'):
        mc = key
'''

def similarity(a,b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Args:
        a, b: numpy arrays of the same length.

    Returns:
        dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero
        norm. Without that guard the division yields NaN, and NaN values
        end up first when scores are sorted with ``argsort()[::-1]``.
    """
    denom = np.linalg.norm(a)*np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a.T,b)/denom

In [None]:
import itertools

def query(q):
    """Print the courses matching ``q`` and the most similar pairs among them.

    A course matches when the stemmed query appears in its
    'listDescription'. The names of all matching courses are printed first.
    Then the cosine similarity of every pair of matching courses is computed
    from their columns in ``Xarray``, and the names of the (up to) five most
    similar pairs are printed, each pair followed by a separator line.

    Args:
        q: query string of one or more whitespace-separated words.
    """
    # Stem word by word: the indexed bigrams/trigrams are built by joining
    # individually stemmed tokens with a single space, so stemming the whole
    # query as one string would miss multi-word terms.
    stemmedQuery = ' '.join(ps.stem(word) for word in q.split())

    idxCourses = []
    for i,doc in newCoursesDict.items():
        if stemmedQuery in doc['listDescription']:
            print(doc['name'])
            idxCourses.append(i)
    print('------------------------------')

    # Score every unordered pair of matching courses.
    pairs = list(itertools.combinations(idxCourses, 2))
    scores = np.array([similarity(Xarray[:,first], Xarray[:,second]) for first,second in pairs])

    # Highest similarity first, top five pairs only.
    for j in np.argsort(scores)[::-1][:5]:
        first,second = pairs[j]
        print(newCoursesDict[first]['name'])
        print(newCoursesDict[second]['name'])
        print('------------------------------')

In [None]:
def query2(l):
    """Placeholder for a second query function; currently only prints 'ok'.

    NOTE(review): the body previously printed the undefined name ``ok`` and
    so raised NameError on every call. Printing the string literal is a
    guess at the intended behavior; confirm or replace with a real
    implementation. The argument ``l`` is not used yet.
    """
    print('ok')
    
# Check how the Porter stemmer reduces a truncated word
# (the closing parenthesis was missing, which was a SyntaxError).
print(ps.stem('marko'))

In [None]:
# Run the search for the two-word term 'markov chain'.
query('markov chain')

In [None]:
# Run the search for the single term 'facebook'.
query('facebook')

In [None]:
# Run the search for the single term 'computer'.
query('computer')