# Text 1: Vector space models
**Internet Analytics - Lab 4**

---

**Group:** *K*

**Names:**

* *Mathieu Sauser*
* *Luca Mouchel*
* *Jérémy Chaverot*
* *Heikel Jebali*

---

#### Instructions

*This is a template for part 1 of the lab. Clearly write your answers, comments and interpretations in Markodown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of codes in the notebook, you can also embed long python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [22]:
import pickle
import numpy as np
from scipy.sparse import csr_matrix, save_npz
from utils import load_json, load_pkl

import re
import pickle
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer

courses = load_json('data/courses.txt')
stopwords = load_pkl('data/stopwords.pkl')

[nltk_data] Downloading package punkt to /home/jebali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jebali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Exercise 4.1: Pre-processing

In [23]:
freqs = {}

for i, course in enumerate(courses):
    description = course['description']
    description = [char.lower() for char in description]
    description = ''.join(description)

    # Step 4: Remove punctuation marks
    description = re.sub(r'[^\w\s]', '', description)

    # Step 5: Tokenize the text into words
    tokens = nltk.word_tokenize(description)
    
    # Step 6: Remove stopwords
    tokens = [token for token in tokens if token not in stopwords]                        
        
    # Step 7: Stem or lemmatize words
    #stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    lemmatizedTokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    #stemmedTokens = [stemmer.stem(token) for token in tokens]
    
    #freqDist = FreqDist(stemmedTokens)
    freqDist = FreqDist(lemmatizedTokens)
    for token in lemmatizedTokens:
        if token not in freqs.keys():
            freqs[token] = freqDist[token]
        else:
            freqs[token] += freqDist[token]

    # Step 9: Add n-grams to the vocabulary
    nGramRange = (2, 3)  # Specify the range of n-grams to consider
    
    ngrams = []
    for i in range(nGramRange[0], nGramRange[1] + 1):
        ngrams.extend(list(nltk.ngrams(lemmatizedTokens, i)))
        
    ngrams = [' '.join(ngram) for ngram in ngrams]
    for ngram in ngrams:
        if ngram not in freqs.keys():
            freqs[ngram] = 1
        else:
            freqs[ngram] += 1
            
    vocabulary = lemmatizedTokens + ngrams
    
    course['description'] = vocabulary

In [24]:
mostFreq = float('-inf')
mostFreqToken = ''
for token, freq in freqs.items():
    if freq > mostFreq: 
        mostFreq = freq
        mostFreqToken = token

print(f'**{mostFreqToken}** is the most used (lemmatized) word, with {mostFreq} apparitions')

**student** is the most used (lemmatized) word, with 9887 apparitions


In [25]:
freqWords = [word for word in freqs.keys() if freqs[word] > mostFreq * 0.6]
infreqWords = [word for word in freqs.keys() if freqs[word] < 4]

print(f'Most frequent words: {freqWords}')

Most frequent words: ['student', 'method', 'system']


In [26]:
for course in courses:
    description = course['description']
    course['description'] = [word for word in description if word not in freqWords and word not in infreqWords]
    
    if course['courseId'] == 'COM-308':
        print(sorted(course['description']))

['20', '20 midterm', '30', '30 final', '30 final exam', '50', 'acquired', 'activity', 'activity lecture', 'ad', 'ad', 'algebra', 'algebra', 'algorithm', 'algorithm', 'algorithm data', 'algorithm data structure', 'analysis', 'analytics', 'analytics', 'application', 'application', 'assessment', 'assessment method', 'assessment method project', 'auction', 'auction', 'balance', 'based', 'based', 'basic', 'basic', 'basic', 'basic linear', 'basic linear algebra', 'basic material', 'basic model', 'cathedra', 'chain', 'class', 'class', 'class', 'cloud', 'clustering', 'clustering', 'collection', 'com300', 'combination', 'communication', 'communication com300', 'community', 'community', 'computing', 'computing', 'concept', 'concept', 'concept start', 'concrete', 'content', 'content class', 'course', 'course', 'course basic', 'course stochastic', 'course stochastic model', 'coverage', 'current', 'data', 'data', 'data', 'data', 'data', 'data', 'data mining', 'data mining', 'data mining', 'data str

## Exercise 4.2: Term-document matrix

In [27]:
termToIdx = {}
for i, term in enumerate(freqs.keys()):
    termToIdx[term] = i
    
docToIdx = {}
for i, course in enumerate(courses):
    docToIdx[course['courseId']] = i

In [28]:
M = len(termToIdx.keys())
N = len(docToIdx.keys())
TD = np.zeros((M, N))

for course in courses:
    for term in course['description']:
        TD[termToIdx[term], docToIdx[course['courseId']]] += 1
        
termsPerDoc = np.sum(TD, axis=0)
TF = TD / termsPerDoc
IDF = np.log2(N / np.count_nonzero(TD, axis=1))
TFIDF = np.transpose(np.transpose(TF) * IDF)

  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [29]:
np.save('./TFIDF.npy', TFIDF)
np.save('./termToIdx.npy', termToIdx)
np.save('./docToIdx.npy', docToIdx)

In [30]:
ixScores = np.argsort(-TFIDF[:, docToIdx['COM-308']])
top15idx = ixScores[:15]

print('------ Top 15 terms with highest TF-IDF scores in IX course ------\n')
print('------------------')
print('Term: Score')
print('------------------')
for idx in top15idx:
    for term, idx2 in termToIdx.items():
        if idx == idx2:
            print(f'{term}: {TFIDF[idx, docToIdx["COM-308"]]:.4f}')

------ Top 15 terms with highest TF-IDF scores in IX course ------

------------------
Term: Score
------------------
online: 0.0893
realworld: 0.0892
social: 0.0835
data mining: 0.0806
explore: 0.0775
mining: 0.0732
networking: 0.0697
hadoop: 0.0633
largescale: 0.0624
recommender system: 0.0591
ecommerce: 0.0591
recommender: 0.0591
service: 0.0567
auction: 0.0561
datasets: 0.0537


## Exercise 4.3: Document similarity search