#Imports

In [1]:
import numpy as np
import time
import scipy
from scipy.sparse import linalg as splinalg
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import tracemalloc
from scipy.sparse import csr_matrix
from google.colab import files
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import copy
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Document Word Matrix Setup

In [2]:
#all unique words and list of docs in corpus
def Unique():
    #enwiki8 doc
    #Getting count of all words
    uniquecount = {}
    documents = []
    with open('/content/drive/My Drive/enwiki8.txt', 'r',encoding='utf-8') as file:
        for line in file:
          #wordfreq = {}
          for word in line.strip().split():
            #if word not in stopwords:
            if word in uniquecount:
              uniquecount[word] +=1
            else:
              uniquecount[word] = 1
          #document = list(wordfreq.values())
          documents.append(line.strip().lower())
    return uniquecount, documents
#Top10k~ words + humanscores
def Top10k(terms,stopwords):
    #Getting top 10k words
    #print(len(top10k))
    #print(top10k)
    humanscores = []
    hwords = {}
    with open("/content/drive/My Drive/wordsim353_human_scores.txt",'r',encoding='utf-8') as file:
        for line in file:
            humanscores.append(line.strip().split())
            word1, word2, sc = line.strip().split()
            if(word1 in uniquecount and word1 not in hwords):
              hwords[word1] = uniquecount[word1]
              #top10k[word1] = uniquecount[word1]
            if(word2 in uniquecount and word2 not in hwords):
              hwords[word2] = uniquecount[word2]   
              #top10k[word2] = uniquecount[word2]           
      
    for x in terms.keys():
      if x in stopwords and x not in hwords:
        terms[x] = 0          
    top10k = dict(sorted(terms.items(), key=lambda x:x[1], reverse=True)[:10000])
    for word in hwords:
      top10k[word] = hwords[word]
    top10k = dict(sorted(top10k.items(), key=lambda x:x[1], reverse=True))
    #assigning row to each word in the top 10k words
    return top10k, {k:v for v,k in enumerate(list(top10k.keys()))}, humanscores


starttime = time.time()
cv = CountVectorizer(stop_words="english",max_features=10000)
stop = cv.get_stop_words()
print("Part 1 started... ")
uniquecount,docs = Unique()
print("Unique count of all terms obtained")
top10k, wordrows, humansc = Top10k(uniquecount,stop)
cv.stop_words=None
cv.vocabulary=top10k.keys()
print("Top 10,000 terms and row# for all words obtained")
tf = TfidfTransformer()
docwords = cv.fit_transform(docs)
#dwf = (tf.fit_transform(docwords)).astype(float).T
dwf = csr_matrix(docwords).astype(float).T
print("Sparse Documemt Word Frequency Matrix obtained, shape of matrix: ")
print(dwf.shape)
print(scipy.sparse.issparse(dwf))
print("Part 1 completed")

Part 1 started... 
Unique count of all terms obtained
Top 10,000 terms and row# for all words obtained
Sparse Documemt Word Frequency Matrix obtained, shape of matrix: 
(10076, 489860)
True
Part 1 completed


In [3]:
#Obtaining a list of indices, seperated so it did not slow down testing
indices = list(zip(*dwf.nonzero()))
print("List of nonzero indices obtained.")

List of nonzero indices obtained.


#PPMI Alteration

In [27]:
#PPMI
#To reset values from TFIDF to normal
dwf = csr_matrix(docwords).astype(float).T
f = np.empty(dwf.shape)
rowsums = np.sum(dwf,axis=1)
colsums = np.sum(dwf,axis=0)
print(colsums.shape)
print(rowsums.shape)
dwfsum = np.sum(dwf)
#dwfsum = np.sum(rowsums)
#sum of all columns in rows i
pi = colsums/dwfsum
#sum of all rows in columns j
pj = rowsums / dwfsum
for r,c in indices:
    pij = dwf[r,c]/dwfsum
    ppmi = np.log(pij/(pi[:,c]*pj[r]))
    dwf[r,c] = max(0,ppmi)

(1, 489860)
(10076, 1)


#SVD Factorization

In [8]:
#2 Factorize with SVD
print("Part 2 started... ")
for k in (20,50,100):
    #tracemalloc.start()
    starttime = time.time()
    svdu,svds,svdvh= splinalg.svds(dwf, k=k)

    print("When K = " + str(k) + ":" )
    print("U shape: ", svdu.shape)
    print("S shape: ",svds.shape)
    print("V shape: ",svdvh.shape)
    #print("s:")
    #print(s)
    runtime = time.time()-starttime
    print("Runtime: " + str(runtime))
    humandistances = []
    svddistances = []
    #computing Pearson correlation coefficient between cosine distances and human scores.
    for word1, word2, score in humansc:
      #print(word1,word2,score)
      w1row = wordrows[word1] 
      w2row = wordrows[word2]
      humanscore = float(score)
      sw1 = svdu[w1row]
      sw2 = svdu[w2row]
      sdistance = np.abs(sw1.dot(sw2.T)) / np.abs(np.linalg.norm(sw1)*np.linalg.norm(sw2))
      humandistances.append(humanscore)
      svddistances.append(sdistance)
    print("PCC: ", np.corrcoef(humandistances,svddistances)[0][1])

print("Part 2 completed")

Part 2 started... 
When K = 20:
U shape:  (10076, 20)
S shape:  (20,)
V shape:  (20, 489860)
Runtime: 3.622018814086914
PCC:  0.3735864469048904
When K = 50:
U shape:  (10076, 50)
S shape:  (50,)
V shape:  (50, 489860)
Runtime: 6.775550842285156
PCC:  0.4404108169693773
When K = 100:
U shape:  (10076, 100)
S shape:  (100,)
V shape:  (100, 489860)
Runtime: 14.677078247070312
PCC:  0.5123596681353468
Part 2 completed


#SGD Factorization

In [14]:
print("Part 3 started.")
krates = {20:5e-2,50:5e-2,100:1e-2}
regs =  {20:1e-1,50:1e-2,100:1e-3}
kconv = {20:(1.1,0.97), 50:(1.07,0.99), 100:(1.1,0.95)}
#convs = [(1.01,0.98), (1.0025,0.99)]
#convs = (1.02,0.98)
np.seterr('raise')
for k in (20,50,100):
    #for ratemult in convs:
    v = np.random.rand(k,dwf.shape[1])
    u = np.random.rand(dwf.shape[0], k)
    starttime = time.time()
    convs = (1.05,0.95)
    loss = [1e10]
    cont = True
    lambdaID = 1e-5* np.ones((k,k))
    rate = krates[k]
    t=0
    while cont:
        # #SVD 
        errsum = 0
        for num in range(5000):
          try:
            r,c =  indices[np.random.choice(len(indices))]
            r=num
            err = (u[r].dot(v[:,c]) -  dwf[r,c])
            #delaying the update to u[r] so v[:,c ] can be updated with the unchanged u[r]
            tmpur = u[r] - rate *(err*v[:,c]-  lambdaID.dot(u[r] ))
            v[:,c] -= rate * (err*u[r]  - lambdaID.dot(v[:,c] ) )
            u[r] = tmpur
            errsum += err**2
          except:
            # print(err,errsum,rate)
            cont = False
        t+=1
        loss.append(np.sqrt(errsum/(num+1)))
        rate *= convs[0] if loss[-1] < loss[-2] else   convs[1]
        # if t%5 == 0:
          # print(t,loss[-2:],rate, np.mean(loss))
        
        if np.abs(loss[-1]-loss[-2])  < 1e-2 :
          cont = False
          break
        if loss[0] == 1e10:
          del(loss[0])
    runtime = time.time()-starttime
    # if np.mean(loss[-6:]) > 100:
    #print(loss[-6:])
    print("When K = " + str(k) + ":" )
    print("U shape: ", u.shape)
    print("V shape: " , v.shape)
    print("Runtime: " ,runtime)
    print("Iterations: ", t)
    #print(rate)
    
    distances = []
    humandistances = []
    svddistances = []
    for word1, word2, score in humansc:
      #print(word1,word2,score)
      w1row = wordrows[word1] 
      w2row = wordrows[word2]
      humanscore = float(score)
      w1 = u[w1row]
      w2 = u[w2row]
      distance = np.abs(w1.dot(w2.T)) / np.abs(np.linalg.norm(w1)*np.linalg.norm(w2))
      humandistances.append(humanscore)
      distances.append(distance)
    #print(np.std(humandistances),np.std(distances))
    print("PCC: ", np.corrcoef(humandistances,distances)[0][1],"\n")
print("Part 3 and 4 completed")

Part 3 started.
When K = 20:
U shape:  (10076, 20)
V shape:  (20, 489860)
Runtime:  4.481069564819336
Iterations:  13
PCC:  0.1031895798974619 

When K = 50:
U shape:  (10076, 50)
V shape:  (50, 489860)
Runtime:  2.41158390045166
Iterations:  7
PCC:  0.09299918201948869 

When K = 100:
U shape:  (10076, 100)
V shape:  (100, 489860)
Runtime:  7.928662061691284
Iterations:  15
PCC:  0.10087177143856578 

Part 3 and 4 completed


#T-SNE Graph

In [None]:
print("Part 5 started... ")
#graphing
top300 = list(wordrows.items())[:300]
vecs = []
wordkeys = []
for word,row in top300:
  vecs.append(u[row])
  wordkeys.append(word)
vecs = np.asarray(vecs)
plt.figure(figsize=(20, 15), dpi=80)
tsne = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
Y = tsne.fit_transform(vecs)
x_coords = Y[:, 0]
y_coords = Y[:, 1]
plt.scatter(x_coords, y_coords)
for label, x, y in zip(wordkeys, x_coords, y_coords):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
plt.show()

print("Part 5 completed")