In [26]:
# create spam filter using following setup:
# 'sms-spam.csv' contains spam content labeled by 0 (no spam) and 1 (spam)
# do the following steps:
# 1. read the csv file
# 2. make tf-idf vectors
# 3. make lsa using pca
# 4. use lda classifier for spam classification - make cross validation

# Spam filter: TF-IDF vectors -> PCA (LSA) topic vectors -> LDA classifier,
# scored with 10-fold cross-validation.
import pandas as pd
from nltk.tokenize.casual import casual_tokenize
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

pd.options.display.width = 120
RANDOM_STATE = 42  # fixed seed so the PCA topic vectors (and the scores built on them) are repeatable

# step 1: read the csv file
sms = pd.read_csv('sms-spam.csv')
# Row labels 'sms<i>'; the spam flag is used as a repeat count for '!',
# so a row with spam == 1 gets a trailing '!' (e.g. 'sms2!').
index = ['sms{}{}'.format(i, '!' * j) for (i, j) in zip(range(len(sms)), sms.spam)]
print("step 1 = ", sms.head(5))

# step 2: make tf-idf vectors
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
# .toarray() densifies the matrix because PCA below is given a dense array.
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
print("step 2 = ", tfidf_docs)

# step 3: do lsa using pca
# With the default svd_solver='auto', scikit-learn may select its randomized SVD
# for a matrix of this size, so random_state is pinned to keep runs reproducible.
pca = PCA(n_components=16, random_state=RANDOM_STATE)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
columns = ['topic{}'.format(i) for i in range(pca.n_components)]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, index=index, columns=columns)
print("step 3 = ", pca_topic_vectors.round(3).head(6))

# step 4: do lda with validation
# NOTE(review): the TF-IDF vocabulary and the PCA projection above are fit on all
# messages before cross-validation, so every held-out fold has already influenced
# the features. Wrap the steps in a Pipeline if an unbiased estimate is needed.
lda = LDA(n_components=1)
scores = cross_val_score(lda, pca_topic_vectors, sms.spam, cv=10)
# Reported as mean accuracy with a +/- band of two standard deviations over the 10 folds.
print("step 4 = ", "Accuracy: {:.3f} (+/-{:.3f})".format(scores.mean(), scores.std()*2))

step 1 =     Unnamed: 0  spam                                               text
0           0     0  Go until jurong point, crazy.. Available only ...
1           1     0                      Ok lar... Joking wif u oni...
2           2     1  Free entry in 2 a wkly comp to win FA Cup fina...
3           3     0  U dun say so early hor... U c already then say...
4           4     0  Nah I don't think he goes to usf, he lives aro...
step 2 =  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
step 3 =         topic0  topic1  topic2  topic3  topic4  topic5  topic6  topic7  topic8  topic9  topic10  topic11  topic12  \
sms0    0.201   0.003   0.037   0.011  -0.019  -0.053   0.039  -0.066   0.013  -0.083    0.010   -0.002    0.001   
sms1    0.404  -0.094  -0.078   0.051   0.100   0.047   0.023   0.066   0.021  -0.023   -0.005    0.035    0.040   
sms2!  -0.030  -0.048   0.090  -0.067   

In [52]:
# LDiA
# repeat the spam filter above, but build the topic vectors with Latent Dirichlet Allocation (LDiA) instead of PCA; the classifier is still Linear Discriminant Analysis (LDA)

# LDiA spam filter: TF-IDF vectors -> LDiA topic vectors -> LDA classifier,
# scored with 10-fold cross-validation.
# Requires `sms` (the DataFrame read from 'sms-spam.csv') from the spam-filter cell.
import pandas as pd
from nltk.tokenize.casual import casual_tokenize
from sklearn.decomposition import LatentDirichletAllocation as LDiA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

RANDOM_STATE = 42  # LDiA fitting is randomized; a fixed seed keeps topics and scores repeatable

# Rebuild the row labels from `sms` rather than reusing the global `index`:
# the semantic-search cell rebinds `index` to the two corpus names, which would
# make the DataFrame construction below fail on a length mismatch.
index = ['sms{}{}'.format(i, '!' * j) for (i, j) in zip(range(len(sms)), sms.spam)]

# make tf-idf vectors
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
print("step 2 = ", tfidf_docs)

# make topic vectors using LDiA
# NOTE(review): LDiA is normally described as a model of word counts; it is fed
# TF-IDF weights here to mirror the PCA cell. Consider comparing with raw counts.
ldia = LDiA(n_components=16, random_state=RANDOM_STATE)
ldia = ldia.fit(tfidf_docs)
ldia_topic_vectors = ldia.transform(tfidf_docs)
columns = ['topic{}'.format(i) for i in range(ldia.n_components)]
ldia_topic_vectors = pd.DataFrame(ldia_topic_vectors, index=index, columns=columns)
print("step 3 = ", ldia_topic_vectors.round(3).head(6))

# do lda with validation
# NOTE(review): TF-IDF and LDiA are fit on all messages before cross-validation,
# so the held-out folds have already influenced the features.
lda = LDA(n_components=1)
scores = cross_val_score(lda, ldia_topic_vectors, sms.spam, cv=10)
# Reported as mean accuracy with a +/- band of two standard deviations over the 10 folds.
print("step 4 = ", "Accuracy: {:.3f} (+/-{:.3f})".format(scores.mean(), scores.std()*2))


step 2 =  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
step 3 =         topic0  topic1  topic2  topic3  topic4  topic5  topic6  topic7  topic8  topic9  topic10  topic11  topic12  \
sms0    0.354   0.011   0.011   0.011   0.286   0.011   0.011   0.011   0.011   0.011    0.011    0.011    0.011   
sms1    0.347   0.018   0.018   0.018   0.018   0.018   0.018   0.406   0.018   0.018    0.018    0.018    0.018   
sms2!   0.010   0.010   0.503   0.010   0.010   0.010   0.362   0.010   0.010   0.010    0.010    0.010    0.010   
sms3    0.572   0.016   0.016   0.016   0.016   0.016   0.016   0.016   0.016   0.123    0.016    0.016    0.016   
sms4    0.014   0.014   0.014   0.014   0.014   0.014   0.178   0.014   0.014   0.014    0.014    0.014    0.014   
sms5!   0.009   0.009   0.009   0.009   0.009   0.009   0.009   0.009   0.009   0.009    0.009    0.009    0.009   

       topi

In [55]:
# write toy model of semantic search using following setup:
# 1. read corpus consisting of Wikipedia info as 2 separate files
# YorkAustralia.txt and NewYork.txt
# 2. make tf-idf vectors for corpus
# 3. do 1 topic analysis - it will be New York'iness or York'iness
# 4. make question 'What is biggest city?'
# make tf-idf vector for question
# do pca transform
# 5. check which topic/document is closer to question

# Toy semantic search: project two documents and a question onto a single PCA
# topic axis and report which document lies closest to the question.
import numpy as np
import pandas as pd
from nltk.tokenize.casual import casual_tokenize
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.display.width = 120

# read corpus
with open('YorkAustralia.txt', 'r') as f:
    york = f.read()
with open('NewYork.txt', 'r') as f:
    newyork = f.read()
corpus = [york, newyork]
index = ['york', 'newyork']  # row labels, in the same order as `corpus`

# make tf-idf vectors
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=corpus).toarray()
print("step 2 = ", tfidf_docs)

# do lsa using pca
pca = PCA(n_components=1)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
columns = ['topic{}'.format(i) for i in range(pca.n_components)]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, index=index, columns=columns)
print("step 3 = ", pca_topic_vectors.round(3).head(6))

# make question
question = 'What is biggest city?'

# make tf-idf vector for question
# Uses the vectorizer already fitted on the corpus, so the question is encoded
# with the corpus vocabulary; tokens outside that vocabulary contribute nothing.
tfidf_question = tfidf.transform([question]).toarray()
print("step 4 = ", tfidf_question)
if not tfidf_question.any():
    # Every entry is zero: no question token is in the fitted vocabulary, so the
    # comparison below carries no information about the question.
    print("warning: question shares no tokens with the corpus vocabulary")

# do pca transform
pca_question = pca.transform(tfidf_question)
print("step 5 = ", pca_question)

# check which topic/document is closer to question
# Compute one distance per document (norm along axis=1). A single norm over the
# whole difference matrix would merge both distances into one number and could
# not tell which document is nearer.
distances = pd.Series(
    np.linalg.norm(pca_topic_vectors.to_numpy() - pca_question, axis=1),
    index=pca_topic_vectors.index,
)
print("step 6 = ", distances.round(3).to_dict(), "closest =", distances.idxmin())


step 2 =  [[0.03005694 0.03005694 0.45085413 0.2103986  0.04224398 0.04224398
  0.04224398 0.04224398 0.04224398 0.         0.04224398 0.
  0.         0.         0.         0.04224398 0.         0.
  0.04224398 0.03005694 0.08448797 0.04224398 0.         0.03005694
  0.15028471 0.         0.03005694 0.03005694 0.03005694 0.
  0.04224398 0.04224398 0.04224398 0.08448797 0.04224398 0.04224398
  0.         0.04224398 0.04224398 0.04224398 0.08448797 0.09017083
  0.         0.         0.04224398 0.         0.04224398 0.
  0.04224398 0.         0.         0.04224398 0.         0.
  0.         0.         0.         0.         0.         0.08448797
  0.04224398 0.         0.04224398 0.         0.04224398 0.04224398
  0.04224398 0.04224398 0.04224398 0.         0.04224398 0.
  0.08448797 0.06011388 0.         0.04224398 0.04224398 0.04224398
  0.04224398 0.         0.         0.04224398 0.04224398 0.
  0.04224398 0.03005694 0.24045554 0.         0.04224398 0.
  0.06011388 0.03005694 0.06011388