In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from nltk.tokenize.casual import casual_tokenize

In [2]:
topic = {}
tfidf = dict(list(zip('cat dog apple lion NYC love'.split(), [1, 1, 1, 1, 1, 1])))
topic['pet'] = (.3 * tfidf['cat'] + .3 * tfidf['dog'] + 0 * tfidf['apple']
                + 0 * tfidf['lion'] - .2 * tfidf['NYC'] + .2 * tfidf['love'])
topic['animal'] = (.1 * tfidf['cat'] + .1 * tfidf['dog'] - .1 * tfidf['apple']
                   + .5 * tfidf['lion'] + .1 * tfidf['NYC'] - .1 * tfidf['love'])
topic['city'] = (0 * tfidf['cat'] - .1 * tfidf['dog'] + .2 * tfidf['apple']
                 - .1 * tfidf['lion'] + .5 * tfidf['NYC'] + .1 * tfidf['love'])


word_vector = {}
word_vector['cat'] = .3 * topic['pet'] + .1 * topic['animal'] + 0 * topic['city']
word_vector['dog'] = .3 * topic['pet'] + .1 * topic['animal'] - .1 * topic['city']
word_vector['apple'] = 0 * topic['pet'] - .1 * topic['animal'] + .2 * topic['city']
word_vector['lion'] = 0 * topic['pet'] + .5 * topic['animal'] - .1 * topic['city']
word_vector['NYC'] = -.2 * topic['pet'] + .1 * topic['animal'] + .5 * topic['city']
word_vector['love'] = .2 * topic['pet'] - .1 * topic['animal'] + .1 * topic['city']

### 4.1.5 LDA

In [3]:
sms = pd.read_csv('sms-spam.csv')
sms.head(6)

Unnamed: 0.1,Unnamed: 0,spam,text
0,0,0,"Go until jurong point, crazy.. Available only ..."
1,1,0,Ok lar... Joking wif u oni...
2,2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,0,U dun say so early hor... U c already then say...
4,4,0,"Nah I don't think he goes to usf, he lives aro..."
5,5,1,FreeMsg Hey there darling it's been 3 week's n...


In [4]:
index = ['sms{}{}'.format(i, '!'*j) for (i,j) in zip(range(len(sms)), sms.spam)]
sms = pd.DataFrame(sms.values, columns=sms.columns, index=index)
sms['spam'] = sms.spam.astype(int)

In [5]:
sms.head(6)

Unnamed: 0.1,Unnamed: 0,spam,text
sms0,0,0,"Go until jurong point, crazy.. Available only ..."
sms1,1,0,Ok lar... Joking wif u oni...
sms2!,2,1,Free entry in 2 a wkly comp to win FA Cup fina...
sms3,3,0,U dun say so early hor... U c already then say...
sms4,4,0,"Nah I don't think he goes to usf, he lives aro..."
sms5!,5,1,FreeMsg Hey there darling it's been 3 week's n...


In [14]:
sms = sms.drop(columns="Unnamed: 0", axis = 1)

In [15]:
sms.head()

Unnamed: 0,spam,text
sms0,0,"Go until jurong point, crazy.. Available only ..."
sms1,0,Ok lar... Joking wif u oni...
sms2!,1,Free entry in 2 a wkly comp to win FA Cup fina...
sms3,0,U dun say so early hor... U c already then say...
sms4,0,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
print(tfidf_docs.shape)
print(sms.spam.sum())

(4837, 9232)
638


In [18]:
tfidf_docs= pd.DataFrame(tfidf_docs)

In [19]:
mask = sms.spam.astype(bool).values  # <1>
spam_centroid = tfidf_docs[mask].mean(axis=0) # <2>
ham_centroid = tfidf_docs[~mask].mean(axis=0)

In [20]:
print(spam_centroid.round(2))
print(ham_centroid.round(2))

0       0.06
1       0.00
2       0.00
3       0.00
4       0.00
        ... 
9227    0.00
9228    0.00
9229    0.00
9230    0.00
9231    0.00
Length: 9232, dtype: float64
0       0.02
1       0.01
2       0.00
3       0.00
4       0.00
        ... 
9227    0.00
9228    0.00
9229    0.00
9230    0.00
9231    0.00
Length: 9232, dtype: float64


In [21]:
spamminess_score = tfidf_docs.dot(spam_centroid - ham_centroid)
print(spamminess_score.round(2))

0      -0.01
1      -0.02
2       0.04
3      -0.02
4      -0.01
        ... 
4832    0.05
4833   -0.01
4834   -0.01
4835   -0.00
4836    0.00
Length: 4837, dtype: float64


In [22]:
from sklearn.preprocessing import MinMaxScaler
sms['lda_score'] = MinMaxScaler().fit_transform(np.array(spamminess_score).reshape(-1, 1))
sms['lda_predict'] = (sms['lda_score'] > 0.5).astype(int)

In [23]:
sms['spam lda_predict lda_score'.split()].round(2).head(6)

Unnamed: 0,spam,lda_predict,lda_score
sms0,0,0,0.23
sms1,0,0,0.18
sms2!,1,1,0.72
sms3,0,0,0.18
sms4,0,0,0.29
sms5!,1,1,0.55


In [24]:
(1 - (sms['spam'] - sms['lda_predict']).abs().sum()/len(sms)).round(3)

0.977

### 4.2.1 

In [25]:
from ch04_catdog_lsa_3x6x16 import word_topic_vectors

In [26]:
word_topic_vectors.T.round(1)

Unnamed: 0,cat,dog,apple,lion,nyc,love
top0,-0.6,-0.4,0.5,-0.3,0.4,-0.1
top1,-0.1,-0.3,-0.4,-0.1,0.1,0.8
top2,-0.3,0.8,-0.1,-0.5,0.0,0.1


### 4.3

In [27]:
from ch04_catdog_lsa_sorted import lsa_models, prettify_tdm

In [28]:
bow_svd, tfidf_svd = lsa_models()

In [29]:
prettify_tdm(**bow_svd)

Unnamed: 0,cat,dog,apple,lion,nyc,love,text
0,,,1.0,,1.0,,NYC is the Big Apple.\n
1,,,1.0,,1.0,,NYC is known as the Big Apple.\n
2,,,,,1.0,1.0,I love NYC!\n
3,,,1.0,,1.0,,I wore a hat to the Big Apple party in NYC.\n
4,,,1.0,,1.0,,Come to NYC. See the Big Apple!\n
5,,,1.0,,,,Manhattan is called the Big Apple.\n
6,1.0,,,,,,New York is a big city for a small cat.\n
7,1.0,,,1.0,,,"The lion, a big cat, is the king of the jungle.\n"
8,1.0,,,,,1.0,I love my pet cat.\n
9,,,,,1.0,1.0,I love New York City (NYC).\n


In [30]:
tdm = bow_svd["tdm"]
tdm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
cat,0,0,0,0,0,0,1,1,1,0,1
dog,0,0,0,0,0,0,0,0,0,0,1
apple,1,1,0,1,1,1,0,0,0,0,0
lion,0,0,0,0,0,0,0,1,0,0,0
nyc,1,1,1,1,1,0,0,0,0,1,0
love,0,0,1,0,0,0,0,0,1,1,0


### 4.3.1

In [31]:
U, s, Vt = np.linalg.svd(tdm)
pd.DataFrame(U, index = tdm.index).round(2)

Unnamed: 0,0,1,2,3,4,5
cat,-0.04,0.83,-0.38,0.0,0.11,-0.38
dog,-0.0,0.21,-0.18,-0.71,-0.39,0.52
apple,-0.62,-0.21,-0.51,0.0,0.49,0.27
lion,-0.0,0.21,-0.18,0.71,-0.39,0.52
nyc,-0.75,-0.0,0.24,-0.0,-0.52,-0.32
love,-0.22,0.42,0.69,0.0,0.41,0.37


### 4.3.2

In [32]:
s.round(1)

array([3.1, 2.2, 1.8, 1. , 0.8, 0.5])

In [33]:
S = np.zeros((len(U), len(Vt)))

In [34]:
np.fill_diagonal(S, s)

In [35]:
pd.DataFrame(S).round(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,3.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


### 4.3.3

In [36]:
pd.DataFrame(Vt).round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.44,-0.44,-0.31,-0.44,-0.44,-0.2,-0.01,-0.01,-0.08,-0.31,-0.01
1,-0.09,-0.09,0.19,-0.09,-0.09,-0.09,0.37,0.47,0.56,0.19,0.47
2,-0.16,-0.16,0.52,-0.16,-0.16,-0.29,-0.22,-0.32,0.17,0.52,-0.32
3,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.71,0.0,-0.0,-0.71
4,-0.04,-0.04,-0.14,-0.04,-0.04,0.58,0.13,-0.33,0.62,-0.14,-0.33
5,-0.09,-0.09,0.1,-0.09,-0.09,0.51,-0.73,0.27,-0.01,0.1,0.27
6,-0.57,0.21,0.11,0.33,-0.31,0.34,0.34,-0.0,-0.34,0.23,0.0
7,-0.32,0.47,0.25,-0.63,0.41,0.07,0.07,0.0,-0.07,-0.18,0.0
8,-0.5,0.29,-0.2,0.41,0.16,-0.37,-0.37,-0.0,0.37,-0.17,0.0
9,-0.15,-0.15,-0.59,-0.15,0.42,0.04,0.04,0.0,-0.04,0.63,-0.0


### 4.3.5

In [37]:
err = []
for numdim in range(len(s), 0, -1):
    S[numdim -1, numdim -1] = 0
    reconstructed_tdm = U.dot(S).dot(Vt)
    err.append(np.sqrt(((reconstructed_tdm-tdm).values.flatten() **2 ).sum()/np.product(tdm.shape)))
np.array(err).round(2)

array([0.06, 0.12, 0.17, 0.28, 0.39, 0.55])

### 4.4.3

In [38]:
sms.text

sms0        Go until jurong point, crazy.. Available only in bu...
sms1                                 Ok lar... Joking wif u oni...
sms2!       Free entry in 2 a wkly comp to win FA Cup final tkt...
sms3             U dun say so early hor... U c already then say...
sms4        Nah I don't think he goes to usf, he lives around h...
                                     ...                          
sms4832!    This is the 2nd time we have tried 2 contact u. U h...
sms4833                       Will ü b going to esplanade fr home?
sms4834     Pity, * was in mood for that. So...any other sugges...
sms4835     The guy did some bitching but I acted like i'd be i...
sms4836                                 Rofl. Its true to its name
Name: text, Length: 4837, dtype: object

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
len(tfidf.vocabulary_)
tfidf_docs = pd.DataFrame(tfidf_docs)
tfidf_docs = tfidf_docs - tfidf_docs.mean()
tfidf_docs.shape

(4837, 9232)

In [40]:
pca = PCA(n_components=16)
pca = pca.fit(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, columns=['topic{}'.format(i) for i in range(16)])
pca_topic_vectors.head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
0,0.201169,0.002772,0.037246,0.010981,-0.019197,-0.052956,0.039176,-0.068414,0.014535,-0.079746,0.007342,-0.007829,-0.021292,-0.03331,-0.018343,-0.022555
1,0.404379,-0.093865,-0.077507,0.050903,0.100192,0.047127,0.023059,0.062786,0.021575,-0.026032,-0.003176,0.041759,0.020832,-0.035796,0.051283,0.041663
2,-0.030459,-0.04812,0.090191,-0.067069,0.090572,-0.043113,-0.000502,0.001959,-0.057626,0.047617,0.124433,0.026078,0.014737,-0.032747,-0.048404,-0.021217
3,0.329045,-0.032753,-0.034559,-0.015787,0.052416,0.055697,-0.165412,-0.080526,0.065859,-0.099905,0.022628,0.038046,0.027765,-0.066588,0.019317,0.030185
4,0.002158,0.030871,0.038315,0.03388,-0.074356,-0.092458,-0.043262,0.059647,-0.045984,0.027705,0.029799,-0.005667,0.035073,0.021629,-0.083432,0.02212


In [41]:
tfidf.vocabulary_

{'go': 3807,
 'until': 8487,
 'jurong': 4675,
 'point': 6296,
 ',': 13,
 'crazy': 2549,
 '..': 21,
 'available': 1531,
 'only': 5910,
 'in': 4396,
 'bugis': 1973,
 'n': 5594,
 'great': 3894,
 'world': 8977,
 'la': 4811,
 'e': 3056,
 'buffet': 1971,
 '...': 25,
 'cine': 2277,
 'there': 8071,
 'got': 3855,
 'amore': 1296,
 'wat': 8736,
 'ok': 5874,
 'lar': 4848,
 'joking': 4642,
 'wif': 8875,
 'u': 8395,
 'oni': 5906,
 'free': 3604,
 'entry': 3195,
 '2': 471,
 'a': 1054,
 'wkly': 8933,
 'comp': 2386,
 'to': 8192,
 'win': 8890,
 'fa': 3328,
 'cup': 2608,
 'final': 3450,
 'tkts': 8180,
 '21st': 497,
 'may': 5272,
 '2005': 487,
 '.': 15,
 'text': 8020,
 '87121': 948,
 'receive': 6688,
 'question': 6574,
 '(': 9,
 'std': 7651,
 'txt': 8379,
 'rate': 6628,
 ')': 10,
 't': 7889,
 '&': 7,
 "c's": 2020,
 'apply': 1383,
 '08452810075': 115,
 'over': 6003,
 '18': 438,
 "'": 8,
 's': 6959,
 'dun': 3041,
 'say': 7034,
 'so': 7438,
 'early': 3069,
 'hor': 4207,
 'c': 2019,
 'already': 1268,
 'then': 

In [42]:
column_nums, terms = zip(*sorted(zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys())))

In [43]:
weights = pd.DataFrame(pca.components_, columns = terms, index = ["topic{}".format(i) for i in range(16)])

In [44]:
pd.options.display.max_columns = 8
weights.head(4).round(3)

Unnamed: 0,!,"""",#,#150,...,…,┾,〨ud,鈥
topic0,-0.071,0.008,-0.001,-0.0,...,-0.002,0.001,0.001,0.001
topic1,0.064,0.008,0.0,-0.0,...,0.003,0.001,0.001,0.001
topic2,0.071,0.027,0.0,0.001,...,0.002,-0.001,-0.001,-0.001
topic3,-0.059,-0.032,-0.001,-0.0,...,0.001,0.001,0.001,0.001


In [45]:
pd.options.display.max_columns = 12
deals = weights['! ;) :) half off free crazy deal only $ 80 %'.split()].round(3)*100

In [46]:
deals

Unnamed: 0,!,;),:),half,off,free,crazy,deal,only,$,80,%
topic0,-7.1,0.1,-0.5,-0.0,-0.4,-2.0,-0.0,-0.1,-2.2,0.3,-0.0,-0.0
topic1,6.4,0.0,7.4,0.1,0.4,-2.3,-0.2,-0.1,-3.8,-0.1,-0.0,-0.2
topic2,7.1,0.2,-0.1,0.0,0.3,4.4,0.1,-0.1,0.7,0.0,0.0,0.1
topic3,-5.9,-0.3,-7.1,0.2,0.3,-0.2,0.0,0.1,-2.3,0.1,-0.1,-0.3
topic4,38.1,-0.1,-12.5,-0.1,-0.2,9.8,0.1,-0.2,3.0,0.3,0.1,-0.1
topic5,-26.5,0.1,-1.6,-0.3,-0.7,-1.4,-0.6,-0.2,-1.8,-0.9,0.0,0.0
topic6,-10.9,-0.5,19.8,-0.4,-0.9,-0.6,-0.2,-0.1,-1.4,-0.0,-0.0,-0.1
topic7,17.9,0.1,-16.4,0.8,0.8,-2.4,0.0,0.0,-1.6,-0.4,0.0,-0.1
topic8,33.8,0.1,5.0,-0.5,-0.5,0.2,-0.4,-0.4,3.2,-0.6,-0.0,-0.2
topic9,6.4,-0.3,16.6,1.5,-1.0,6.5,-0.6,-0.4,3.3,-0.4,-0.0,0.1


In [47]:
deals.T.sum()

topic0    -11.9
topic1      7.6
topic2     12.7
topic3    -15.5
topic4     38.2
topic5    -33.9
topic6      4.7
topic7     -1.3
topic8     39.7
topic9     31.7
topic10   -30.0
topic11    46.8
topic12    38.2
topic13    29.9
topic14    24.6
topic15   -10.5
dtype: float64

In [48]:
tfidf_docs

Unnamed: 0,0,1,2,3,4,5,...,9226,9227,9228,9229,9230,9231
0,-0.025643,-0.00584,-0.000228,-0.000053,-0.000156,-0.000943,...,-0.000148,-0.000099,-0.00066,-0.000055,-0.000055,-0.000055
1,-0.025643,-0.00584,-0.000228,-0.000053,-0.000156,-0.000943,...,-0.000148,-0.000099,-0.00066,-0.000055,-0.000055,-0.000055
2,-0.025643,-0.00584,-0.000228,-0.000053,-0.000156,-0.000943,...,-0.000148,-0.000099,-0.00066,-0.000055,-0.000055,-0.000055
3,-0.025643,-0.00584,-0.000228,-0.000053,-0.000156,-0.000943,...,-0.000148,-0.000099,-0.00066,-0.000055,-0.000055,-0.000055
4,-0.025643,-0.00584,-0.000228,-0.000053,-0.000156,-0.000943,...,-0.000148,-0.000099,-0.00066,-0.000055,-0.000055,-0.000055
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4832,0.063691,-0.00584,-0.000228,-0.000053,-0.000156,-0.000943,...,-0.000148,-0.000099,-0.00066,-0.000055,-0.000055,-0.000055
4833,-0.025643,-0.00584,-0.000228,-0.000053,-0.000156,-0.000943,...,-0.000148,-0.000099,-0.00066,-0.000055,-0.000055,-0.000055
4834,-0.025643,-0.00584,-0.000228,-0.000053,-0.000156,-0.000943,...,-0.000148,-0.000099,-0.00066,-0.000055,-0.000055,-0.000055
4835,-0.025643,-0.00584,-0.000228,-0.000053,-0.000156,-0.000943,...,-0.000148,-0.000099,-0.00066,-0.000055,-0.000055,-0.000055


In [49]:
n = 9232
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=16, n_iter=100)
# tfidf_docs.values
svd_topic_vectors = svd.fit_transform(tfidf_docs.values)
svd_topic_vectors = pd.DataFrame(svd_topic_vectors, index=['sms{}{}'.format(i, '!' * j) for (i, j) in zip(range(n), sms.spam)],columns=['topic{}'.format(i) for i in range(16)])
svd_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,...,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.201,0.003,0.037,0.011,-0.019,-0.053,...,0.007,-0.007,0.002,-0.036,-0.014,0.037
sms1,0.404,-0.094,-0.078,0.051,0.1,0.047,...,-0.004,0.036,0.043,-0.021,0.051,-0.042
sms2!,-0.03,-0.048,0.09,-0.067,0.091,-0.043,...,0.125,0.023,0.026,-0.02,-0.042,0.052
sms3,0.329,-0.033,-0.035,-0.016,0.052,0.056,...,0.022,0.023,0.073,-0.046,0.022,-0.07
sms4,0.002,0.031,0.038,0.034,-0.075,-0.093,...,0.028,-0.009,0.027,0.034,-0.083,-0.021
sms5!,-0.016,0.059,0.014,-0.006,0.122,-0.04,...,0.041,0.055,-0.037,0.075,-0.001,0.02


In [50]:
import numpy as np
svd_topic_vectors = (svd_topic_vectors.T/np.linalg.norm(svd_topic_vectors, axis = 1)).T
svd_topic_vectors.iloc[:10].dot(svd_topic_vectors.iloc[:10].T).round(1)

Unnamed: 0,sms0,sms1,sms2!,sms3,sms4,sms5!,sms6,sms7,sms8!,sms9!
sms0,1.0,0.6,-0.1,0.6,-0.0,-0.3,-0.3,-0.1,-0.3,-0.3
sms1,0.6,1.0,-0.2,0.8,-0.2,0.0,-0.2,-0.2,-0.1,-0.1
sms2!,-0.1,-0.2,1.0,-0.2,0.1,0.4,0.0,0.3,0.5,0.4
sms3,0.6,0.8,-0.2,1.0,-0.2,-0.3,-0.1,-0.3,-0.2,-0.1
sms4,-0.0,-0.2,0.1,-0.2,1.0,0.2,0.0,0.1,-0.4,-0.2
sms5!,-0.3,0.0,0.4,-0.3,0.2,1.0,-0.1,0.1,0.3,0.4
sms6,-0.3,-0.2,0.0,-0.1,0.0,-0.1,1.0,0.1,-0.2,-0.2
sms7,-0.1,-0.2,0.3,-0.3,0.1,0.1,0.1,1.0,0.1,0.4
sms8!,-0.3,-0.1,0.5,-0.2,-0.4,0.3,-0.2,0.1,1.0,0.3
sms9!,-0.3,-0.1,0.4,-0.1,-0.2,0.4,-0.2,0.4,0.3,1.0


### 4.5.1

In [51]:
total_corpus_len = 0
for document_text in sms.text:
    total_corpus_len = total_corpus_len + len(casual_tokenize(document_text))
mean_document_len = total_corpus_len/ len(sms)
round(mean_document_len, 2)

21.35

In [52]:
sum([len(casual_tokenize(t)) for t in sms.text]) * 1. / len(sms.text)

21.34794293983874

### 4.5.2

In [53]:
index = ['sms{}{}'.format(i, '!'*j) for (i,j) in zip(range(len(sms)), sms.spam)]

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import casual_tokenize
np.random.seed(42)
counter = CountVectorizer(tokenizer=casual_tokenize)
bow_docs = pd.DataFrame(counter.fit_transform(raw_documents=sms.text).toarray(), index=index)
column_nums, terms = zip(*sorted(zip(counter.vocabulary_.values(),counter.vocabulary_.keys())))
bow_docs.columns = terms

In [56]:
sms.loc['sms0'].text

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [57]:
bow_docs.loc['sms0'][bow_docs.loc['sms0'] > 0].head()

,            1
..           1
...          2
amore        1
available    1
Name: sms0, dtype: int64

In [58]:
from sklearn.decomposition import LatentDirichletAllocation as LDiA
ldia = LDiA(n_components=16, learning_method='batch')
ldia = ldia.fit(bow_docs)
ldia.components_.shape

(16, 9232)

In [59]:
columns = ['topic{}'.format(i) for i in range(pca.n_components)]
pd.set_option('display.width', 75)
components = pd.DataFrame(ldia.components_.T, index=terms,columns=columns)
components.round(2).head(3)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,...,topic10,topic11,topic12,topic13,topic14,topic15
!,184.03,15.0,72.22,394.95,45.48,36.14,...,37.42,44.18,64.4,297.29,41.16,11.7
"""",0.68,4.22,2.41,0.06,152.35,0.06,...,8.42,11.42,0.07,62.72,12.27,0.06
#,0.06,0.06,0.06,0.06,0.06,2.07,...,0.06,0.06,1.07,4.05,0.06,0.06


In [60]:
components.topic3.sort_values(ascending=False)[:10]

!       394.952246
.       218.049724
to      119.533134
u       118.857546
call    111.948541
£       107.358914
,        96.954384
*        90.314783
your     90.215961
is       75.750037
Name: topic3, dtype: float64

In [61]:
ldia16_topic_vectors = ldia.transform(bow_docs)
ldia16_topic_vectors = pd.DataFrame(ldia16_topic_vectors,index=index, columns=columns)
ldia16_topic_vectors.round(2).head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,...,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.0,0.62,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms1,0.01,0.01,0.01,0.01,0.01,0.01,...,0.01,0.12,0.01,0.01,0.01,0.01
sms2!,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms3,0.0,0.0,0.0,0.0,0.09,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms4,0.39,0.0,0.33,0.0,0.0,0.0,...,0.0,0.0,0.09,0.0,0.0,0.0


In [62]:
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
X_train, X_test, y_train, y_test =train_test_split(ldia16_topic_vectors, sms.spam, test_size=0.5,random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
sms['ldia16_spam'] = lda.predict(ldia16_topic_vectors)
round(float(lda.score(X_test, y_test)), 2)

0.94

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
tfidf_docs = tfidf_docs - tfidf_docs.mean(axis=0)
X_train, X_test, y_train, y_test = train_test_split(tfidf_docs,sms.spam.values, test_size=0.5, random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
round(float(lda.score(X_train, y_train)), 3)

1.0

In [64]:
round(float(lda.score(X_test, y_test)), 3)

0.748

In [65]:
ldia32 = LDiA(n_components=32, learning_method='batch')
ldia32 = ldia32.fit(bow_docs)
ldia32.components_.shape

(32, 9232)

In [66]:
ldia32_topic_vectors = ldia32.transform(bow_docs)
columns32 = ['topic{}'.format(i) for i in range(ldia32.n_components)]
ldia32_topic_vectors = pd.DataFrame(ldia32_topic_vectors, index=index,columns=columns32)
ldia32_topic_vectors.round(2).head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,...,topic26,topic27,topic28,topic29,topic30,topic31
sms0,0.0,0.0,0.0,0.06,0.14,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms1,0.0,0.0,0.0,0.0,0.53,0.0,...,0.0,0.0,0.0,0.14,0.0,0.0
sms2!,0.0,0.0,0.0,0.0,0.0,0.65,...,0.0,0.0,0.0,0.0,0.0,0.0
sms3,0.0,0.11,0.0,0.0,0.39,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0
sms4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.47,0.0,0.0,0.0,0.0


In [67]:
X_train, X_test, y_train, y_test =train_test_split(ldia32_topic_vectors, sms.spam, test_size=0.5,random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
sms['ldia32_spam'] = lda.predict(ldia32_topic_vectors)
X_train.shape

(2418, 32)

In [68]:
round(float(lda.score(X_train, y_train)), 3)

0.933

In [69]:
round(float(lda.score(X_test, y_test)), 3)

0.936

In [70]:
'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis',
'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
'yule'

'yule'

In [71]:
lda = LDA(n_components=1)
lda = lda.fit(tfidf_docs, sms.spam)
sms['lda_spaminess'] = lda.predict(tfidf_docs)
((sms.spam - sms.lda_spaminess) ** 2.).sum() ** .5

0.0

In [72]:
(sms.spam == sms.lda_spaminess).sum()

4837

In [73]:
len(sms)

4837

In [74]:
from sklearn.model_selection import cross_val_score
lda = LDA(n_components=1)
scores = cross_val_score(lda, tfidf_docs, sms.spam, cv=5)
"Accuracy: {:.2f} (+/-{:.2f})".format(scores.mean(), scores.std() * 2)

'Accuracy: 0.77 (+/-0.02)'

In [75]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(tfidf_docs,sms.spam, test_size=0.33, random_state=271828)
lda = LDA(n_components=1)
lda.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=1, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

In [76]:
lda.score(X_test, y_test).round(3)

0.764

In [77]:
lda = LDA(n_components=1)
scores = cross_val_score(lda, pca_topic_vectors, sms.spam, cv=10)
"Accuracy: {:.3f} (+/-{:.3f})".format(scores.mean(), scores.std() * 2)

'Accuracy: 0.957 (+/-0.022)'