In [3]:
import pandas as pd 
import gensim
from gensim import corpora,models

In [4]:
df = pd.read_csv('fashion.csv')
df

Unnamed: 0,year,season,brand,author of review,location,time,review text
0,2016,Spring,A Dtacher,Kristin Anderson,NEW YORK,"September 13, 2015",Detachment was the word of the day at A Dtache...
1,2016,Spring,A.F. Vandevorst,Luke Leitch,PARIS,"October 1, 2015",You heard this collection coming long before y...
2,2016,Spring,A.L.C.,Kristin Anderson,NEW YORK,"September 21, 2015",August saw the announcement of big news for A....
3,2016,Spring,A.P.C.,Nicole Phelps,PARIS,"October 3, 2015","They call me the king of basics, Jean Touitou ..."
4,2016,Spring,A.W.A.K.E.,Maya Singer,NEW YORK,"October 21, 2015",Natalia Alaverdian is a designer with a lot of...
...,...,...,...,...,...,...,...
429,2016,Spring,Zo Jordan,Maya Singer,LONDON,"September 19, 2015","Water, water, everywhere, / nor any drop to dr..."
430,2016,Spring,Zuhair Murad,Amy Verner,PARIS,"October 4, 2015","From a new Paris showroom, Zuhair Murad came a..."
431,2016,Spring,1205,Luke Leitch,LONDON,"September 19, 2015",Fashion and Instagram are such (often sacchari...
432,2016,Spring,3.1 Phillip Lim,Maya Singer,NEW YORK,"September 14, 2015",Let other New York City fashion designers toas...


In [5]:
docs = df['review text'].tolist()
docs[0]

'Detachment was the word of the day at A Dtacher (yes, like the labels name, bien sr). Designer Mona Kowalska loves the high concept, and one imagines that today detachment included being unconcerned with the gaze of others. Kowalskas woman, both as she appears on the runway and the real world, dresses for herself. Her intensely arty bend, and taste for clothes that match it, make A Dtacher a cultishly beloved brand among certain shoppers. This season, Kowalska presented them with a lineup of relatively playful offerings.\rThe collection opened with a pair of midi dresses in an Indonesian-inspired floral print, which reemerged later imagined with allover Pop white polka dots. Elsewhere came cardigans in an uncanny kind of amoxicillin pink that you imagined the A Dtacher woman wearing with tongue firmly in cheek (they had Kawakubo-esque allover holes, to boot). The popcorn knits were pretty fun, too.\rThe choice to use hardier materials lent dresses eccentric volumes, but also led to a 

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
    
# Remove stopwords.
docs = [[token for token in doc if token not in stopwords.words('english')] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [7]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [8]:
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=10)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [9]:
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

In [10]:
docs[0]

['detachment',
 'word',
 'day',
 'dtacher',
 'yes',
 'like',
 'label',
 'name',
 'bien',
 'sr',
 'designer',
 'mona',
 'kowalska',
 'love',
 'high',
 'concept',
 'one',
 'imago',
 'today',
 'detachment',
 'included',
 'unconcerned',
 'gaze',
 'others',
 'kowalskas',
 'woman',
 'appears',
 'runway',
 'real',
 'world',
 'dress',
 'intensely',
 'arty',
 'bend',
 'taste',
 'clothes',
 'match',
 'make',
 'dtacher',
 'cultishly',
 'beloved',
 'brand',
 'among',
 'certain',
 'shopper',
 'season',
 'kowalska',
 'presented',
 'lineup',
 'relatively',
 'playful',
 'offering',
 'collection',
 'opened',
 'pair',
 'midi',
 'dress',
 'indonesian',
 'inspired',
 'floral',
 'print',
 'reemerged',
 'later',
 'imagined',
 'allover',
 'pop',
 'white',
 'polka',
 'dot',
 'elsewhere',
 'came',
 'cardigan',
 'uncanny',
 'kind',
 'amoxicillin',
 'pink',
 'imagined',
 'dtacher',
 'woman',
 'wearing',
 'tongue',
 'firmly',
 'cheek',
 'kawakubo',
 'esque',
 'allover',
 'hole',
 'boot',
 'popcorn',
 'knit',
 'pr

In [11]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 13812
Number of documents: 434


In [12]:
sort_token = sorted(dictionary.items(),key=lambda k:k[0], reverse = False)
unique_token = [token.encode('utf8') for (ID,token) in sort_token]

In [13]:
import numpy as np
matrix = gensim.matutils.corpus2dense(corpus,num_terms=len(dictionary),dtype = 'int')
matrix = matrix.T #transpose the matrix 

#convert the numpy matrix into pandas data frame
matrix_df = pd.DataFrame(matrix, columns=unique_token)

In [14]:
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 100
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

In [15]:
lda = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [16]:
lda = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

lda.print_topics(1) #V matrix, topic matrix
import re
for i,topic in lda.print_topics(10):
    print(f'Top 10 words for topic:')
    print(",".join(re.findall('".*?"',topic)))
    print('\n')

Top 10 words for topic:
"dress","collection","look","designer","show","one","like","woman","new","way"


Top 10 words for topic:
"dress","collection","like","new","one","designer","show","look","season","piece"


Top 10 words for topic:
"one","dress","look","skirt","collection","clothes","price","show","said","fashion"


Top 10 words for topic:
"collection","new","look","designer","one","pajama","spring","like","girl","show"


Top 10 words for topic:
"collection","print","new","one","designer","fashion","skirt","show","jacket","dress"


Top 10 words for topic:
"collection","dress","one","designer","new","like","season","look","piece","show"


Top 10 words for topic:
"dress","collection","show","new","like","woman","one","designer","also","look"


Top 10 words for topic:
"dress","collection","like","one","designer","season","look","new","spring","skirt"


Top 10 words for topic:
"dress","fashion","new","look","collection","show","like","one","piece","also"


Top 10 words for topic:
"dre

In [17]:
top_topics = lda.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -1.6925.
[([(0.007368192, 'collection'),
   (0.0072387243, 'dress'),
   (0.0057083596, 'one'),
   (0.0050948323, 'designer'),
   (0.004014598, 'new'),
   (0.003943154, 'like'),
   (0.0037891471, 'season'),
   (0.0037601248, 'look'),
   (0.0034064255, 'piece'),
   (0.0034017665, 'show'),
   (0.0033320962, 'print'),
   (0.0031883754, 'spring'),
   (0.0029935937, 'clothes'),
   (0.0029183226, 'fashion'),
   (0.0027469853, 'also'),
   (0.002728212, 'skirt'),
   (0.0026745878, 'came'),
   (0.0026285083, 'way'),
   (0.0025617925, 'made'),
   (0.00255415, 'jacket')],
  -0.8672698489330274),
 ([(0.008715183, 'dress'),
   (0.006274551, 'collection'),
   (0.0057854783, 'show'),
   (0.0051714787, 'look'),
   (0.00513138, 'one'),
   (0.004209192, 'designer'),
   (0.0041447515, 'like'),
   (0.0035183674, 'black'),
   (0.0034818603, 'spring'),
   (0.0031616231, 'came'),
   (0.0029542125, 'season'),
   (0.0029510811, 'clothes'),
   (0.0029410562, 'piece'),
   (0.0028534424, '

In [18]:
corpus_lda = lda[corpus] #transform lda model

#convert corpus_lda to numpy matrix
U_matrix_lda = gensim.matutils.corpus2dense(corpus_lda,num_terms=10).T

#write U_matrix into pandas dataframe and output
U_matrix_lda_df = pd.DataFrame(U_matrix_lda)
U_matrix_lda_df.to_csv('U_matrix_lda.csv')

In [19]:
print(matrix_df.shape)
print(U_matrix_lda_df.shape)

(434, 13812)
(434, 10)


## LSI MODEL

In [20]:
tfidf = models.TfidfModel(corpus) #fit tfidf model
corpus_tfidf = tfidf[corpus] 

In [21]:
from gensim.models import LsiModel


lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)

import re
for i,topic in lsi.print_topics(1):
    print(f'Top 10 words for topic: ')
    print(",".join(re.findall('".*?"',topic)))
    print('\n')

Top 10 words for topic: 
"show","new","woman","season","print","silk","white","brand","black","jacket"




In [22]:
corpus_lsi = lsi[corpus_tfidf] #transform lda model

#convert corpus_lsi to numpy matrix
U_matrix_lsi = gensim.matutils.corpus2dense(corpus_lsi,num_terms=10).T

#write U_matrix into pandas dataframe and output
pd.DataFrame(U_matrix_lsi).to_csv('U_matrix_lsi.csv')