# 1. Import Library

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from TextPreprocessing import text_preprocessing

# 2. Check Data

In [3]:
# Read data
pd.set_option('display.max_columns', None)

# data = pd.read_excel('./ASD FAQ KB v001.xlsx', sheet_name='FAQ')
data = pd.read_excel('./ASD FAQ KB v001.xlsx', sheet_name='FAQ', engine='openpyxl')
data

Unnamed: 0,sn,Question,Long_Answer,Short_Answer,Source,Remarks
0,1,What are Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...,,http://birchtreecenter.org/learn/autism,
1,2,How common is autism?,According to a 2020 report commissioned by the...,,http://birchtreecenter.org/learn/autism,
2,3,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...,,http://birchtreecenter.org/learn/autism,
3,4,Why doesn’t intervention center refer to its s...,Our students are children or youth who are cha...,,http://birchtreecenter.org/learn/autism,
4,5,What are the types of Autism Spectrum Disorders?,Autistic Disorder; Asperger Syndrome; Pervasiv...,,http://dhss.alaska.gov/dph/wcfh/Pages/autism/s...,
...,...,...,...,...,...,...
221,222,Do people with an autism spectrum disorder alw...,The level of intellectual functioning is extre...,,https://www.who.int/news-room/q-a-detail/quest...,
222,223,How early can an autism spectrum disorder be r...,Identifying an autism spectrum disorder is dif...,,https://www.who.int/news-room/q-a-detail/quest...,
223,224,What can parents do to help their child with a...,Parents have an essential role in providing su...,,https://www.who.int/news-room/q-a-detail/quest...,
224,225,What causes autism spectrum disorders?,Scientific evidence suggests that various fact...,,https://www.who.int/news-room/q-a-detail/quest...,


# 3. Data Preprocessing

In [4]:
# Select long_answer from the data
long_answer = data.get('Long_Answer')

# Preprocess the long_answer
long_answer = long_answer.map(lambda x: ' '.join(text_preprocessing(x)))

# Vectorize the answers (one-hot), fit_transform() for getting the tf-idf result
sparse_vectorizer = CountVectorizer(strip_accents = 'unicode')
sparse_vectors = sparse_vectorizer.fit_transform(long_answer)
print(sparse_vectors.shape)

(226, 2753)


# 4. Build Topic Model using LDA

In [15]:
# Your super power to define number of topics
n_topics = 4

# Run LDA to generate topics/clusters
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=1000,
                                learning_method='online',
                                random_state=0)

lda.fit(sparse_vectors)

LatentDirichletAllocation(learning_method='online', max_iter=1000,
                          n_components=4, random_state=0)

# 5. Display the resulting topics/clusters of ASD FAQ's Long_Answer field

In [20]:
# Print the top-n key words
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

# Show the first n_top_words key words
n_top_words = 10
feature_names = sparse_vectorizer.get_feature_names()
print_top_words(lda, feature_names, n_top_words)

# test the first document
print("1st document(long FAQ answer) belongs to Topic",lda.transform(sparse_vectors[0]).argmax()+1)


Topic 0:
autism disorder asd child may spectrum people behavior social cause
Topic 1:
child autism may treatment intervention diagnosis early help parent therapy
Topic 2:
institute national autism tel information health disorder fax behavior md
Topic 3:
ability assessment behaviour concern specific characteristic academic ot memory diet

1st document(long FAQ answer) belongs to Topic 1
