In [1]:
import pandas as pd
from gensim import corpora, models
import re
from string import punctuation
from nltk.corpus import stopwords

In [2]:
#read in CSV file
survey_response = pd.read_csv('survey_response.csv', encoding='ISO-8859-1')

In [3]:
comments = survey_response['comment'].tolist()

In [4]:

def process_text(text):
    # Typecast to string if text is not already a string
    if not isinstance(text, str):
        text = str(text)
        
    # Load English stop words from NLTK
    stop_words = set(stopwords.words('english'))

    # Convert text to lowercase for case folding
    text = text.lower()
    
    # Remove punctuation using a regular expression
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text by splitting on whitespace
    tokens = text.split()
    
    # Remove stop words from tokens
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    return filtered_tokens

In [5]:
# puts each comment through the process_text function
processed_comments = [process_text(comment) for comment in comments]

In [7]:
# Create a dictionary from the processed comments
dictionary = corpora.Dictionary(processed_comments)

# Convert the dictionary to a Bag-of-Words corpus
corpus = [dictionary.doc2bow(comment) for comment in processed_comments]

# Apply the LDA model
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)

# Print the topics
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.056*"flight" + 0.030*"delays" + 0.022*"would" + 0.016*"communication"')
(1, '0.140*"friendly" + 0.138*"staff" + 0.093*"flight" + 0.064*"time"')
(2, '0.163*"service" + 0.136*"good" + 0.108*"great" + 0.096*"price"')
(3, '0.021*"us" + 0.017*"terminal" + 0.016*"get" + 0.015*"back"')
(4, '0.127*"everything" + 0.103*"went" + 0.073*"well" + 0.045*"trip"')
(5, '0.054*"flight" + 0.026*"delayed" + 0.022*"hour" + 0.022*"plane"')
(6, '0.055*"seats" + 0.051*"seat" + 0.016*"bag" + 0.016*"room"')
(7, '0.114*"flight" + 0.042*"attendants" + 0.037*"plane" + 0.036*"attendant"')
(8, '0.098*"sun" + 0.093*"country" + 0.038*"fly" + 0.035*"always"')
(9, '0.153*"time" + 0.082*"flight" + 0.065*"nice" + 0.050*"flights"')
