In [4]:
import pandas as pd
import nltk
import random
import numpy as np
import string
import re
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from gensim import corpora, models

In [5]:
#read in CSV file
survey_response = pd.read_csv('survey_response.csv', encoding='ISO-8859-1')

In [11]:
# Define the list of words to ignore
unwanted_words = {'plane', 'sun', 'country', 'fly', 'flying'}

In [12]:

def process_text(text):
    # Typecast to string if text is not already a string
    if not isinstance(text, str):
        text = str(text)
        
    # Load English stop words from NLTK
    stop_words = set(stopwords.words('english'))

    # Convert text to lowercase for case folding
    text = text.lower()
    
    # Remove punctuation using a regular expression
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text by splitting on whitespace
    tokens = text.split()
    
     # Remove stop words and unwanted words from tokens
    filtered_tokens = [token for token in tokens if token not in stop_words and token not in unwanted_words]
    
    return filtered_tokens

In [14]:

def run_lda_analysis(file_path, num_topics=5, passes=20):
    # Read and preprocess text
    texts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for comment in file:
            processed_comment = process_text(comment)
            if processed_comment:  # Check if comment is not empty after preprocessing
                texts.append(processed_comment)
    
    # Create a dictionary and corpus
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # LDA model
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)

    return lda_model, corpus, dictionary


In [15]:

# File paths
high_score_file_path = 'high_score_comments.txt'
low_score_file_path = 'low_score_comments.txt'

In [16]:

# Run LDA analysis separately
high_score_lda_model, high_score_corpus, high_score_dictionary = run_lda_analysis(high_score_file_path)
low_score_lda_model, low_score_corpus, low_score_dictionary = run_lda_analysis(low_score_file_path)

In [17]:

# Print topics for high score comments
print("High Score Comments Topics:")
for idx, topic in high_score_lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

# Print topics for low score comments
print("Low Score Comments Topics:")
for idx, topic in low_score_lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

High Score Comments Topics:
Topic: 0 
Words: 0.021*"seats" + 0.019*"like" + 0.018*"seat" + 0.015*"would" + 0.011*"terminal" + 0.009*"2" + 0.008*"bag" + 0.008*"free" + 0.008*"msp" + 0.008*"room"

Topic: 1 
Words: 0.069*"flight" + 0.013*"us" + 0.013*"gate" + 0.013*"early" + 0.012*"time" + 0.011*"delayed" + 0.009*"delay" + 0.009*"get" + 0.009*"got" + 0.009*"arrived"

Topic: 2 
Words: 0.118*"flight" + 0.112*"time" + 0.089*"friendly" + 0.078*"staff" + 0.034*"great" + 0.031*"easy" + 0.031*"nice" + 0.029*"smooth" + 0.029*"crew" + 0.028*"attendants"

Topic: 3 
Words: 0.135*"service" + 0.125*"good" + 0.104*"great" + 0.046*"experience" + 0.038*"always" + 0.034*"time" + 0.026*"customer" + 0.023*"prices" + 0.020*"flights" + 0.019*"value"

Topic: 4 
Words: 0.128*"price" + 0.069*"everything" + 0.059*"flight" + 0.054*"went" + 0.045*"direct" + 0.029*"flights" + 0.025*"well" + 0.025*"reasonable" + 0.022*"cost" + 0.020*"smoothly"

Low Score Comments Topics:
Topic: 0 
Words: 0.097*"flight" + 0.050*"delay

In [None]:
# regression modeling