In [1]:
import pandas as pd
import nltk
import random
import numpy as np
import string
import re
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from gensim import corpora, models
import statsmodels.api as sm

In [2]:
#read in CSV file
survey_response = pd.read_csv('survey_response.csv', encoding='ISO-8859-1')

In [3]:
# Define the list of words to ignore
unwanted_words = {'plane', 'sun', 'country', 'fly', 'flying'}

In [4]:

def process_text(text):
    # Typecast to string if text is not already a string
    if not isinstance(text, str):
        text = str(text)
        
    # Load English stop words from NLTK
    stop_words = set(stopwords.words('english'))

    # Convert text to lowercase for case folding
    text = text.lower()
    
    # Remove punctuation using a regular expression
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text by splitting on whitespace
    tokens = text.split()
    
     # Remove stop words and unwanted words from tokens
    filtered_tokens = [token for token in tokens if token not in stop_words and token not in unwanted_words]
    
    return filtered_tokens

In [5]:

def run_lda_analysis(file_path, num_topics=5, passes=20, random_state=42):
    # Read and preprocess text
    texts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for comment in file:
            processed_comment = process_text(comment)
            if processed_comment:  # Check if comment is not empty after preprocessing
                texts.append(processed_comment)
    
    # Create a dictionary and corpus
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # LDA model
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)

    return lda_model, corpus, dictionary


In [6]:

# File paths
high_score_file_path = 'high_score_comments.txt'
low_score_file_path = 'low_score_comments.txt'

In [7]:

# Run LDA analysis separately
high_score_lda_model, high_score_corpus, high_score_dictionary = run_lda_analysis(high_score_file_path)
low_score_lda_model, low_score_corpus, low_score_dictionary = run_lda_analysis(low_score_file_path)

In [8]:

# Print topics for high score comments
print("High Score Comments Topics:")
for idx, topic in high_score_lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

# Print topics for low score comments
print("Low Score Comments Topics:")
for idx, topic in low_score_lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

High Score Comments Topics:
Topic: 0 
Words: 0.041*"flight" + 0.019*"msp" + 0.015*"love" + 0.013*"issues" + 0.012*"like" + 0.012*"2" + 0.012*"delayed" + 0.012*"terminal" + 0.012*"flights" + 0.011*"gate"

Topic: 1 
Words: 0.025*"flight" + 0.019*"seats" + 0.016*"seat" + 0.011*"would" + 0.009*"way" + 0.009*"one" + 0.007*"like" + 0.007*"us" + 0.007*"bag" + 0.007*"free"

Topic: 2 
Words: 0.154*"great" + 0.140*"service" + 0.130*"good" + 0.049*"flight" + 0.047*"experience" + 0.042*"price" + 0.027*"customer" + 0.026*"time" + 0.024*"prices" + 0.021*"excellent"

Topic: 3 
Words: 0.110*"time" + 0.107*"flight" + 0.101*"friendly" + 0.088*"staff" + 0.037*"flights" + 0.036*"crew" + 0.035*"nice" + 0.030*"attendants" + 0.027*"direct" + 0.026*"helpful"

Topic: 4 
Words: 0.058*"price" + 0.058*"everything" + 0.050*"time" + 0.048*"easy" + 0.045*"went" + 0.035*"flight" + 0.029*"well" + 0.028*"smooth" + 0.019*"boarding" + 0.017*"smoothly"

Low Score Comments Topics:
Topic: 0 
Words: 0.092*"flight" + 0.025*"a

In [9]:
# read in CSV file
manipulated_data_combine = pd.read_csv('manipulated_data_combine.csv', encoding='ISO-8859-1')


In [21]:
# Ensure 'Departure Date' is a datetime type
manipulated_data_combine['Departure Date'] = pd.to_datetime(manipulated_data_combine['Departure Date'])

# Function to determine the season based on the month
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 1  # Winter
    elif month in [3, 4, 5]:
        return 2  # Spring
    elif month in [6, 7, 8]:
        return 3  # Summer
    else:
        return 4  # Autumn

# Apply the function to create the 'season' column
manipulated_data_combine['Season'] = manipulated_data_combine['Departure Date'].apply(get_season)

In [12]:


# List of columns to include in the regression model
model_columns = ['Time in Air', 'Delayed', 'score']

# Check if there are other numeric columns that should be included in the model
# Add them to model_columns as needed

# Create X and y for the new model
X = manipulated_data_combine[model_columns].drop('score', axis=1)
y = manipulated_data_combine['score']

# Convert all columns in X to float64 to ensure numeric consistency
X = X.astype(float)

# Add a constant to X and fit the OLS model
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.039
Model:                            OLS   Adj. R-squared:                  0.039
Method:                 Least Squares   F-statistic:                     1397.
Date:                Tue, 19 Mar 2024   Prob (F-statistic):               0.00
Time:                        12:15:04   Log-Likelihood:            -1.1301e+05
No. Observations:               68358   AIC:                         2.260e+05
Df Residuals:                   68355   BIC:                         2.261e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           4.1163      0.019    220.618      

In [22]:
# Assuming you want to include these in your regression model along with the previous variables
model_columns = ['Time in Air', 'Delayed', 'Departure Delay Time', 'Arrival Delay Time', 'score', 'Season']

# Create X and y for the new model
X = manipulated_data_combine[model_columns].drop('score', axis=1)
y = manipulated_data_combine['score']

# Convert all columns in X to float64 to ensure numeric consistency
X = X.astype(float)

# Add a constant to X and fit the OLS model
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.082
Model:                            OLS   Adj. R-squared:                  0.082
Method:                 Least Squares   F-statistic:                     1218.
Date:                Tue, 19 Mar 2024   Prob (F-statistic):               0.00
Time:                        14:23:10   Log-Likelihood:            -1.1147e+05
No. Observations:               68358   AIC:                         2.229e+05
Df Residuals:                   68352   BIC:                         2.230e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    4.1262 

: 