In [None]:
import pandas as pd
import nltk
import random
import numpy as np
import string
import re
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from gensim import corpora, models
import statsmodels.api as sm
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# Load the survey responses; the CSV is decoded as ISO-8859-1
survey_response = pd.read_csv(
    'survey_response.csv',
    encoding='ISO-8859-1',
)

In [None]:
# Set of extra tokens that process_text removes on top of the NLTK stop words
unwanted_words = {
    'plane',
    'sun',
    'country',
    'fly',
    'flying',
}

In [None]:
# Text preprocessing helpers

# Punctuation = any character that is neither a word character nor whitespace
_PUNCTUATION_RE = re.compile(r'[^\w\s]')

# Stop-word sets keyed by language, filled lazily on first use so the NLTK
# corpus is read once instead of on every process_text call
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop words for `language`, loading the corpus only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def process_text(text):
    """Normalize a piece of text into a list of filtered tokens.

    Steps: lowercase, strip punctuation, split on whitespace, then drop
    English stop words (NLTK) and every token in the module-level
    `unwanted_words` set.

    Parameters
    ----------
    text : str or any
        The text to process. Non-string values are converted with `str()`.
        Missing scalars (None, NaN, NaT, pd.NA) are treated as empty text so
        they do not turn into spurious tokens such as 'nan' or 'none'.

    Returns
    -------
    list of str
        The remaining tokens, in their original order. Empty when the input
        is missing or nothing survives filtering.
    """
    if not isinstance(text, str):
        # A missing value carries no text; str() would turn it into a fake word
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return []
        text = str(text)

    stop_words = _get_stop_words('english')

    # Case-fold first so the stop-word comparison is case-insensitive
    text = text.lower()

    # Remove punctuation, then tokenize on runs of whitespace
    text = _PUNCTUATION_RE.sub('', text)
    tokens = text.split()

    # unwanted_words is looked up at call time, so later edits to it take effect
    return [
        token
        for token in tokens
        if token not in stop_words and token not in unwanted_words
    ]

In [None]:
## LDA Modeling

# Define the function to run LDA analysis

def run_lda_analysis(file_path, num_topics=5, passes=20, random_state=42):
    """Fit an LDA topic model on a text file, treating each line as one document.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 text file; every line is preprocessed with
        `process_text`, and lines that end up with no tokens are skipped.
    num_topics : int, default 5
        Number of topics requested from the model.
    passes : int, default 20
        Number of training passes over the corpus.
    random_state : int, default 42
        Seed forwarded to gensim so repeated runs give the same topics.

    Returns
    -------
    tuple
        `(lda_model, corpus, dictionary)`: the fitted gensim LdaModel, the
        bag-of-words corpus it was trained on, and the token dictionary.

    Raises
    ------
    ValueError
        If no line of the file yields any token after preprocessing.
    """
    # Read and preprocess the file line by line
    texts = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for comment in file:
            tokens = process_text(comment)
            if tokens:  # skip lines that are empty after preprocessing
                texts.append(tokens)

    # Fail early with a clear message instead of an error from inside gensim
    if not texts:
        raise ValueError(
            f"No usable comments found in {file_path!r} after preprocessing"
        )

    # Map tokens to ids and convert each document to bag-of-words form
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # random_state must be passed through, otherwise the seed argument is ignored
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    return lda_model, corpus, dictionary


In [None]:

# Text files passed to run_lda_analysis (read one comment per line)
high_score_file_path, low_score_file_path = (
    'high_score_comments.txt',
    'low_score_comments.txt',
)

In [None]:

# Fit one LDA model per comment file; each call returns (model, corpus, dictionary)
(
    high_score_lda_model,
    high_score_corpus,
    high_score_dictionary,
) = run_lda_analysis(high_score_file_path)
(
    low_score_lda_model,
    low_score_corpus,
    low_score_dictionary,
) = run_lda_analysis(low_score_file_path)

In [None]:

# Show the topics of the high-score model (num_topics=-1 requests all of them)
print("High Score Comments Topics:")
for idx, topic in high_score_lda_model.print_topics(num_topics=-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

In [None]:

# Show the topics of the low-score model (num_topics=-1 requests all of them)
print("Low Score Comments Topics:")
for idx, topic in low_score_lda_model.print_topics(num_topics=-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

In [None]:
## Regression Modeling

# Load the manipulated dataset. The manipulation fixed date issues related to the
# flight departure date and the date a customer began traveling.
manipulated_data_combine = pd.read_csv(
    'manipulated_data_combine.csv',
    encoding='ISO-8859-1',
)


In [None]:
# Parse 'Departure Date' into datetimes so date attributes such as .month are available
manipulated_data_combine['Departure Date'] = pd.to_datetime(
    manipulated_data_combine['Departure Date']
)

In [None]:

# Season codes by calendar month: 1 = Winter, 2 = Spring, 3 = Summer.
# Any month not listed here falls back to 4 (Autumn) in get_season.
_SEASON_BY_MONTH = {
    12: 1, 1: 1, 2: 1,
    3: 2, 4: 2, 5: 2,
    6: 3, 7: 3, 8: 3,
}


def get_season(date):
    """Return the numeric season code for a date.

    Parameters
    ----------
    date : object with a `.month` attribute
        For example a `datetime.date` or `pandas.Timestamp`.

    Returns
    -------
    int
        1 for Winter (Dec-Feb), 2 for Spring (Mar-May), 3 for Summer (Jun-Aug),
        and 4 (Autumn) for every other month value.
    """
    return _SEASON_BY_MONTH.get(date.month, 4)

In [None]:

# Derive the numeric 'Season' column by applying get_season to each departure date
manipulated_data_combine['Season'] = (
    manipulated_data_combine['Departure Date'].apply(get_season)
)

In [None]:
# Regression Model 1: OLS of 'score' on 'Time in Air' and 'Delayed'

# Columns used by this model; 'score' is the response and is removed from the predictors.
# Other numeric columns can be appended here if they should enter the model.
model_columns = ['Time in Air', 'Delayed', 'score']

# Response variable
y = manipulated_data_combine['score']

# Predictors: every model column except the response, cast to float for numeric
# consistency, plus the constant (intercept) column
X = sm.add_constant(
    manipulated_data_combine[model_columns].drop(columns='score').astype(float)
)

# Fit the OLS model and show its summary table
model = sm.OLS(y, X).fit()
print(model.summary())

In [None]:
# Regression Model 2: adds 'Departure Delay Time', 'Arrival Delay Time' and 'Season'
# to the predictors used in Model 1

# Columns used by this model; 'score' is the response and is removed from the predictors
model_columns = ['Time in Air', 'Delayed', 'Departure Delay Time', 'Arrival Delay Time', 'score', 'Season']

# Create X and y for the new model
X = manipulated_data_combine[model_columns].drop('score', axis=1)
y = manipulated_data_combine['score']

# 'Season' holds category codes (1 = Winter ... 4 = Autumn), not a quantity. Entered as
# a single numeric column it would force one linear effect across the codes, so it is
# dummy-encoded instead. drop_first=True leaves the lowest code present in the data as
# the baseline level and avoids perfect collinearity with the constant.
X = pd.get_dummies(X, columns=['Season'], prefix='Season', drop_first=True)

# Convert all columns in X (including the boolean dummies) to float64
X = X.astype(float)

# Add a constant to X and fit the OLS model
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())