In [1]:
import pandas as pd
import nltk
import random
import numpy as np
import string
import re
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords

In [2]:
survey_response = pd.read_csv('survey_response.csv', encoding='ISO-8859-1')

In [6]:
# Count of unique values in 'ServiceStartCity'
start_city_counts = survey_response['ServiceStartCity'].value_counts()

# Count of unique values in 'ServiceEndCity'
end_city_counts = survey_response['ServiceEndCity'].value_counts()

# If you want to print or view the counts
print("Start City Counts:")
print(start_city_counts)
print("\nEnd City Counts:")
print(end_city_counts)

Start City Counts:
ServiceStartCity
MSP    18812
RSW     5331
LAS     4847
PHX     4703
MCO     3979
       ...  
SLC        1
CHS        1
BDL        1
BTV        1
GPT        1
Name: count, Length: 97, dtype: int64

End City Counts:
ServiceEndCity
MSP    44844
RSW     2570
PHX     2023
MCO     1472
LAX     1256
       ...  
CHS        1
BDL        1
SLC        1
SDF        1
BKG        1
Name: count, Length: 88, dtype: int64


In [8]:
# Counting each value in the 'score' column
score_counts = survey_response['score'].value_counts()

# Sorting the counts by the score values if needed (e.g., for numeric scores)
score_counts_sorted = score_counts.sort_index()

print(score_counts_sorted)

score
1     6101
2     4314
3     7364
4    15578
5    35353
Name: count, dtype: int64


In [4]:
# Convert 'ServiceStartDate' to datetime format
survey_response['ServiceStartDate'] = pd.to_datetime(survey_response['ServiceStartDate'], format='%m/%d/%Y')

# Define a function to determine the season based on the month
def get_season(month):
    if 3 <= month <= 5:
        return 'spring'
    elif 6 <= month <= 8:
        return 'summer'
    elif 9 <= month <= 11:
        return 'fall'
    else: # months 12, 1, 2
        return 'winter'

# Apply the function to each row to create the 'season' column
survey_response['season'] = survey_response['ServiceStartDate'].dt.month.apply(get_season)

In [5]:
#Change the comment column to string data type from object data type
survey_response['comment'] = survey_response['comment'].astype(str)

In [6]:
# Grouping by 'City Pair' and then counting each 'score' value
score_counts_per_city_pair = survey_response.groupby(['City Pair', 'score']).size().unstack(fill_value=0)

# Optionally, if you want to focus on low scores, you can filter or manipulate the resulting DataFrame
# For example, if you consider scores <= 2 as low, you can do:
low_scores_per_city_pair = score_counts_per_city_pair.loc[:, score_counts_per_city_pair.columns <= 2]

# To find city pairs with the most low scores (summing across all low scores)
most_low_scores = low_scores_per_city_pair.sum(axis=1).sort_values(ascending=False)

print(most_low_scores)

City Pair
LAS-MSP    620
MCO-MSP    594
PHX-MSP    566
RSW-MSP    523
LAX-MSP    336
          ... 
MSP-CHS      0
CMH-MSP      0
MSP-PHL      0
MSP-BZN      0
JAC-MSP      0
Length: 239, dtype: int64


In [7]:
# Focus on scores of 4 and 5
high_scores_per_city_pair = score_counts_per_city_pair.loc[:, score_counts_per_city_pair.columns.isin([4, 5])]

# To find city pairs with the most high scores (summing across scores of 4 and 5)
most_high_scores = high_scores_per_city_pair.sum(axis=1).sort_values(ascending=False)

print(most_high_scores)

City Pair
LAS-MSP    3285
RSW-MSP    3099
PHX-MSP    2939
MCO-MSP    2917
MSP-RSW    1609
           ... 
MAF-DFW       0
SAT-MSP       0
MSP-AUS       0
BTV-MSP       0
IAH-DFW       0
Length: 239, dtype: int64


In [21]:
# Concatenate all comments into a single string
# You can separate each comment with a newline character for better readability
all_comments = '\n'.join(survey_response['comment'].astype(str))

# Write the combined string to a text file
with open('C:/Users/james/OneDrive/Documents/Capstone/Capstone/all_comments.txt', 'w', encoding='utf-8') as file:
    file.write(all_comments)

In [20]:
def process_text(text):
    # Load English stop words from NLTK
    stop_words = set(stopwords.words('english'))

    # Convert text to lowercase for case folding
    text = text.lower()
    
    # Remove punctuation using a regular expression
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize the text by splitting on whitespace
    tokens = text.split()
    
    # Remove stop words from tokens
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    return filtered_tokens

In [86]:
processed_all_comments= process_text(all_comments)

In [27]:
#Create a Function that gives summary stats dictionary for a given text
def get_patterns(text)  :
    """
        This function takes text as an input and returns a dictionary of statistics,
        after cleaning the text. 
    
    """
    
    
    # Calculate your statistics here
    total_tokens = len(text)
    unique_tokens = set(text)
    len_unique_tokens = len(unique_tokens)
    avg_token_len = sum([len(token) for token in text]) / total_tokens if total_tokens > 0 else 0
    lex_diversity = len_unique_tokens / total_tokens if total_tokens > 0 else 0
    top_10 = Counter(text).most_common(20)

    
    
    # Now we'll fill out the dictionary. 
    results = {'tokens':total_tokens,
               'unique_tokens':len_unique_tokens,
               'avg_token_length':avg_token_len,
               'lexical_diversity':lex_diversity,
               'top_10':top_10}

    return(results)


In [87]:
get_patterns(processed_all_comments)

{'tokens': 662581,
 'unique_tokens': 19420,
 'avg_token_length': 5.806447815436905,
 'lexical_diversity': 0.029309624030873206,
 'top_10': [('flight', 29069),
  ('time', 14267),
  ('service', 9867),
  ('great', 9003),
  ('good', 8128),
  ('staff', 7347),
  ('friendly', 7271),
  ('flights', 7049),
  ('plane', 6230),
  ('sun', 6027),
  ('country', 5733),
  ('price', 5456),
  ('delayed', 4364),
  ('experience', 3824),
  ('seats', 3804),
  ('get', 3725),
  ('seat', 3561),
  ('attendants', 3524),
  ('gate', 3410),
  ('hour', 3244)]}

In [27]:
# Filter the DataFrame for scores 4 and 5, and concatenate the comments
high_score_comments = '\n'.join(survey_response[survey_response['score'].isin([4, 5])]['comment'].astype(str))

# Write the high score comments to a text file
with open('C:/Users/james/OneDrive/Documents/Capstone/Capstone/high_score_comments.txt', 'w', encoding='utf-8') as file:
    file.write(high_score_comments)

# Filter the DataFrame for scores 1 and 2, and concatenate the comments
low_score_comments = '\n'.join(survey_response[survey_response['score'].isin([1, 2])]['comment'].astype(str))

# Write the low score comments to a text file
with open('C:/Users/james/OneDrive/Documents/Capstone/Capstone/low_score_comments.txt', 'w', encoding='utf-8') as file:
    file.write(low_score_comments)

In [88]:
processed_high_score_comments = process_text(high_score_comments)

In [90]:
processed_low_score_comments = process_text(low_score_comments)

In [94]:
# Check the type of 'processed_high_score_comments'
data_type = type(processed_high_score_comments)

print(data_type)

<class 'list'>


In [100]:
get_patterns(processed_high_score_comments)

{'tokens': 333201,
 'unique_tokens': 11989,
 'avg_token_length': 5.940327309942047,
 'lexical_diversity': 0.03598128456997428,
 'top_10': [('flight', 16716),
  ('time', 11533),
  ('great', 8518),
  ('service', 8027),
  ('good', 7398),
  ('friendly', 6917),
  ('staff', 6261),
  ('flights', 5322),
  ('price', 4968),
  ('sun', 3347),
  ('country', 3175),
  ('everything', 2862),
  ('experience', 2740),
  ('nice', 2646),
  ('easy', 2469),
  ('plane', 2416),
  ('attendants', 2408),
  ('crew', 2361),
  ('went', 2217),
  ('always', 2179)]}

In [101]:
get_patterns(processed_low_score_comments)

{'tokens': 212978,
 'unique_tokens': 11518,
 'avg_token_length': 5.668928246109926,
 'lexical_diversity': 0.05408070317121956,
 'top_10': [('flight', 7976),
  ('plane', 2497),
  ('delayed', 2442),
  ('hours', 1881),
  ('sun', 1848),
  ('hour', 1831),
  ('country', 1762),
  ('get', 1712),
  ('time', 1693),
  ('gate', 1624),
  ('us', 1449),
  ('service', 1276),
  ('one', 1276),
  ('seat', 1252),
  ('luggage', 1147),
  ('delay', 1145),
  ('would', 1131),
  ('seats', 1102),
  ('bag', 1076),
  ('airport', 1051)]}

In [102]:
#Main function to compare two corpora
def group_compare(corpus_1, corpus_2, num_words=10, ratio_cutoff=5):
    sum_stats_corp_1 = get_patterns(corpus_1)
    sum_stats_corp_2 = get_patterns(corpus_2)

    # Extract word frequencies for both corpora
    freq_1 = Counter(corpus_1)
    freq_2 = Counter(corpus_2)

    # Calculate ratios for words that appear at least ratio_cutoff times in both corpora
    ratios_one_vs_two = {}
    ratios_two_vs_one = {}

    for word, count in freq_1.items():
        if word in freq_2 and count >= ratio_cutoff and freq_2[word] >= ratio_cutoff:
            p_1 = count / sum_stats_corp_1["tokens"]
            p_2 = freq_2[word] / sum_stats_corp_2["tokens"]
            ratios_one_vs_two[word] = p_1 / p_2
            ratios_two_vs_one[word] = p_2 / p_1

    # Sort and get top num_words for both ratios
    top_ratios_one_vs_two = sorted(ratios_one_vs_two.items(), key=lambda x: x[1], reverse=True)[:num_words]
    top_ratios_two_vs_one = sorted(ratios_two_vs_one.items(), key=lambda x: x[1], reverse=True)[:num_words]

    results = {
        "one": sum_stats_corp_1,
        "two": sum_stats_corp_2,
        "one_vs_two": dict(top_ratios_one_vs_two),
        "two_vs_one": dict(top_ratios_two_vs_one)
    }

    return results

In [103]:
group_compare(processed_high_score_comments, processed_low_score_comments)

{'one': {'tokens': 333201,
  'unique_tokens': 11989,
  'avg_token_length': 5.940327309942047,
  'lexical_diversity': 0.03598128456997428,
  'top_10': [('flight', 16716),
   ('time', 11533),
   ('great', 8518),
   ('service', 8027),
   ('good', 7398),
   ('friendly', 6917),
   ('staff', 6261),
   ('flights', 5322),
   ('price', 4968),
   ('sun', 3347),
   ('country', 3175),
   ('everything', 2862),
   ('experience', 2740),
   ('nice', 2646),
   ('easy', 2469),
   ('plane', 2416),
   ('attendants', 2408),
   ('crew', 2361),
   ('went', 2217),
   ('always', 2179)]},
 'two': {'tokens': 212978,
  'unique_tokens': 11518,
  'avg_token_length': 5.668928246109926,
  'lexical_diversity': 0.05408070317121956,
  'top_10': [('flight', 7976),
   ('plane', 2497),
   ('delayed', 2442),
   ('hours', 1881),
   ('sun', 1848),
   ('hour', 1831),
   ('country', 1762),
   ('get', 1712),
   ('time', 1693),
   ('gate', 1624),
   ('us', 1449),
   ('service', 1276),
   ('one', 1276),
   ('seat', 1252),
   ('lug

: 

In [24]:
# Assuming 'score' is numerical and you want to sum or count scores
# Adjust this part if you're looking for highest/lowest based on different criteria

# Aggregate scores by 'City Pair'
agg_scores = survey_response.groupby('City Pair')['score'].count()

# Find top 5 city pairs based on aggregated scores - both highest and lowest
top_city_pairs = agg_scores.nlargest(5).index.tolist()
bottom_city_pairs = agg_scores.nsmallest(5).index.tolist()

In [28]:
results = {}

# Combine top and bottom city pairs for simplicity
city_pairs = set(top_city_pairs + bottom_city_pairs)

for city_pair in city_pairs:
    # Filter comments for the current city pair
    comments = survey_response[survey_response['City Pair'] == city_pair]['comment']
    
    # Preprocess each comment, then analyze
    city_pair_results = comments.apply(lambda x: get_patterns(process_text(x)))
    results[city_pair] = city_pair_results.tolist()

In [29]:
results

{'GPT-MSP': [{'tokens': 7,
   'unique_tokens': 7,
   'avg_token_length': 5.714285714285714,
   'lexical_diversity': 1.0,
   'top_10': [('gate', 1),
    ('agents', 1),
    ('flight', 1),
    ('attendants', 1),
    ('kind', 1),
    ('went', 1),
    ('beyond', 1)]}],
 'BDL-MSP': [{'tokens': 1,
   'unique_tokens': 1,
   'avg_token_length': 4.0,
   'lexical_diversity': 1.0,
   'top_10': [('time', 1)]}],
 'CHS-MSP': [{'tokens': 19,
   'unique_tokens': 18,
   'avg_token_length': 7.0,
   'lexical_diversity': 0.9473684210526315,
   'top_10': [('flight', 2),
    ('alaska', 1),
    ('airlines', 1),
    ('cancelled', 1),
    ('last', 1),
    ('minute', 1),
    ('suncountry', 1),
    ('leaving', 1),
    ('needed', 1),
    ('affordable', 1),
    ('price', 1),
    ('appreciate', 1),
    ('youll', 1),
    ('know', 1),
    ('traveling', 1),
    ('see', 1),
    ('granddaughters', 1),
    ('thank', 1)]}],
 'RSW-MSP': [{'tokens': 2,
   'unique_tokens': 2,
   'avg_token_length': 5.0,
   'lexical_diversity'