In [None]:
import pandas as pd
import nltk
import random
import numpy as np
import string
import re
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from gensim import corpora, models

In [None]:
# Load the raw survey responses from a CSV located in the current working directory.
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
survey_response = pd.read_csv('survey_response.csv', encoding='ISO-8859-1')

In [None]:
# Tally how many responses carry each 'score' value, ordered by the score itself
score_counts = (
    survey_response['score']
    .value_counts()
    .sort_index()
)
print(score_counts)

In [None]:
# Parse 'ServiceStartDate' into pandas datetimes, replacing the column in place.
# The explicit format string expects month/day/four-digit-year text (e.g. 03/15/2023).
survey_response['ServiceStartDate'] = pd.to_datetime(survey_response['ServiceStartDate'], format='%m/%d/%Y')

In [None]:

def get_season(month):
    """
    Map a calendar month number to a season name.

    Parameters
    ----------
    month : int or float
        Month number, 1 (January) through 12 (December).

    Returns
    -------
    str or None
        'spring' for 3-5, 'summer' for 6-8, 'fall' for 9-11 and 'winter'
        for 12, 1 and 2. None for anything else, including NaN (which is
        what a missing date produces), so that missing or invalid months
        are not silently reported as 'winter'.
    """
    if 3 <= month <= 5:
        return 'spring'
    elif 6 <= month <= 8:
        return 'summer'
    elif 9 <= month <= 11:
        return 'fall'
    elif month in (12, 1, 2):
        return 'winter'

    # NaN fails every comparison above, and out-of-range values match nothing:
    # report "unknown" rather than guessing a season.
    return None

In [None]:

# Derive each response's 'season' from the month of its service start date
survey_response['season'] = survey_response['ServiceStartDate'].dt.month.map(get_season)

In [None]:
# Make every value in the 'comment' column a Python str.
# Missing comments are replaced with '' first: astype(str) alone would turn
# NaN into the literal text 'nan', which would then be counted as a word.
survey_response['comment'] = survey_response['comment'].fillna('').astype(str)

In [None]:
# Concatenate all comments into a single string.
# Comments are separated by a newline character so each one sits on its own line.
all_comments = '\n'.join(survey_response['comment'].astype(str))

In [None]:

# Write the combined string to a UTF-8 text file ('w' mode replaces any existing file).
# NOTE(review): the destination is a hard-coded absolute path on one user's machine;
# the cell only runs where that directory exists.
with open('C:/Users/james/OneDrive/Documents/Capstone/Capstone/all_comments.txt', 'w', encoding='utf-8') as file:
    file.write(all_comments)

In [None]:
def process_text(text):
    """
    Turn raw text into a list of lowercase, punctuation-free tokens with
    English stop words removed.

    Parameters
    ----------
    text : str or any
        Text to process. Non-string input is converted with str().

    Returns
    -------
    list of str
        Tokens in their original order. Empty tokens are never returned.

    Notes
    -----
    Stop words are matched while apostrophes are still present. The NLTK
    list stores contractions either whole ("don't") or as the pieces a
    tokenizer would produce ("don", "t", "ll", "ve"), so a contraction that
    has already been reduced to "dont" would never match and would leak
    into the output.

    Requires the NLTK 'stopwords' corpus to be installed.
    """
    # Typecast to string if text is not already a string
    if not isinstance(text, str):
        text = str(text)

    # Load English stop words from NLTK
    stop_words = set(stopwords.words('english'))

    # Convert text to lowercase for case folding
    text = text.lower()

    # Remove punctuation, but keep apostrophes until the stop-word check is done
    text = re.sub(r"[^\w\s']", '', text)

    filtered_tokens = []

    # Tokenize the text by splitting on whitespace
    for raw_token in text.split():
        # Drop quote-like apostrophes wrapped around the token
        candidate = raw_token.strip("'")
        pieces = [piece for piece in candidate.split("'") if piece]

        # A token is a stop word if it is listed as-is ("it's") or if every
        # apostrophe-separated piece is listed ("i'm" -> "i", "m")
        if candidate in stop_words or (pieces and all(piece in stop_words for piece in pieces)):
            continue

        # Now strip the apostrophes so the token is fully punctuation-free
        token = candidate.replace("'", '')

        if token and token not in stop_words:
            filtered_tokens.append(token)

    return filtered_tokens

In [None]:
# Run the concatenated text of all comments through process_text, producing one
# flat list of tokens (lowercased, punctuation removed, English stop words dropped)
processed_all_comments= process_text(all_comments)

In [None]:
# Summary-statistics helper: builds a dictionary of descriptive stats for a text
def get_patterns(text):
    """
    Compute descriptive statistics over the elements of `text`.

    No cleaning happens here: `text` is measured exactly as given, element
    by element (the notebook passes in lists of tokens).

    Returns a dictionary with:
        tokens            - number of elements
        unique_tokens     - number of distinct elements
        avg_token_length  - mean element length (0 when `text` is empty)
        lexical_diversity - distinct / total (0 when `text` is empty)
        top_20            - up to 20 (element, count) pairs, most common first
    """
    token_count = len(text)
    distinct_count = len(set(text))

    # Guard the two ratios against division by zero on empty input
    if token_count > 0:
        mean_length = sum(map(len, text)) / token_count
        diversity = distinct_count / token_count
    else:
        mean_length = 0
        diversity = 0

    return {
        'tokens': token_count,
        'unique_tokens': distinct_count,
        'avg_token_length': mean_length,
        'lexical_diversity': diversity,
        'top_20': Counter(text).most_common(20),
    }


In [None]:
# Summarize the tokens from all comments: token count, unique-token count,
# average token length, lexical diversity and the 20 most common tokens.
# Being the last expression in the cell, the resulting dictionary is shown as output.
get_patterns(processed_all_comments)

In [None]:
# Select the comments of responses scored 4 or 5 and join them, one per line
high_score_comments = '\n'.join(
    survey_response.loc[survey_response['score'].isin([4, 5]), 'comment'].astype(str)
)

# Save the high score comments as a UTF-8 text file
# NOTE(review): hard-coded absolute path; only works where this directory exists
with open('C:/Users/james/OneDrive/Documents/Capstone/Capstone/high_score_comments.txt', 'w', encoding='utf-8') as file:
    file.write(high_score_comments)

In [None]:

# Select the comments of responses scored 1, 2 or 3 and join them, one per line
low_score_comments = '\n'.join(
    survey_response.loc[survey_response['score'].isin([1, 2, 3]), 'comment'].astype(str)
)

# Save the low score comments as a UTF-8 text file
# NOTE(review): hard-coded absolute path; only works where this directory exists
with open('C:/Users/james/OneDrive/Documents/Capstone/Capstone/low_score_comments.txt', 'w', encoding='utf-8') as file:
    file.write(low_score_comments)


In [None]:
# Run the concatenated high score comments (scores 4 and 5) through process_text,
# producing one flat list of cleaned tokens for that group
processed_high_score_comments = process_text(high_score_comments)

In [None]:
# Run the concatenated low score comments (scores 1, 2 and 3) through process_text,
# producing one flat list of cleaned tokens for that group
processed_low_score_comments = process_text(low_score_comments)

In [None]:
# Summary statistics for the high score tokens (counts, average token length,
# lexical diversity, 20 most common tokens); displayed as the cell's output
get_patterns(processed_high_score_comments)

In [None]:
# Summary statistics for the low score tokens (counts, average token length,
# lexical diversity, 20 most common tokens); displayed as the cell's output
get_patterns(processed_low_score_comments)

In [None]:
# Main function to compare the relative use of words across two corpora
def group_compare(corpus_1, corpus_2, num_words=10, ratio_cutoff=5):
    """
    Contrast how often words are used in two tokenized corpora.

    For every word that occurs at least `ratio_cutoff` times in BOTH corpora,
    its share of each corpus (count / total tokens) is computed and the two
    shares are divided by one another in both directions.

    Parameters
    ----------
    corpus_1, corpus_2 : sequences of tokens
    num_words : int
        How many of the highest-ratio words to keep per direction.
    ratio_cutoff : int
        Minimum number of occurrences a word needs in each corpus.

    Returns
    -------
    dict
        "one" / "two": get_patterns() summaries of each corpus.
        "one_vs_two": {word: share in corpus 1 / share in corpus 2}, highest first.
        "two_vs_one": {word: share in corpus 2 / share in corpus 1}, highest first.
    """
    stats_one = get_patterns(corpus_1)
    stats_two = get_patterns(corpus_2)

    counts_one = Counter(corpus_1)
    counts_two = Counter(corpus_2)

    ratios_one_vs_two = {}
    ratios_two_vs_one = {}

    for word, count_one in counts_one.items():
        # Only words present in both corpora can be compared
        if word not in counts_two:
            continue

        count_two = counts_two[word]

        # Skip words that are too rare in either corpus
        if not (count_one >= ratio_cutoff and count_two >= ratio_cutoff):
            continue

        share_one = count_one / stats_one["tokens"]
        share_two = count_two / stats_two["tokens"]
        ratios_one_vs_two[word] = share_one / share_two
        ratios_two_vs_one[word] = share_two / share_one

    def _highest(ratio_map):
        # Rank by ratio, largest first, and keep the leading num_words entries
        ranked = sorted(ratio_map.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ranked[:num_words])

    return {
        "one": stats_one,
        "two": stats_two,
        "one_vs_two": _highest(ratios_one_vs_two),
        "two_vs_one": _highest(ratios_two_vs_one),
    }

In [None]:
# Compare relative word use between the high score tokens (corpus one) and the
# low score tokens (corpus two), using the default num_words and ratio_cutoff.
# The resulting dictionary is displayed as the cell's output.
group_compare(processed_high_score_comments, processed_low_score_comments)