In [1]:
import pandas as pd
import os
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import ngrams
from collections import Counter

# Downloaded the stopwords through nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lindsaym\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lindsaym\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv('responses.csv', encoding='utf-8')

In [3]:
# Select emails where source is equal to KP and the survey has been completed.
kp_responses = df.loc[(df['Response Type'] == 'completed') & (df['Where did you receive this survey from?'] == 'Knit Picks')].copy()
# List size
kp_responses.shape

(2673, 210)

# KP Processing

In [4]:
# Drop BY question columns
kp_columns = kp_responses.columns[kp_responses.isnull().all()]
# Other unncessary columns.
kp_other_columns = ["Response Type", "Start Date (UTC)", "Submit Date (UTC)", "Network ID"]
# Combine both lists.
columns_to_drop = list(set(kp_columns.to_list() + kp_other_columns))
# Drop the columns from kp_responses
kp_responses.drop(columns=columns_to_drop, inplace=True)

In [5]:
kp_responses.shape

(2673, 194)

In [6]:
# Concatenate 'How do you most often purchase *yarn*?' and 'Other'.
kp_responses['How do you most often purchase *yarn*?'] = kp_responses.apply(
    lambda row: row['Other'] if pd.notnull(row['Other']) and row['Other'].strip() != '' and (pd.isnull(row['How do you most often purchase *yarn*?']) or row['How do you most often purchase *yarn*?'].strip() == '') else row['How do you most often purchase *yarn*?'],
    axis=1
)
# Drop the 'Other' column
kp_responses.drop(columns=['Other'], inplace=True)

In [7]:
# Concatenate 'Where do you most often purchase *yarn*?' and 'Other.1'.
kp_responses['Where do you most often purchase* yarn*?'] = kp_responses.apply(
    lambda row: row['Other.1'] if pd.notnull(row['Other.1']) and row['Other.1'].strip() != '' and (pd.isnull(row['Where do you most often purchase* yarn*?']) or row['Where do you most often purchase *yarn*?'].strip() == '') else row['Where do you most often purchase* yarn*?'],
    axis=1
)
# Drop the 'Other.1' column
kp_responses.drop(columns=['Other.1'], inplace=True)

In [8]:
# Concatenate 'How do you most often purchase yarn *tools & accessories*?' and 'Other.2'.
kp_responses['How do you most often purchase yarn *tools & accessories*?'] = kp_responses.apply(
    lambda row: row['Other.2'] if pd.notnull(row['Other.2']) and row['Other.2'].strip() != '' and (pd.isnull(row['How do you most often purchase yarn *tools & accessories*?']) or row['How do you most often purchase yarn *tools & accessories*?'].strip() == '') else row['How do you most often purchase yarn *tools & accessories*?'],
    axis=1
)
# Drop the 'Other.1' column
kp_responses.drop(columns=['Other.2'], inplace=True)

In [9]:
# Concatenate 'Where do you most often purchase *tools & accessories*?' and 'Other.3'.
kp_responses['Where do you most often purchase *tools & accessories*?'] = kp_responses.apply(
    lambda row: row['Other.3'] if pd.notnull(row['Other.3']) and row['Other.3'].strip() != '' and (pd.isnull(row['Where do you most often purchase *tools & accessories*?']) or row['Where do you most often purchase *tools & accessories*?'].strip() == '') else row['Where do you most often purchase *tools & accessories*?'],
    axis=1
)
# Drop the 'Other.1' column
kp_responses.drop(columns=['Other.3'], inplace=True)

In [10]:
# Concatenate 'How do you most often purchase *patterns*?' and 'Other.4'.
kp_responses['How do you most often purchase *patterns*?'] = kp_responses.apply(
    lambda row: row['Other.4'] if pd.notnull(row['Other.4']) and row['Other.4'].strip() != '' and (pd.isnull(row['How do you most often purchase *patterns*?']) or row['How do you most often purchase *patterns*?'].strip() == '') else row['How do you most often purchase *patterns*?'],
    axis=1
)
# Drop the 'Other.1' column
kp_responses.drop(columns=['Other.4'], inplace=True)

In [11]:
# Concatenate 'Where do you most often purchase *patterns*?' and 'Other.5'.
kp_responses['Where do you most often purchase *patterns*?'] = kp_responses.apply(
    lambda row: row['Other.5'] if pd.notnull(row['Other.5']) and row['Other.5'].strip() != '' and (pd.isnull(row['Where do you most often purchase *patterns*?']) or row['Where do you most often purchase *patterns*?'].strip() == '') else row['Where do you most often purchase *patterns*?'],
    axis=1
)
# Drop the 'Other.1' column
kp_responses.drop(columns=['Other.5'], inplace=True)

In [12]:
# Concatenate 'How did you first hear about our brand?' and 'Other.6'.
kp_responses['How did you first hear about our brand?'] = kp_responses.apply(
    lambda row: row['Other.6'] if pd.notnull(row['Other.6']) and row['Other.6'].strip() != '' and (pd.isnull(row['How did you first hear about our brand?']) or row['How did you first hear about our brand?'].strip() == '') else row['How did you first hear about our brand?'],
    axis=1
)
# Drop the 'Other.1' column
kp_responses.drop(columns=['Other.6'], inplace=True)

In [13]:
# Concatenate 'Which social media platform?' and 'Other.8'.
kp_responses['Which social media platform?'] = kp_responses.apply(
    lambda row: row['Other.8'] if pd.notnull(row['Other.8']) and row['Other.8'].strip() != '' and (pd.isnull(row['Which social media platform?']) or row['Which social media platform?'].strip() == '') else row['Which social media platform?'],
    axis=1
)
# Drop the 'Other.1' column
kp_responses.drop(columns=['Other.8'], inplace=True)

In [14]:
# Concatenate 'Which social media platform?.1' and 'Other.9'.
kp_responses['Which social media platform?.1'] = kp_responses.apply(
    lambda row: row['Other.9'] if pd.notnull(row['Other.9']) and row['Other.9'].strip() != '' and (pd.isnull(row['Which social media platform?.1']) or row['Which social media platform?.1'].strip() == '') else row['Which social media platform?.1'],
    axis=1
)
# Drop the 'Other.1' column
kp_responses.drop(columns=['Other.9'], inplace=True)

In [15]:
# Concatenate 'When shopping online, what encourages you to try *new* yarn products?' and 'Other.10'.
kp_responses['When shopping online, what encourages you to try *new* yarn products?'] = kp_responses.apply(
    lambda row: row['Other.10'] if pd.notnull(row['Other.10']) and row['Other.10'].strip() != '' and (pd.isnull(row['When shopping online, what encourages you to try *new* yarn products?']) or row['When shopping online, what encourages you to try *new* yarn products?'].strip() == '') else row['When shopping online, what encourages you to try *new* yarn products?'],
    axis=1
)
# Drop the 'Other.1' column
kp_responses.drop(columns=['Other.10'], inplace=True)

In [16]:
heard_concatenate = [
    "Amano", "Berroco", "Blue Sky Fibers", "Brown Sheep", "Caron", "Cascade Yarns", "Circulo",
    "Crochet.com", "Darn Good Yarn", "Fibre Company", "Freia Handpaints", "Hedgehog Fibres",
    "Hobbii", "Jimmy Beans Wool", "Katia", "Knit Picks", "Koigu", "Lana Grossa", "Lang", "Lily",
    "Lion Brand Yarn", "Lopo", "LoveCrafts", "MadelineTosh", "Malabrigo Yarn", "Manos del Uruguay",
    "Mary Maxim", "Noro", "Novita", "Paintbox", "Plymouth", "Premier Yarns", "Purl Soho", "Red Heart",
    "Regia", "Rowan", "Scheepjes", "Sirdar", "SpinCycle", "Trendsetter", "Universal Yarns",
    "WeAreKnitters", "Webs (Yarn.com)", "West Yorkshire Spinners", "Wool and the Gang", "WoolAddicts",
    "Yarnspirations"
]

# Concatenate values into one column, separating with commas and ignoring null/blank values
kp_responses['Which brands have you heard of?'] = kp_responses[heard_concatenate].apply(
    lambda row: ', '.join(filter(None, row.dropna())), axis=1
)

# Find the index for 'Amano' to insert the new column at this position
heard_index = kp_responses.columns.get_loc("Amano")

# Insert 'Which brands have you heard of?' column back at the position where 'Amano' was
kp_responses.insert(heard_index, 'Which brands have you heard of?', kp_responses.pop('Which brands have you heard of?'))

# Drop the original columns
kp_responses.drop(columns=heard_concatenate, inplace=True)

In [17]:
bought_concatenate = [
    "Amano.1", "Berroco.1", "Blue Sky Fibers.1", "Brown Sheep.1", "Caron.1", "Cascade Yarns.1", "Circulo.1",
    "Crochet.com.1", "Darn Good Yarn.1", "Fibre Company.1", "Freia Handpaints.1", "Hedgehog Fibres.1",
    "Hobbii.1", "Jimmy Beans Wool.1", "Katia.1", "Knit Picks.1", "Koigu.1", "Lana Grossa.1", "Lang.1", "Lily.1",
    "Lion Brand Yarn.1", "Lopo.1", "LoveCrafts.1", "MadelineTosh.1", "Malabrigo Yarn.1", "Manos del Uruguay.1",
    "Mary Maxim.1", "Noro.1", "Novita.1", "Paintbox.1", "Plymouth.1", "Premier Yarns.1", "Purl Soho.1", "Red Heart.1",
    "Regia.1", "Rowan.1", "Scheepjes.1", "Sirdar.1", "SpinCycle.1", "Trendsetter.1", "Universal Yarns.1",
    "WeAreKnitters.1", "Webs (Yarn.com).1", "West Yorkshire Spinners.1", "Wool and the Gang.1", "WoolAddicts.1",
    "Yarnspirations.1"
]

# Concatenate values into one column, separating with commas and ignoring null/blank values
kp_responses['Which brands have you purchased from?'] = kp_responses[bought_concatenate].apply(
    lambda row: ', '.join(filter(None, row.dropna())), axis=1
)

# Find the index for 'Amano' to insert the new column at this position
bought_index = kp_responses.columns.get_loc("Amano.1")

# Insert 'Which brands have you heard of?' column back at the position where 'Amano' was
kp_responses.insert(bought_index, 'Which brands have you purchased from?', kp_responses.pop('Which brands have you purchased from?'))


# Drop the original columns
kp_responses.drop(columns=bought_concatenate, inplace=True)

In [18]:
fibers_concatenate = [
    "Wool yarns", "Cotton yarns", "Synthetic yarns (e.g., acrylic, nylon)",  
    "Luxury yarns (e.g., silk, cashmere)", "Blended yarns", "Other.11"
]

# Concatenate values into one column, separating with commas and ignoring null/blank values
kp_responses['Which of our yarn fibers do you frequently purchase?'] = kp_responses[fibers_concatenate].apply(
    lambda row: ', '.join(filter(None, row.dropna())), axis=1
)

# Find the index for 'Amano' to insert the new column at this position
fibers_index = kp_responses.columns.get_loc("Wool yarns")

# Insert 'Which brands have you heard of?' column back at the position where 'Amano' was
kp_responses.insert(fibers_index, 'Which of our yarn fibers do you frequently purchase?', kp_responses.pop('Which of our yarn fibers do you frequently purchase?'))


# Drop the original columns
kp_responses.drop(columns=fibers_concatenate, inplace=True)

In [19]:
projects_concatenate = [
    "Accessories (e.g., scarves, shawls)", "Clothing items (e.g., sweaters, hats, mittens)", "Baby (e.g., smaller sweaters, hats, mittens)",  
    "Home decor (e.g., blankets, cushions)", "Menswear (e.g., bigger sweaters, hats, mittens)", "Toys and amigurumi", "Other.12"
]

# Concatenate values into one column, separating with commas and ignoring null/blank values
kp_responses['Which types of projects do you typically buy our yarn for?'] = kp_responses[projects_concatenate].apply(
    lambda row: ', '.join(filter(None, row.dropna())), axis=1
)

# Find the index for 'Amano' to insert the new column at this position
projects_index = kp_responses.columns.get_loc("Accessories (e.g., scarves, shawls)")

# Insert 'Which brands have you heard of?' column back at the position where 'Amano' was
kp_responses.insert(projects_index, 'Which types of projects do you typically buy our yarn for?', kp_responses.pop('Which types of projects do you typically buy our yarn for?'))


# Drop the original columns
kp_responses.drop(columns=projects_concatenate, inplace=True)

In [20]:
project_ideas_concatenate = [
    "Facebook", "Instagram", "Twitter", "Pinterest", "Ravelry",
    "YouTube", "TikTok", "I don't use social media.", "Other.13"
]

# Concatenate values into one column, separating with commas and ignoring null/blank values
kp_responses['Which social media platforms do you normally get project ideas from?'] = kp_responses[project_ideas_concatenate].apply(
    lambda row: ', '.join(filter(None, row.dropna())), axis=1
)

# Find the index for 'Amano' to insert the new column at this position
project_ideas_index = kp_responses.columns.get_loc("Facebook")

# Insert 'Which brands have you heard of?' column back at the position where 'Amano' was
kp_responses.insert(project_ideas_index, 'Which social media platforms do you normally get project ideas from?', kp_responses.pop('Which social media platforms do you normally get project ideas from?'))


# Drop the original columns
kp_responses.drop(columns=project_ideas_concatenate, inplace=True)

In [21]:
kp_responses.to_csv('kp_processed.csv', encoding='utf-8-sig', index=False)

# KP Summarization

In [22]:
# Text wrapping is good actually.
pd.set_option('display.width', 100)

# Create list of all questions to summarize.
kp_responses.columns.tolist()

# Defintion to churn out totals/percentages to avoid copy pasta.
def summarize_column(df, column_name):
    non_null_data = df[column_name].dropna()
    summary = non_null_data.value_counts()
    percentages = (non_null_data.value_counts(normalize=True) * 100).round(1)
    summary = pd.DataFrame({'Total': summary, 'Percentage': percentages})
    sample_size = non_null_data.shape[0]
    return summary, sample_size

In [23]:
# Do you participate in any yarn crafts?
question = kp_responses.columns[3]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: Do you participate in any yarn crafts? (Sample Size: n = 2673)


Unnamed: 0_level_0,Total,Percentage
Do you participate in any yarn crafts?,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,2673,100.0


In [24]:
# Do you crochet?
question = kp_responses.columns[4]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: Do you crochet? (Sample Size: n = 2673)


Unnamed: 0_level_0,Total,Percentage
Do you crochet?,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,1736,64.9
No,937,35.1


In [25]:
# What is your current skill level in crocheting?
question = kp_responses.columns[5]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: What is your current skill level in crocheting? (Sample Size: n = 1670)


Unnamed: 0_level_0,Total,Percentage
What is your current skill level in crocheting?,Unnamed: 1_level_1,Unnamed: 2_level_1
Intermediate,704,42.2
Advanced,499,29.9
Beginner,329,19.7
Expert,121,7.2
Professional,17,1.0


In [26]:
# How often do you crochet?
question = kp_responses.columns[6]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: How often do you crochet? (Sample Size: n = 1670)


Unnamed: 0_level_0,Total,Percentage
How often do you crochet?,Unnamed: 1_level_1,Unnamed: 2_level_1
A few times a year,596,35.7
Daily,256,15.3
A few times a week,252,15.1
Once a year or less,226,13.5
Monthly,203,12.2
Weekly,137,8.2


In [27]:
# Do you knit?
question = kp_responses.columns[7]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: Do you knit? (Sample Size: n = 2673)


Unnamed: 0_level_0,Total,Percentage
Do you knit?,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,2550,95.4
No,123,4.6


In [28]:
# What is your current skill level in knitting?
question = kp_responses.columns[8]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: What is your current skill level in knitting? (Sample Size: n = 2438)


Unnamed: 0_level_0,Total,Percentage
What is your current skill level in knitting?,Unnamed: 1_level_1,Unnamed: 2_level_1
Advanced,1097,45.0
Intermediate,818,33.6
Expert,350,14.4
Beginner,125,5.1
Professional,48,2.0


In [29]:
# How often do you knit?
question = kp_responses.columns[9]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: How often do you knit? (Sample Size: n = 2437)


Unnamed: 0_level_0,Total,Percentage
How often do you knit?,Unnamed: 1_level_1,Unnamed: 2_level_1
Daily,1402,57.5
A few times a week,592,24.3
Weekly,188,7.7
Monthly,136,5.6
A few times a year,107,4.4
Once a year or less,12,0.5


In [30]:
# Do you participate in yarn crafts besides crochet or knitting?
question = kp_responses.columns[10]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: Do you participate in yarn crafts besides crochet or knitting? (Sample Size: n = 2673)


Unnamed: 0_level_0,Total,Percentage
Do you participate in yarn crafts besides crochet or knitting?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1780,66.6
Yes,893,33.4


In [31]:
# Which yarn craft do you participate in?

# Case sensitive, cast lower case. 
kp_responses.iloc[:, 11] = kp_responses.iloc[:, 11].str.lower()
# Reset null values.
kp_responses.iloc[:, 11] = kp_responses.iloc[:, 11].astype(str).str.lower().replace('nan', np.nan)

# Do you participate in yarn crafts besides crochet or knitting?
question = kp_responses.columns[11]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: Which yarn craft do you participate in? (Sample Size: n = 842)


Unnamed: 0_level_0,Total,Percentage
Which yarn craft do you participate in?,Unnamed: 1_level_1,Unnamed: 2_level_1
weaving,100,11.9
macrame,89,10.6
spinning,55,6.5
embroidery,31,3.7
needlepoint,26,3.1
cross stitch,23,2.7
tapestries,15,1.8
"spinning, weaving",12,1.4
needle felting,12,1.4
felting,10,1.2


In [32]:
# What is your current skill level in this yarn craft?	
question = kp_responses.columns[12]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: What is your current skill level in this yarn craft? (Sample Size: n = 840)


Unnamed: 0_level_0,Total,Percentage
What is your current skill level in this yarn craft?,Unnamed: 1_level_1,Unnamed: 2_level_1
Intermediate,360,42.9
Beginner,291,34.6
Advanced,160,19.0
Expert,21,2.5
Professional,8,1.0


In [33]:
# How often do you you participate in this yarn craft?
question = kp_responses.columns[13]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: How often do you you participate in this yarn craft? (Sample Size: n = 831)


Unnamed: 0_level_0,Total,Percentage
How often do you you participate in this yarn craft?,Unnamed: 1_level_1,Unnamed: 2_level_1
A few times a year,317,38.1
Monthly,166,20.0
Weekly,106,12.8
A few times a week,101,12.2
Once a year or less,92,11.1
Daily,49,5.9


In [34]:
# How do you most often purchase *yarn*?	
question = kp_responses.columns[14]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(2))

Question: How do you most often purchase *yarn*? (Sample Size: n = 2546)


Unnamed: 0_level_0,Total,Percentage
How do you most often purchase *yarn*?,Unnamed: 1_level_1,Unnamed: 2_level_1
Online,1814,71.2
In Person,688,27.0


In [35]:
# Where do you most often purchase* yarn*?	
question = kp_responses.columns[15]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(7))

Question: Where do you most often purchase* yarn*? (Sample Size: n = 2550)


Unnamed: 0_level_0,Total,Percentage
Where do you most often purchase* yarn*?,Unnamed: 1_level_1,Unnamed: 2_level_1
"Online Store (Knitpicks, Crochet.com, etc...)",1441,56.5
Local Yarn Store,501,19.6
"Chain Store (Joann's, Michaels, Hobby Lobby, etc...)",325,12.7
No preference,119,4.7
"Online Marketplace (Facebook, Etsy, Ravelry, etc...)",49,1.9
"Big Box Store (Walmart, Target, etc...)",27,1.1
Amazon,26,1.0


In [36]:
# How do you most often purchase yarn *tools & accessories*?
question = kp_responses.columns[16]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(2))

Question: How do you most often purchase yarn *tools & accessories*? (Sample Size: n = 2552)


Unnamed: 0_level_0,Total,Percentage
How do you most often purchase yarn *tools & accessories*?,Unnamed: 1_level_1,Unnamed: 2_level_1
Online,1916,75.1
In Person,609,23.9


In [37]:
# Where do you most often purchase *tools & accessories*?
question = kp_responses.columns[17]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(7))

Question: Where do you most often purchase *tools & accessories*? (Sample Size: n = 2547)


Unnamed: 0_level_0,Total,Percentage
Where do you most often purchase *tools & accessories*?,Unnamed: 1_level_1,Unnamed: 2_level_1
"Online Store (Knitpicks, Crochet.com, etc...)",1251,49.1
Local Yarn Store,371,14.6
Amazon,363,14.3
"Chain Store (Joann's, Michaels, Hobby Lobby, etc...)",265,10.4
No preference,197,7.7
"Online Marketplace (Facebook, Etsy, Ravelry, etc...)",53,2.1
"Big Box Store (Walmart, Target, etc...)",17,0.7


In [38]:
# How do you most often purchase *patterns*?
question = kp_responses.columns[18]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(2))

Question: How do you most often purchase *patterns*? (Sample Size: n = 2538)


Unnamed: 0_level_0,Total,Percentage
How do you most often purchase *patterns*?,Unnamed: 1_level_1,Unnamed: 2_level_1
Online,2351,92.6
In Person,91,3.6


In [39]:
# Where do you most often purchase *patterns*?
question = kp_responses.columns[19]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(6))

Question: Where do you most often purchase *patterns*? (Sample Size: n = 2542)


Unnamed: 0_level_0,Total,Percentage
Where do you most often purchase *patterns*?,Unnamed: 1_level_1,Unnamed: 2_level_1
"Online Marketplace (Facebook, Etsy, Ravelry, etc...)",1474,58.0
"Online Store (Knitpicks, Crochet.com, etc...)",565,22.2
No preference,232,9.1
Ravelry,69,2.7
Local Yarn Store,53,2.1
"Chain Store (Joann's, Michaels, Hobby Lobby, etc...)",24,0.9


In [40]:
# How have your yarn shopping habits changed over time?
question = kp_responses.columns[20]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: How have your yarn shopping habits changed over time? (Sample Size: n = 2551)


Unnamed: 0_level_0,Total,Percentage
How have your yarn shopping habits changed over time?,Unnamed: 1_level_1,Unnamed: 2_level_1
I buy more yarn online now.,1539,60.3
No change.,816,32.0
I buy more yarn in person now.,196,7.7


In [41]:
# How many times do you make an *in-store* yarn related purchase each year?
question = kp_responses.columns[21]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: How many times do you make an *in-store* yarn related purchase each year? (Sample Size: n = 2381)


Unnamed: 0_level_0,Total,Percentage
How many times do you make an *in-store* yarn related purchase each year?,Unnamed: 1_level_1,Unnamed: 2_level_1
2,382,16.0
3,365,15.3
4,322,13.5
10+,273,11.5
1,273,11.5
6,258,10.8
5,221,9.3
0,157,6.6
8,83,3.5
7,36,1.5


In [42]:
# How much do you typically spend per *in-store* purchase?
question = kp_responses.columns[22]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: How much do you typically spend per *in-store* purchase? (Sample Size: n = 2489)


Unnamed: 0_level_0,Total,Percentage
How much do you typically spend per *in-store* purchase?,Unnamed: 1_level_1,Unnamed: 2_level_1
$41 - $50,382,15.3
$21 - $30,359,14.4
$11 - $20,296,11.9
$51 - $60,277,11.1
$31 - $40,272,10.9
$91 - $100,199,8.0
$71 - $80,130,5.2
$61 - $70,126,5.1
$0,119,4.8
$101 - $125,98,3.9


In [43]:
# How many times do you make an *online* yarn related purchase each year?
question = kp_responses.columns[23]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: How many times do you make an *online* yarn related purchase each year? (Sample Size: n = 2430)


Unnamed: 0_level_0,Total,Percentage
How many times do you make an *online* yarn related purchase each year?,Unnamed: 1_level_1,Unnamed: 2_level_1
10+,452,18.6
4,362,14.9
3,350,14.4
2,296,12.2
6,279,11.5
5,269,11.1
1,147,6.0
8,126,5.2
7,68,2.8
0,55,2.3


In [44]:
# How much do you typically spend per *online* purchase?
question = kp_responses.columns[24]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: How much do you typically spend per *online* purchase? (Sample Size: n = 2485)


Unnamed: 0_level_0,Total,Percentage
How much do you typically spend per *online* purchase?,Unnamed: 1_level_1,Unnamed: 2_level_1
$41 - $50,356,14.3
$51 - $60,344,13.8
$91 - $100,334,13.4
$71 - $80,245,9.9
$31 - $40,205,8.2
$61 - $70,199,8.0
$101 - $125,185,7.4
$21 - $30,150,6.0
$126 - $150,120,4.8
$81 - $90,96,3.9


In [45]:
# Where did you receive this survey from?
question = kp_responses.columns[25]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: Where did you receive this survey from? (Sample Size: n = 2673)


Unnamed: 0_level_0,Total,Percentage
Where did you receive this survey from?,Unnamed: 1_level_1,Unnamed: 2_level_1
Knit Picks,2673,100.0


In [46]:
# How did you first hear about our brand?
question = kp_responses.columns[26]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(6))

Question: How did you first hear about our brand? (Sample Size: n = 2538)


Unnamed: 0_level_0,Total,Percentage
How did you first hear about our brand?,Unnamed: 1_level_1,Unnamed: 2_level_1
"Word of Mouth (Friends/Family, Colleagues)",786,31.0
"Online Ads (Search engines, Sponsored content)",524,20.6
"Social Media (Instagram, Facebook, Pinterest, etc.)",512,20.2
"Email (Promotional, Newsletter)",332,13.1
"Events (In-store, Trade shows, Workshops)",45,1.8
Ravelry,20,0.8


In [47]:
# How did you first hear about our brand?
question = kp_responses.columns[27]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(8))

Question: Which social media platform? (Sample Size: n = 508)


Unnamed: 0_level_0,Total,Percentage
Which social media platform?,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook,169,33.3
Instagram,103,20.3
Ravelry,93,18.3
YouTube,78,15.4
Pinterest,32,6.3
Reddit,12,2.4
Blogs,3,0.6
TikTok,2,0.4


In [48]:
# Which brands have you heard of?

# Print statement question.
question = kp_responses.columns[28]

# Split previous concatenation.
split_brands = kp_responses.iloc[:, 28].str.split(', ')

# Calculate the total number of rows before exploding.
sample_size = kp_responses.iloc[:, 28].notnull().sum()

# Create a temporary DataFrame for exploding, to avoid modifying the original
temp_df = pd.DataFrame({'SplitBrands': split_brands})
temp_df = temp_df.explode('SplitBrands')

# Filter out rows where 'SplitBrands' is empty or contains only whitespace
temp_df = temp_df[temp_df['SplitBrands'].str.strip().astype(bool)]

# Step 3: Count occurrences of each brand
brand_counts = temp_df['SplitBrands'].value_counts().reset_index()
brand_counts.columns = ['Brand', 'Count']

# Step 4: Calculate the percentage of each brand's occurrence based on the original number of responses
brand_counts['Percentage'] = ((brand_counts['Count'] / sample_size) * 100).round(1)

print(f'Question: {question} (Sample Size: n = {sample_size})')

display(brand_counts)  

Question: Which brands have you heard of? (Sample Size: n = 2673)


Unnamed: 0,Brand,Count,Percentage
0,Knit Picks,2472,92.5
1,Lion Brand Yarn,2392,89.5
2,Red Heart,2250,84.2
3,Cascade Yarns,2227,83.3
4,Berroco,2137,79.9
5,Caron,2065,77.3
6,Malabrigo Yarn,1940,72.6
7,Rowan,1823,68.2
8,Webs (Yarn.com),1690,63.2
9,Noro,1675,62.7


In [49]:
# Which brands have you purchase from?

# Print statement question.
question = kp_responses.columns[29]

# Split previous concatenation
split_brands = kp_responses.iloc[:, 29].str.split(', ')

# Calculate the total number of rows before exploding.
sample_size = kp_responses.iloc[:, 29].notnull().sum()

# Create a temporary DataFrame for exploding, to avoid modifying the original
temp_df = pd.DataFrame({'SplitBrands': split_brands})
temp_df = temp_df.explode('SplitBrands')

# Filter out rows where 'SplitBrands' is empty or contains only whitespace
temp_df = temp_df[temp_df['SplitBrands'].str.strip().astype(bool)]

# Count occurrences of each brand
brand_counts = temp_df['SplitBrands'].value_counts().reset_index()
brand_counts.columns = ['Brand', 'Count']

# Calculate the percentage of each brand's occurrence based on the original number of responses
brand_counts['Percentage'] = ((brand_counts['Count'] / sample_size) * 100).round(1)

print(f'Question: {question} (Sample Size: n = {sample_size})')

display(brand_counts)  

Question: Which brands have you purchased from? (Sample Size: n = 2673)


Unnamed: 0,Brand,Count,Percentage
0,Knit Picks,2253,84.3
1,Lion Brand Yarn,1765,66.0
2,Cascade Yarns,1501,56.2
3,Berroco,1251,46.8
4,Malabrigo Yarn,1199,44.9
5,Caron,1162,43.5
6,Red Heart,1161,43.4
7,Webs (Yarn.com),1034,38.7
8,Rowan,822,30.8
9,MadelineTosh,806,30.2


In [50]:
# What makes our yarn stand out from competitors to you?

# Print statement question.
question = kp_responses.columns[30]
#Same size.
sample_size = kp_responses.iloc[:, 30].notnull().sum()

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

#Case sensitive, cast lower case. 
process_data = kp_responses.iloc[:, 30].dropna().astype(str).str.lower()  # Lowercase conversion

def clean_text(text):
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Apply cleaning function
tokenized_data = process_data.apply(clean_text)

# Apply stemming
stemmed_data = tokenized_data.apply(lambda x: [stemmer.stem(word) for word in x])

# Flatten the list of lists for n-grams analysis
all_words = [word for text in stemmed_data for word in text]

# Generate bi-grams and tri-grams
bi_grams = ngrams(all_words, 2)
tri_grams = ngrams(all_words, 3)

# Count and find the top 10 uni, bi, and tri-grams,
top_unigrams = Counter(all_words).most_common(10)
top_bi_grams = Counter(bi_grams).most_common(10)
top_tri_grams = Counter(tri_grams).most_common(10)

#Question statement
print(f'Question: {question} (Sample Size: n = {sample_size})')

# Line break
print("\n")

# Print the top 10 single word occurrences
print("Top 10 Single Word Occurrences:")
for word, count in top_unigrams:
    print(f"{word}: {count}")

# Add a line break
print("\n")

print("Top 10 Bi-grams:")
for gram, count in top_bi_grams:
    print(f"{gram}: {count}")

# Add a line break
print("\n")

print("\nTop 10 Tri-grams:")
for gram, count in top_tri_grams:
    print(f"{gram}: {count}")

Question: What makes our yarn stand out from competitors to you? (Sample Size: n = 2418)


Top 10 Single Word Occurrences:
qualiti: 1045
price: 1021
color: 497
yarn: 462
good: 389
varieti: 296
select: 257
great: 144
valu: 142
fiber: 139


Top 10 Bi-grams:
('qualiti', 'price'): 227
('price', 'qualiti'): 212
('good', 'qualiti'): 153
('reason', 'price'): 86
('good', 'price'): 80
('price', 'price'): 74
('qualiti', 'yarn'): 69
('qualiti', 'good'): 69
('price', 'good'): 66
('qualiti', 'color'): 63



Top 10 Tri-grams:
('price', 'qualiti', 'price'): 38
('price', 'good', 'qualiti'): 33
('qualiti', 'price', 'qualiti'): 31
('qualiti', 'reason', 'price'): 30
('good', 'qualiti', 'price'): 30
('qualiti', 'good', 'price'): 28
('price', 'price', 'qualiti'): 28
('qualiti', 'price', 'price'): 24
('price', 'qualiti', 'qualiti'): 22
('price', 'qualiti', 'varieti'): 18


In [51]:
# Compared to other brands, how do you rate the price of our products?
question = kp_responses.columns[31]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: Compared to other brands, how do you rate the price of our products? (Sample Size: n = 2521)


Unnamed: 0_level_0,Total,Percentage
"Compared to other brands, how do you rate the price of our products?",Unnamed: 1_level_1,Unnamed: 2_level_1
Somewhat Lower,1147,45.5
About the Same,849,33.7
Somewhat Higher,277,11.0
Significantly Lower,228,9.0
Significantly Higher,20,0.8


In [52]:
# Where do you prefer to explore or learn about new yarn products?
question = kp_responses.columns[32]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: Where do you prefer to explore or learn about new yarn products? (Sample Size: n = 2673)


Unnamed: 0_level_0,Total,Percentage
Where do you prefer to explore or learn about new yarn products?,Unnamed: 1_level_1,Unnamed: 2_level_1
Online stores,703,26.3
No Preference,663,24.8
"Fiber festivals, craft fairs, and events",438,16.4
Social media,425,15.9
Local yarn shop,389,14.6
Knitting/crochet workshops,55,2.1


In [53]:
# Which social media platform do you prefer to learn about new yarn products?
question = kp_responses.columns[33]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(8))

Question: Which social media platform?.1 (Sample Size: n = 405)


Unnamed: 0_level_0,Total,Percentage
Which social media platform?.1,Unnamed: 1_level_1,Unnamed: 2_level_1
Instagram,134,33.1
YouTube,94,23.2
Ravelry,80,19.8
Facebook,70,17.3
Pinterest,14,3.5
TikTok,3,0.7
Twitter,2,0.5
Reddit,2,0.5


In [54]:
# When shopping online, what encourages you to try *new* yarn products?
question = kp_responses.columns[34]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(6))

Question: When shopping online, what encourages you to try *new* yarn products? (Sample Size: n = 2509)


Unnamed: 0_level_0,Total,Percentage
"When shopping online, what encourages you to try *new* yarn products?",Unnamed: 1_level_1,Unnamed: 2_level_1
Detailed product descriptions,930,37.1
Customer reviews,612,24.4
High-quality images,366,14.6
Free samples or discounts,348,13.9
Recommendations from influencers,139,5.5
All of the above,3,0.1


In [55]:
# What products do you primarily associate with our brand?

# Print statement question.
question = kp_responses.columns[35]
#Same size.
sample_size = kp_responses.iloc[:, 35].notnull().sum()

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

#Case sensitive, cast lower case. 
process_data = kp_responses.iloc[:, 35].dropna().astype(str).str.lower()  # Lowercase conversion

def clean_text(text):
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Apply cleaning function
tokenized_data = process_data.apply(clean_text)

# Apply stemming
stemmed_data = tokenized_data.apply(lambda x: [stemmer.stem(word) for word in x])

# Flatten the list of lists for n-grams analysis
all_words = [word for text in stemmed_data for word in text]

# Generate bi-grams and tri-grams
bi_grams = ngrams(all_words, 2)
tri_grams = ngrams(all_words, 3)

# Count and find the top 10 uni, bi, and tri-grams,
top_unigrams = Counter(all_words).most_common(10)
top_bi_grams = Counter(bi_grams).most_common(10)
top_tri_grams = Counter(tri_grams).most_common(10)

#Question statement
print(f'Question: {question} (Sample Size: n = {sample_size})')

# Line break
print("\n")

# Print the top 10 single word occurrences
print("Top 10 Single Word Occurrences:")
for word, count in top_unigrams:
    print(f"{word}: {count}")

# Add a line break
print("\n")

print("Top 10 Two Word Occurrences:")
for gram, count in top_bi_grams:
    print(f"{gram}: {count}")

# Add a line break
print("\n")

print("\nTop 10 Three Word Occurrences:")
for gram, count in top_tri_grams:
    print(f"{gram}: {count}")

Question: What products do you primarily associate with our brand? (Sample Size: n = 2451)


Top 10 Single Word Occurrences:
yarn: 1927
everyday: 1010
luxuri: 386
qualiti: 288
wool: 187
good: 165
needl: 144
sock: 115
everi: 110
day: 103


Top 10 Two Word Occurrences:
('everyday', 'yarn'): 826
('yarn', 'everyday'): 506
('luxuri', 'yarn'): 254
('yarn', 'luxuri'): 178
('qualiti', 'yarn'): 128
('everi', 'day'): 103
('sock', 'yarn'): 90
('yarn', 'good'): 80
('day', 'yarn'): 76
('good', 'qualiti'): 73



Top 10 Three Word Occurrences:
('yarn', 'everyday', 'yarn'): 415
('everyday', 'yarn', 'everyday'): 244
('yarn', 'luxuri', 'yarn'): 119
('everyday', 'yarn', 'luxuri'): 97
('everi', 'day', 'yarn'): 76
('luxuri', 'yarn', 'everyday'): 75
('yarn', 'everi', 'day'): 49
('luxuri', 'everyday', 'yarn'): 42
('everyday', 'yarn', 'good'): 38
('yarn', 'sock', 'yarn'): 37


In [56]:
#  Which of our yarn fibers do you frequently purchase?

# Print statement question.
question = kp_responses.columns[36]

# Remove parenthesis, they have commas.
preprocessed = kp_responses.iloc[:, 36].str.replace(r"\s*\([^)]*\)", "", regex=True)

# Split previous concatenation.
split_options = preprocessed.str.split(', ')

# Calculate the total number of rows before exploding.
sample_size = kp_responses.iloc[:, 36].notnull().sum()

# Create a temporary DataFrame for exploding, to avoid modifying the original
temp_df = pd.DataFrame({'Split': split_options})
temp_df = temp_df.explode('Split')

# Filter out rows where 'SplitBrands' is empty or contains only whitespace
temp_df = temp_df[temp_df['Split'].str.strip().astype(bool)]

# Step 3: Count occurrences of each brand
option_counts = temp_df['Split'].value_counts().reset_index()
option_counts.columns = ['Options', 'Count']

# Step 4: Calculate the percentage of each brand's occurrence based on the original number of responses
option_counts['Percentage'] = ((option_counts['Count'] / sample_size) * 100).round(1)

print(f'Question: {question} (Sample Size: n = {sample_size})')

display(option_counts.head(5))  

Question: Which of our yarn fibers do you frequently purchase? (Sample Size: n = 2673)


Unnamed: 0,Options,Count,Percentage
0,Wool yarns,1847,69.1
1,Blended yarns,1426,53.3
2,Cotton yarns,864,32.3
3,Synthetic yarns,622,23.3
4,Luxury yarns,603,22.6


In [57]:
# Which types of projects do you typically buy our yarn for?

# Print statement question.
question = kp_responses.columns[37]

# Remove parenthesis, they have commas.
preprocessed = kp_responses.iloc[:, 37].str.replace(r"\s*\([^)]*\)", "", regex=True)

# Split previous concatenation.
split_options = preprocessed.str.split(', ')

# Calculate the total number of rows before exploding.
sample_size = kp_responses.iloc[:, 37].notnull().sum()

# Create a temporary DataFrame for exploding, to avoid modifying the original
temp_df = pd.DataFrame({'Split': split_options})
temp_df = temp_df.explode('Split')

# Filter out rows where 'SplitBrands' is empty or contains only whitespace
temp_df = temp_df[temp_df['Split'].str.strip().astype(bool)]

# Step 3: Count occurrences of each brand
option_counts = temp_df['Split'].value_counts().reset_index()
option_counts.columns = ['Options', 'Count']

# Step 4: Calculate the percentage of each brand's occurrence based on the original number of responses
option_counts['Percentage'] = ((option_counts['Count'] / sample_size) * 100).round(1)

print(f'Question: {question} (Sample Size: n = {sample_size})')

display(option_counts.head(5))  

Question: Which types of projects do you typically buy our yarn for? (Sample Size: n = 2673)


Unnamed: 0,Options,Count,Percentage
0,Clothing items,2008,75.1
1,Accessories,1798,67.3
2,Baby,808,30.2
3,Home decor,793,29.7
4,Toys and amigurumi,340,12.7


In [58]:
# Can you briefly explain why you buy these yarn fibers for the selected project types?

# Print statement question.
question = kp_responses.columns[38]
#Same size.
sample_size = kp_responses.iloc[:, 38].notnull().sum()

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

#Case sensitive, cast lower case. 
process_data = kp_responses.iloc[:, 38].dropna().astype(str).str.lower()  # Lowercase conversion

def clean_text(text):
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Apply cleaning function
tokenized_data = process_data.apply(clean_text)

# Apply stemming
stemmed_data = tokenized_data.apply(lambda x: [stemmer.stem(word) for word in x])

# Flatten the list of lists for n-grams analysis
all_words = [word for text in stemmed_data for word in text]

# Generate bi-grams and tri-grams
bi_grams = ngrams(all_words, 2)
tri_grams = ngrams(all_words, 3)

# Count and find the top 10 uni, bi, and tri-grams,
top_unigrams = Counter(all_words).most_common(10)
top_bi_grams = Counter(bi_grams).most_common(10)
top_tri_grams = Counter(tri_grams).most_common(10)

#Question statement
print(f'Question: {question} (Sample Size: n = {sample_size})')

# Line break
print("\n")

# Print the top 10 single word occurrences
print("Top 10 Single Word Occurrences:")
for word, count in top_unigrams:
    print(f"{word}: {count}")

# Line break
print("\n")

print("Top 10 Two Word Occurrences:")
for gram, count in top_bi_grams:
    print(f"{gram}: {count}")

# Add a line break
print("\n")

print("\nTop 10 Three Word Occurrences:")
for gram, count in top_tri_grams:
    print(f"{gram}: {count}")

Question: Can you briefly explain why you buy these yarn fibers for the selected project types? (Sample Size: n = 1614)


Top 10 Single Word Occurrences:
yarn: 454
wool: 357
like: 290
fiber: 216
qualiti: 214
knit: 196
project: 173
color: 169
natur: 158
use: 154


Top 10 Two Word Occurrences:
('natur', 'fiber'): 111
('prefer', 'natur'): 45
('good', 'qualiti'): 39
('wool', 'blend'): 35
('qualiti', 'yarn'): 33
('easi', 'care'): 31
('natur', 'fibr'): 25
('like', 'wool'): 25
('sock', 'yarn'): 21
('like', 'natur'): 21



Top 10 Three Word Occurrences:
('prefer', 'natur', 'fiber'): 35
('like', 'natur', 'fiber'): 16
('wool', 'wool', 'blend'): 13
('use', 'natur', 'fiber'): 12
('natur', 'fiber', 'like'): 9
('like', 'work', 'wool'): 7
('wool', 'natur', 'fiber'): 6
('prefer', 'natur', 'fibr'): 6
('work', 'natur', 'fiber'): 6
('high', 'qualiti', 'yarn'): 5


In [59]:
# How important is sustainability in the yarns you choose?
question = kp_responses.columns[39]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: How important is sustainability in the yarns you choose? (Sample Size: n = 2551)


Unnamed: 0_level_0,Total,Percentage
How important is sustainability in the yarns you choose?,Unnamed: 1_level_1,Unnamed: 2_level_1
Somewhat important,1086,42.6
Very important,755,29.6
Extremely important,368,14.4
Not very important,260,10.2
Not important at all,82,3.2


In [60]:
# Which social media platforms do you normally get project ideas from?

# Print statement question.
question = kp_responses.columns[40]

# Remove parenthesis, they have commas.
preprocessed = kp_responses.iloc[:, 40].str.replace(r"\s*\([^)]*\)", "", regex=True)

# Split previous concatenation.
split_options = preprocessed.str.split(', ')

# Calculate the total number of rows before exploding.
sample_size = kp_responses.iloc[:, 40].notnull().sum()

# Create a temporary DataFrame for exploding, to avoid modifying the original
temp_df = pd.DataFrame({'Split': split_options})
temp_df = temp_df.explode('Split')

# Filter out rows where 'SplitBrands' is empty or contains only whitespace
temp_df = temp_df[temp_df['Split'].str.strip().astype(bool)]

# Step 3: Count occurrences of each brand
option_counts = temp_df['Split'].value_counts().reset_index()
option_counts.columns = ['Options', 'Count']

# Step 4: Calculate the percentage of each brand's occurrence based on the original number of responses
option_counts['Percentage'] = ((option_counts['Count'] / sample_size) * 100).round(1)

print(f'Question: {question} (Sample Size: n = {sample_size})')

display(option_counts.head(10)) 

Question: Which social media platforms do you normally get project ideas from? (Sample Size: n = 2673)


Unnamed: 0,Options,Count,Percentage
0,Ravelry,2017,75.5
1,Pinterest,827,30.9
2,Instagram,824,30.8
3,YouTube,732,27.4
4,Facebook,678,25.4
5,I don't use social media.,190,7.1
6,TikTok,69,2.6
7,Reddit,18,0.7
8,Twitter,17,0.6
9,Etsy,11,0.4


In [61]:
# Who is your favorite content creator or designer you follow?

# Print statement question.
question = kp_responses.columns[41]
#Same size.
sample_size = kp_responses.iloc[:, 41].notnull().sum()

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

#Case sensitive, cast lower case. 
process_data = kp_responses.iloc[:, 41].dropna().astype(str).str.lower()  # Lowercase conversion

def clean_text(text):
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Apply cleaning function
tokenized_data = process_data.apply(clean_text)

# Apply stemming
stemmed_data = tokenized_data.apply(lambda x: [stemmer.stem(word) for word in x])

# Flatten the list of lists for n-grams analysis
all_words = [word for text in stemmed_data for word in text]

# Generate bi-grams and tri-grams
bi_grams = ngrams(all_words, 2)

# Count and find the top 10 uni, bi, and tri-grams,
top_bi_grams = Counter(bi_grams).most_common(20)

#Question statement
print(f'Question: {question} (Sample Size: n = {sample_size})')

# Line break
print("\n")

print("Top 20 Two Word Occurrences:")
for gram, count in top_bi_grams:
    print(f"{gram}: {count}")


Question: Who is your favorite content creator or designer you follow? (Sample Size: n = 1791)


Top 20 Two Word Occurrences:
('andrea', 'mowri'): 81
('stephen', 'west'): 40
('crazi', 'sock'): 32
('sock', 'ladi'): 32
('marli', 'bird'): 32
('tin', 'knit'): 30
('follow', 'anyon'): 29
('bag', 'day'): 26
('one', 'particular'): 21
('isabel', 'kraemer'): 19
('pink', 'knit'): 19
('joji', 'locatelli'): 18
('fiber', 'art'): 17
('petit', 'knit'): 16
('patti', 'lyon'): 16
('summer', 'lee'): 16
('tl', 'yarn'): 16
('yarn', 'craft'): 16
('arn', 'carlo'): 16
('nimbl', 'needl'): 15


In [62]:
# How many projects do you start in a year?
question = kp_responses.columns[42]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: How many projects do you start in a year? (Sample Size: n = 2530)


Unnamed: 0_level_0,Total,Percentage
How many projects do you start in a year?,Unnamed: 1_level_1,Unnamed: 2_level_1
10.0,343,13.6
20.0,283,11.2
6.0,230,9.1
5.0,223,8.8
12.0,208,8.2
15.0,181,7.2
4.0,155,6.1
8.0,146,5.8
3.0,146,5.8
25.0,90,3.6


In [63]:
# How many projects do you complete in a year?
question = kp_responses.columns[43]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: How many projects do you complete in a year?  (Sample Size: n = 2673)


Unnamed: 0_level_0,Total,Percentage
How many projects do you complete in a year?,Unnamed: 1_level_1,Unnamed: 2_level_1
10.0,291,10.9
6.0,213,8.0
4.0,210,7.9
20.0,208,7.8
5.0,205,7.7
3.0,190,7.1
2.0,184,6.9
8.0,177,6.6
15.0,161,6.0
12.0,160,6.0


In [64]:
# How do you feel about the balance between classic and trendy yarns in our selection? 1 Classic , 5 Trendy.
question = kp_responses.columns[44]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: How do you feel about the balance between classic and trendy yarns in our selection? (Sample Size: n = 2484)


Unnamed: 0_level_0,Total,Percentage
How do you feel about the balance between classic and trendy yarns in our selection?,Unnamed: 1_level_1,Unnamed: 2_level_1
3.0,1971,79.3
2.0,237,9.5
4.0,149,6.0
1.0,87,3.5
5.0,40,1.6


In [65]:
# How often is something you want from our website out-of-stock?
question = kp_responses.columns[45]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: How often is something you want from our website out-of-stock? (Sample Size: n = 2455)


Unnamed: 0_level_0,Total,Percentage
How often is something you want from our website out-of-stock?,Unnamed: 1_level_1,Unnamed: 2_level_1
Occasionally,1160,47.3
Rarely,969,39.5
Never,193,7.9
Frequently,116,4.7
Very frequently,17,0.7


In [66]:
# How effective is our communication when items are restocked?
question = kp_responses.columns[46]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: How effective is our communication when items are restocked? (Sample Size: n = 2239)


Unnamed: 0_level_0,Total,Percentage
How effective is our communication when items are restocked?,Unnamed: 1_level_1,Unnamed: 2_level_1
Neutral,1046,46.7
Effective,828,37.0
Very effective,278,12.4
Ineffective,66,2.9
Very ineffective,21,0.9


In [67]:
# Have you been able to find everything you need for a project on our website?
question = kp_responses.columns[47]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: Have you been able to find everything you need for a project on our website? (Sample Size: n = 2463)


Unnamed: 0_level_0,Total,Percentage
Have you been able to find everything you need for a project on our website?,Unnamed: 1_level_1,Unnamed: 2_level_1
Most Of The Time,1723,70.0
Always,387,15.7
Sometimes,333,13.5
Rarely,11,0.4
Never,9,0.4


In [68]:
# Are there any yarn related products you think we should start carrying?

# Print statement question.
question = kp_responses.columns[48]
#Same size.
sample_size = kp_responses.iloc[:, 48].notnull().sum()

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

#Case sensitive, cast lower case. 
process_data = kp_responses.iloc[:, 48].dropna().astype(str).str.lower()  # Lowercase conversion

def clean_text(text):
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Apply cleaning function
tokenized_data = process_data.apply(clean_text)

# Apply stemming
stemmed_data = tokenized_data.apply(lambda x: [stemmer.stem(word) for word in x])

# Flatten the list of lists for n-grams analysis
all_words = [word for text in stemmed_data for word in text]

# Generate bi-grams and tri-grams
bi_grams = ngrams(all_words, 2)
tri_grams = ngrams(all_words, 3)

# Count and find the top 10 uni, bi, and tri-grams,
top_unigrams = Counter(all_words).most_common(10)
top_bi_grams = Counter(bi_grams).most_common(10)
top_tri_grams = Counter(tri_grams).most_common(10)

#Question statement
print(f'Question: {question} (Sample Size: n = {sample_size})')

# Line break
print("\n")

# Print the top 10 single word occurrences
print("Top 10 Single Word Occurrences:")
for word, count in top_unigrams:
    print(f"{word}: {count}")

# Line break
print("\n")

print("Top 10 Two Word Occurrences:")
for gram, count in top_bi_grams:
    print(f"{gram}: {count}")

# Add a line break
print("\n")

print("\nTop 10 Three Word Occurrences:")
for gram, count in top_tri_grams:
    print(f"{gram}: {count}")

Question: Are there any yarn related products you think we should start carrying? (Sample Size: n = 569)


Top 10 Single Word Occurrences:
yarn: 148
needl: 64
color: 53
like: 42
wool: 40
weight: 34
would: 33
bag: 30
think: 28
project: 24


Top 10 Two Word Occurrences:
('project', 'bag'): 17
('sock', 'yarn'): 15
('stitch', 'marker'): 13
('would', 'love'): 11
('dk', 'weight'): 10
('circular', 'needl'): 9
('like', 'see'): 8
('love', 'see'): 8
('crochet', 'hook'): 8
('ca', 'think'): 8



Top 10 Three Word Occurrences:
('would', 'like', 'see'): 5
('would', 'love', 'see'): 5
('super', 'bulki', 'yarn'): 4
('doubl', 'point', 'needl'): 4
('dk', 'weight', 'yarn'): 3
('hand', 'dy', 'yarn'): 3
('bulki', 'super', 'bulki'): 2
('color', 'yarn', 'like'): 2
('multi', 'color', 'yarn'): 2
('yarn', 'like', 'express'): 2


In [69]:
# How easy do you find navigating our website to browse products?
question = kp_responses.columns[49]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: How easy do you find navigating our website to browse products? (Sample Size: n = 2514)


Unnamed: 0_level_0,Total,Percentage
How easy do you find navigating our website to browse products?,Unnamed: 1_level_1,Unnamed: 2_level_1
Very Easy,1399,55.6
Somewhat Easy,760,30.2
Neutral,311,12.4
Somewhat Difficult,43,1.7
Very Difficult,1,0.0


In [70]:
# Would you recommend any changes to our website to make browsing easier?

# Print statement question.
question = kp_responses.columns[50]
#Same size.
sample_size = kp_responses.iloc[:, 50].notnull().sum()

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

#Case sensitive, cast lower case. 
process_data = kp_responses.iloc[:, 50].dropna().astype(str).str.lower()  # Lowercase conversion

def clean_text(text):
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Apply cleaning function
tokenized_data = process_data.apply(clean_text)

# Apply stemming
stemmed_data = tokenized_data.apply(lambda x: [stemmer.stem(word) for word in x])

# Flatten the list of lists for n-grams analysis
all_words = [word for text in stemmed_data for word in text]

# Generate bi-grams and tri-grams
bi_grams = ngrams(all_words, 2)
tri_grams = ngrams(all_words, 3)

# Count and find the top 10 uni, bi, and tri-grams,
top_unigrams = Counter(all_words).most_common(10)
top_bi_grams = Counter(bi_grams).most_common(10)
top_tri_grams = Counter(tri_grams).most_common(10)

#Question statement
print(f'Question: {question} (Sample Size: n = {sample_size})')

# Line break
print("\n")

# Print the top 10 single word occurrences
print("Top 10 Single Word Occurrences:")
for word, count in top_unigrams:
    print(f"{word}: {count}")

# Line break
print("\n")

print("Top 10 Two Word Occurrences:")
for gram, count in top_bi_grams:
    print(f"{gram}: {count}")

# Add a line break
print("\n")

print("\nTop 10 Three Word Occurrences:")
for gram, count in top_tri_grams:
    print(f"{gram}: {count}")

Question: Would you recommend any changes to our website to make browsing easier? (Sample Size: n = 556)


Top 10 Single Word Occurrences:
yarn: 201
filter: 86
color: 82
weight: 69
search: 57
like: 50
would: 39
look: 37
see: 36
type: 35


Top 10 Two Word Occurrences:
('yarn', 'weight'): 36
('weight', 'fiber'): 13
('filter', 'yarn'): 11
('yarn', 'color'): 11
('yarn', 'line'): 11
('would', 'nice'): 10
('yarn', 'type'): 9
('select', 'yarn'): 9
('would', 'like'): 9
('better', 'filter'): 9



Top 10 Three Word Occurrences:
('find', 'yarn', 'weight'): 4
('filter', 'yarn', 'weight'): 4
('yarn', 'weight', 'fiber'): 4
('weight', 'fiber', 'content'): 4
('base', 'yarn', 'weight'): 3
('sort', 'sale', 'item'): 3
('sale', 'item', 'yarn'): 3
('would', 'like', 'abl'): 3
('type', 'yarn', 'weight'): 3
('abl', 'search', 'weight'): 3


In [71]:
# How would you rate the organization and categorization of products on our site?
question = kp_responses.columns[51]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: How would you rate the organization and categorization of products on our site? (Sample Size: n = 2505)


Unnamed: 0_level_0,Total,Percentage
How would you rate the organization and categorization of products on our site?,Unnamed: 1_level_1,Unnamed: 2_level_1
Good,1239,49.5
Excellent,952,38.0
Average,299,11.9
Below Average,15,0.6


In [72]:
# Do you prefer broader categories with more products or more specific categories with fewer products?
question = kp_responses.columns[52]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: Do you prefer broader categories with more products or more specific categories with fewer products? (Sample Size: n = 2481)


Unnamed: 0_level_0,Total,Percentage
Do you prefer broader categories with more products or more specific categories with fewer products?,Unnamed: 1_level_1,Unnamed: 2_level_1
A balance of both,1218,49.1
More specific categories with fewer products,631,25.4
Broader categories with more products,358,14.4
No preference,274,11.0


In [73]:
# Have you received one of our catalogs before?
question = kp_responses.columns[53]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: Have you received one of our catalogs before? (Sample Size: n = 2673)


Unnamed: 0_level_0,Total,Percentage
Have you received one of our catalogs before?,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,2017,75.5
No,656,24.5


In [74]:
# Do you enjoy receiving our product catalogs?
question = kp_responses.columns[54]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: Do you enjoy receiving our product catalogs? (Sample Size: n = 1880)


Unnamed: 0_level_0,Total,Percentage
Do you enjoy receiving our product catalogs?,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,1722,91.6
No,158,8.4


In [75]:
# How often do our catalogs introduce you to products you weren't aware of before?
question = kp_responses.columns[55]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: How often do our catalogs introduce you to products you weren't aware of before? (Sample Size: n = 1892)


Unnamed: 0_level_0,Total,Percentage
How often do our catalogs introduce you to products you weren't aware of before?,Unnamed: 1_level_1,Unnamed: 2_level_1
Sometimes,770,40.7
Often,690,36.5
Always,231,12.2
Rarely,157,8.3
Never,44,2.3


In [76]:
# Have you ever made a purchase after seeing a product in our catalog?
question = kp_responses.columns[56]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: Have you ever made a purchase after seeing a product in our catalog? (Sample Size: n = 1880)


Unnamed: 0_level_0,Total,Percentage
Have you ever made a purchase after seeing a product in our catalog?,Unnamed: 1_level_1,Unnamed: 2_level_1
"Yes, multiple times",816,43.4
"Yes, once",667,35.5
No,397,21.1


In [77]:
# Are you concerned about the environmental impact of receiving physical catalogs?
question = kp_responses.columns[57]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: Are you concerned about the environmental impact of receiving physical catalogs? (Sample Size: n = 1894)


Unnamed: 0_level_0,Total,Percentage
Are you concerned about the environmental impact of receiving physical catalogs?,Unnamed: 1_level_1,Unnamed: 2_level_1
Not concerned,858,45.3
Somewhat concerned,815,43.0
Very concerned,221,11.7


In [78]:
# Do you prefer receiving our catalogs digitally or in a physical format? 
question = kp_responses.columns[58]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: Do you prefer receiving our catalogs digitally or in a physical format? (Sample Size: n = 1894)


Unnamed: 0_level_0,Total,Percentage
Do you prefer receiving our catalogs digitally or in a physical format?,Unnamed: 1_level_1,Unnamed: 2_level_1
Physically,1055,55.7
Digitally,481,25.4
No preference,358,18.9


In [79]:
# What improvements would you suggest for our future catalogs?

# Print statement question.
question = kp_responses.columns[59]
#Same size.
sample_size = kp_responses.iloc[:, 59].notnull().sum()

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Initialize the Porter Stemmer
stemmer = PorterStemmer()

#Case sensitive, cast lower case. 
process_data = kp_responses.iloc[:, 59].dropna().astype(str).str.lower()  # Lowercase conversion

def clean_text(text):
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Apply cleaning function
tokenized_data = process_data.apply(clean_text)

# Apply stemming
stemmed_data = tokenized_data.apply(lambda x: [stemmer.stem(word) for word in x])

# Flatten the list of lists for n-grams analysis
all_words = [word for text in stemmed_data for word in text]

# Generate bi-grams and tri-grams
bi_grams = ngrams(all_words, 2)
tri_grams = ngrams(all_words, 3)

# Count and find the top 10 uni, bi, and tri-grams,
top_unigrams = Counter(all_words).most_common(10)
top_bi_grams = Counter(bi_grams).most_common(10)
top_tri_grams = Counter(tri_grams).most_common(10)

#Question statement
print(f'Question: {question} (Sample Size: n = {sample_size})')

# Line break
print("\n")

# Print the top 10 single word occurrences
print("Top 10 Single Word Occurrences:")
for word, count in top_unigrams:
    print(f"{word}: {count}")

# Line break
print("\n")

print("Top 10 Two Word Occurrences:")
for gram, count in top_bi_grams:
    print(f"{gram}: {count}")

# Add a line break
print("\n")

print("\nTop 10 Three Word Occurrences:")
for gram, count in top_tri_grams:
    print(f"{gram}: {count}")

Question: What improvements would you suggest for our future catalogs? (Sample Size: n = 374)


Top 10 Single Word Occurrences:
catalog: 116
yarn: 91
pattern: 76
color: 65
like: 63
would: 50
see: 43
project: 37
product: 36
paper: 30


Top 10 Two Word Occurrences:
('free', 'pattern'): 23
('paper', 'catalog'): 16
('like', 'see'): 15
('would', 'like'): 11
('physic', 'catalog'): 10
('receiv', 'catalog'): 8
('look', 'catalog'): 6
('recycl', 'paper'): 6
('love', 'see'): 6
('see', 'color'): 6



Top 10 Three Word Occurrences:
('would', 'like', 'see'): 6
('includ', 'free', 'pattern'): 5
('add', 'free', 'pattern'): 4
('use', 'recycl', 'paper'): 3
('enjoy', 'look', 'catalog'): 3
('like', 'receiv', 'catalog'): 3
('pleas', 'keep', 'come'): 2
('mayb', 'free', 'pattern'): 2
('side', 'side', 'comparison'): 2
('would', 'love', 'see'): 2


In [80]:
# Have you ever contacted our customer service before? 
question = kp_responses.columns[60]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(5))

Question: Have you ever contacted our customer service before? (Sample Size: n = 2673)


Unnamed: 0_level_0,Total,Percentage
Have you ever contacted our customer service before?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1740,65.1
Yes,933,34.9


In [81]:
# How would you rate the speed of our response to inquiries or issues?
question = kp_responses.columns[61]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(5))

Question: How would you rate the speed of our response to inquiries or issues? (Sample Size: n = 876)


Unnamed: 0_level_0,Total,Percentage
How would you rate the speed of our response to inquiries or issues?,Unnamed: 1_level_1,Unnamed: 2_level_1
Fast,392,44.7
Very Fast,293,33.4
Average,157,17.9
Slow,23,2.6
Very Slow,11,1.3


In [82]:
# How helpful do you find our customer service team?
question = kp_responses.columns[62]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(5))

Question: How helpful do you find our customer service team? (Sample Size: n = 874)


Unnamed: 0_level_0,Total,Percentage
How helpful do you find our customer service team?,Unnamed: 1_level_1,Unnamed: 2_level_1
Very Helpful,509,58.2
Helpful,279,31.9
Somewhat Helpful,40,4.6
Neutral,37,4.2
Not Helpful,9,1.0


In [83]:
# How well did our customer service team understand the products related to your inquiry?
question = kp_responses.columns[63]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(5))

Question: How well did our customer service team understand the products related to your inquiry? (Sample Size: n = 876)


Unnamed: 0_level_0,Total,Percentage
How well did our customer service team understand the products related to your inquiry?,Unnamed: 1_level_1,Unnamed: 2_level_1
Completely,426,48.6
Very Well,388,44.3
Moderately,49,5.6
Slightly,8,0.9
Not at All,5,0.6


In [84]:
# Overall, are you satisfied with the resolution our customer service provided?
question = kp_responses.columns[64]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(5))

Question: Overall, are you satisfied with the resolution our customer service provided? (Sample Size: n = 876)


Unnamed: 0_level_0,Total,Percentage
"Overall, are you satisfied with the resolution our customer service provided?",Unnamed: 1_level_1,Unnamed: 2_level_1
Very Satisfied,575,65.6
Satisfied,231,26.4
Neutral,33,3.8
Very Dissatisfied,24,2.7
Dissatisfied,13,1.5


In [85]:
# What is one thing we could improve about our customer service?
question = kp_responses.columns[65]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(5))

Question: What is one thing we could improve about our customer service? (Sample Size: n = 426)


Unnamed: 0_level_0,Total,Percentage
What is one thing we could improve about our customer service?,Unnamed: 1_level_1,Unnamed: 2_level_1
Nothing,40,9.4
nothing,9,2.1
Can’t think of anything,4,0.9
Nothing!,3,0.7
Not sure,3,0.7


In [86]:
# On a scale of 0-10, how likely are you to recommend us to a friend or family member?
question = kp_responses.columns[66]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: On a scale of 0-10, how likely are you to recommend us to a friend or family member? (Sample Size: n = 2470)


Unnamed: 0_level_0,Total,Percentage
"On a scale of 0-10, how likely are you to recommend us to a friend or family member?",Unnamed: 1_level_1,Unnamed: 2_level_1
10.0,1298,52.6
8.0,411,16.6
9.0,400,16.2
7.0,161,6.5
5.0,101,4.1
6.0,68,2.8
4.0,13,0.5
0.0,8,0.3
3.0,8,0.3
2.0,2,0.1


In [87]:
# What is your current age?
question = kp_responses.columns[67]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: What is your current age?   (Sample Size: n = 2369)


Unnamed: 0_level_0,Total,Percentage
What is your current age?,Unnamed: 1_level_1,Unnamed: 2_level_1
65-74,682,28.8
55-64,553,23.3
45-54,344,14.5
35-44,314,13.3
75 and older,236,10.0
25-34,168,7.1
Prefer not to answer,45,1.9
18-24,23,1.0
Under 18,4,0.2


In [88]:
# How do you identify? 
question = kp_responses.columns[68]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: How do you identify?    (Sample Size: n = 2409)


Unnamed: 0_level_0,Total,Percentage
How do you identify?,Unnamed: 1_level_1,Unnamed: 2_level_1
Woman,2262,93.9
Prefer not to answer,82,3.4
Non-Conforming / Non-Binary,31,1.3
Man,30,1.2
Transgender,4,0.2


In [89]:
# What is your employment status?
question = kp_responses.columns[69]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: What is your employment status?     (Sample Size: n = 2436)


Unnamed: 0_level_0,Total,Percentage
What is your employment status?,Unnamed: 1_level_1,Unnamed: 2_level_1
Retired,1068,43.8
Employed full-time,780,32.0
Employed part-time,237,9.7
Full-time homemaker,180,7.4
Prefer not to answer,100,4.1
Student,32,1.3
Looking for a job,22,0.9
Under-employed,17,0.7


In [90]:
# What is your highest level of education? 
question = kp_responses.columns[70]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: What is your highest level of education?     (Sample Size: n = 2452)


Unnamed: 0_level_0,Total,Percentage
What is your highest level of education?,Unnamed: 1_level_1,Unnamed: 2_level_1
Bachelor’s degree (4-year),843,34.4
Master’s degree,540,22.0
Some college (no degree),333,13.6
Associate degree (2-year),218,8.9
High school (including GED),143,5.8
Technical certification,104,4.2
Doctoral degree,94,3.8
"Professional degree (JD, MD)",89,3.6
Prefer not to answer,76,3.1
Less than High School,12,0.5


In [91]:
# What is your marital status?
question = kp_responses.columns[71]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary.head(10))

Question: What is your marital status?    (Sample Size: n = 2448)


Unnamed: 0_level_0,Total,Percentage
What is your marital status?,Unnamed: 1_level_1,Unnamed: 2_level_1
Married,1635,66.8
Single,335,13.7
Divorced,185,7.6
Widowed,148,6.0
Prefer not to answer,132,5.4
Separated,13,0.5


In [92]:
# What is your total household income in U.S. dollars?
question = kp_responses.columns[72]
summary, sample_size = summarize_column(kp_responses, question)

print(f'Question: {question} (Sample Size: n = {sample_size})')
display(summary)

Question: What is your total household income in U.S. dollars?   (Sample Size: n = 2524)


Unnamed: 0_level_0,Total,Percentage
What is your total household income in U.S. dollars?,Unnamed: 1_level_1,Unnamed: 2_level_1
Prefer not to answer,884,35.0
"$100,000 - $124,999",236,9.4
"$50,000 - $64,999",224,8.9
"$80,000 - $99,999",214,8.5
"$65,000 - $79,999",197,7.8
"$35,000 - $49,999",191,7.6
"$20,000 - $34,999",130,5.2
"$125,000 - $149,999",117,4.6
"Over $200,000",113,4.5
"$150,000 - $174,999",86,3.4


# Survey End