In [1]:
import pandas as pd
data = pd.read_csv('thanksgiving.csv', encoding="Latin-1")
print(data.head())

   RespondentID Do you celebrate Thanksgiving?  \
0    4337954960                            Yes   
1    4337951949                            Yes   
2    4337935621                            Yes   
3    4337933040                            Yes   
4    4337931983                            Yes   

  What is typically the main dish at your Thanksgiving dinner?  \
0                                             Turkey             
1                                             Turkey             
2                                             Turkey             
3                                             Turkey             
4                                           Tofurkey             

  What is typically the main dish at your Thanksgiving dinner? - Other (please specify)  \
0                                                NaN                                      
1                                                NaN                                      
2                            

In [2]:
print(data.columns)

Index(['RespondentID', 'Do you celebrate Thanksgiving?',
       'What is typically the main dish at your Thanksgiving dinner?',
       'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)',
       'How is the main dish typically cooked?',
       'How is the main dish typically cooked? - Other (please specify)',
       'What kind of stuffing/dressing do you typically have?',
       'What kind of stuffing/dressing do you typically have? - Other (please specify)',
       'What type of cranberry saucedo you typically have?',
       'What type of cranberry saucedo you typically have? - Other (please specify)',
       'Do you typically have gravy?',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Brussel sprouts',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Carrots',
       'Which of these side dishes aretypically served

In [3]:
# pandas.Series.value_counts()

col = 'Do you celebrate Thanksgiving?'
print(data[col].value_counts())
print(data[col].dtype)

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64
object


In [4]:
# remove rows which do not celebrate Thanksgiving

yes_rows = (data[col] == 'Yes')
data = data[yes_rows]
print(data[col].value_counts())

Yes    980
Name: Do you celebrate Thanksgiving?, dtype: int64


In [5]:
# main_dish pandas.Series.value_counts()
col_main_dish = 'What is typically the main dish at your Thanksgiving dinner?'
print(data[col_main_dish].value_counts())

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64


In [6]:
# Display the Do you typically have gravy? 
# column for any rows from data where the 
# What is typically the main dish at your Thanksgiving dinner? column equals Tofurkey.

rows = data[data[col_main_dish] == 'Tofurkey']
col_gravy = 'Do you typically have gravy?'
print(data[col_gravy].value_counts())

Yes    892
No      82
Name: Do you typically have gravy?, dtype: int64


In [7]:
# Analyse dessert
col_apple = 'Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple'
col_pumpkin = 'Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin'
col_pecon = 'Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan'
no_pies = pd.isnull(data[col_apple]) & pd.isnull(data[col_pumpkin]) & pd.isnull(data[col_pecon])
print(no_pies.unique())
print(no_pies.value_counts())

[False  True]
False    876
True     104
dtype: int64


In [8]:
# Analyse age
# pd.Series.describe()

def convertAge(age):
    if pd.isnull(age):
        return None
    parts = age.split()
    age = parts[0]
    if age.startswith('6'):
        age = age[:-1]
    age = int(age)
    return age
data['int_age'] = data['Age'].apply(convertAge)
data['int_age'].describe()

count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64

<b>Is there anything that we should be aware of about the results or our methodology?</b>
- Should not use a single value to represent a range value
- Should not use the lowest value in the range
<br/>

<b>Is this a true depiction of the ages of survey participants?</b>
- No

In [9]:
# Analyse household income
def convertIncome(income):
    if pd.isnull(income) or income.startswith('P'):
        return None
    parts = income.split()
    income = parts[0]
    income = income.replace('$', '')
    income = income.replace(',', '')
    
    return int(income)
col_income = 'How much total combined money did all members of your HOUSEHOLD earn last year?'
data['int_income'] = data[col_income].apply(convertIncome)
data['int_income'].describe()

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64

In [13]:
# Analyse correlating Travel Distance And Income
rows = data[data['int_income'] < 150000]
col = 'How far will you travel for Thanksgiving?'
print('Income under 15000:')
print(rows[col].value_counts(normalize=True))

rows= data[data['int_income'] >= 150000]
print('Income above 15000:')
print(rows[col].value_counts(normalize=True))

Income under 15000:
Thanksgiving is happening at my home--I won't travel at all                         0.407837
Thanksgiving is local--it will take place in the town I live in                     0.294630
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    0.217707
Thanksgiving is out of town and far away--I have to drive several hours or fly      0.079826
Name: How far will you travel for Thanksgiving?, dtype: float64
Income above 15000:
Thanksgiving is happening at my home--I won't travel at all                         0.471429
Thanksgiving is local--it will take place in the town I live in                     0.242857
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    0.178571
Thanksgiving is out of town and far away--I have to drive several hours or fly      0.107143
Name: How far will you travel for Thanksgiving?, dtype: float64


In [18]:
# Linking Friendship And Age
# DataFrame.pivot_table()
import numpy as np
index = ['Have you ever tried to meet up with hometown friends on Thanksgiving night?']
column = ['Have you ever attended a "Friendsgiving?"']
tb = data.pivot_table(index=index, columns=column, values='int_age', aggfunc=np.mean)
print(tb)



Have you ever attended a "Friendsgiving?"                  No        Yes
Have you ever tried to meet up with hometown fr...                      
No                                                  42.283702  37.010526
Yes                                                 41.475410  33.976744


In [19]:
index = ['Have you ever tried to meet up with hometown friends on Thanksgiving night?']
column = ['Have you ever attended a "Friendsgiving?"']
tb = data.pivot_table(index=index, columns=column, values='int_income', aggfunc=np.mean)
print(tb)


Have you ever attended a "Friendsgiving?"                     No           Yes
Have you ever tried to meet up with hometown fr...                            
No                                                  78914.549654  72894.736842
Yes                                                 78750.000000  66019.736842


In [45]:
# most common dessert
prefix = 'Which of these desserts do you typically have at Thanksgiving dinner?'
col_dessert = [c for c in list(data.columns) if c.startswith(prefix)]
def count_dessert_consumption(dessert_col):
    return dessert_col.size - sum(dessert_col.isnull())
df = data[col_dessert]
s = df.apply(count_dessert_consumption)
s = s /sum(s)
s.sort_values()

Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Blondies                    0.009434
Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Fudge                       0.025354
Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Carrot cake                 0.042453
Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Peach cobbler               0.060731
Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Apple cobbler               0.064858
Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Brownies                    0.075472
Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Other (please specify)      0.079009
Which of thes

In [52]:
# most common complete meal
# print(data.columns)
col = 'What is typically the main dish at your Thanksgiving dinner?'
col2 = 'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)'
print(data[col].value_counts())
print(data[col2].value_counts())

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64
Prime Rib                                                                                2
Turkey and Ham                                                                           2
seafood                                                                                  2
Turkey and Vegetarian Turkey                                                             1
Varies significantly by year as one group of people we tend to be with are vegetarian    1
salmon                                                                                   1
Restaurant with various choices.                                                         1
Turkey & Ham or Seafood Hotdish (a secret 

In [55]:
# Identify how many people work on Thanksgiving.
# print(data.columns)
col = 'Will you employer make you work on Black Friday?'
print(data[col].value_counts())

Yes              43
No               20
Doesn't apply     7
Name: Will you employer make you work on Black Friday?, dtype: int64
