In [39]:
import pandas as pd


In [40]:
# Read the raw survey responses from disk into `df`.
df = pd.read_csv(filepath_or_buffer='../data/raw/survey.csv')

def _normalize_item_list(answer):
    """Return the comma-separated items of ``answer`` trimmed, sorted and re-joined.

    Each item is stripped of surrounding whitespace, empty items (e.g. from a
    trailing or doubled comma) are dropped, and the remaining items are sorted
    and joined with ', '. An empty answer yields an empty string.
    """
    items = (part.strip() for part in answer.split(','))
    return ', '.join(sorted(item for item in items if item))


# Work on a copy so the raw frame `df` is left unmodified.
df_clean = df.copy()

# Object-dtype (string) columns, skipping any whose name contains "timestamp".
text_cols = [
    col
    for col in df_clean.select_dtypes(include='object').columns
    if 'timestamp' not in col.lower()
]

# Normalize free text: missing -> '', lowercase, drop punctuation (commas are
# kept because they separate list items), collapse whitespace runs, then trim.
# Trimming happens last on purpose: removing punctuation can expose new
# leading/trailing spaces ("yogurt !" -> "yogurt ") that an early strip misses,
# and can leave doubled interior spaces ("a - b" -> "a  b").
for col in text_cols:
    df_clean[col] = (
        df_clean[col]
        .fillna('')
        .str.lower()
        .str.replace(r'[^\w\s,]', '', regex=True)  # Remove punctuation but keep commas
        .str.replace(r'\s+', ' ', regex=True)      # Collapse runs of whitespace
        .str.strip()
    )

# Normalize food lists in the survey questions: only text columns whose name
# mentions "dining hall" are treated as comma-separated lists. Sorting the
# items makes answers that differ only in item order compare equal.
meal_cols = [col for col in text_cols if 'dining hall' in col.lower()]
for col in meal_cols:
    df_clean[col] = df_clean[col].apply(_normalize_item_list)

# Drop duplicates based on all columns except Timestamp, so identical answers
# submitted at different times count as one row (the first is kept).
cols_to_check = [col for col in df_clean.columns if 'timestamp' not in col.lower()]
df_clean = df_clean.drop_duplicates(subset=cols_to_check)

df_clean

Unnamed: 0,Timestamp,(1) What did you have for breakfast today from the dining hall?,(2) What did you have for lunch from the dining hall? (yesterday),(3) What did you have for dinner from the dining hall? (yesterday),How have you liked the dining hall food so far?
0,7/8/2025 9:48:55,"cereal, milk","burger, pizza",mongolian beef and rice,8.0
1,7/8/2025 9:49:04,yogurt,seared salmon,mongolian beef and rice,5.0
2,7/8/2025 9:49:18,"nothing, potato wedges, yogurt and sausage",mongolian wok,"mongolian beef and rice, the beef was so bad",6.0
3,7/8/2025 9:49:30,"cereal, i ate the omelette, ice cream, milk, n...","burger, carnitas, chicken and rice, i ate the ...","burger, mongolian beef and rice, nothing, othe...",10.0
4,7/8/2025 9:49:39,"cereal, ice cream, milk, nothing, omelets, par...","a cow, burger, carnitas, chicken and rice, mon...","burger, mongolian beef and rice, nothing, pizz...",7.0
5,7/8/2025 9:49:46,milk,nothing,nothing,2.0
6,7/8/2025 9:49:48,"potato wedges, the eggs are not good because t...","burger, carnitas, salad, seared salmon, vegan ...","burger, nothing, pizza, soup",6.0
7,7/8/2025 9:50:04,banana,"carnitas, chicken and rice, seared salmon","mongolian beef and rice, pizza, quesadilla",8.0
8,7/8/2025 9:50:06,"nothing, omelets, potato wedges","burger, carnitas, chicken and rice, mongolian ...",mongolian beef and rice,1.0
9,7/8/2025 9:50:09,parfeit,"fries, seared salmon",pizza,7.0
