In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw survey export; the file is resolved relative to the notebook's working directory.
survey_csv_path = 'Team_Sports_Survey.csv'
data = pd.read_csv(filepath_or_buffer=survey_csv_path)
data

In [None]:
# Rename columns by position: the columns at positions 6 through 45 of the
# loaded frame receive the names below, in order. Columns 0-5 keep the
# headers they had in the CSV.
renamed_headers = [
    'First Preferred Activity Upon Resort Booking',
    'Chosen Activities',
    'Expected Time at Swimming Pool',
    'Expected Time at Beach',
    'Expected Time in Recreational Room',
    'Expected Time at Spa and Wellness',
    'Expected Time at Gym and Training Area',
    'Expected Time at Sports Center',
    'Preferred Swimming Pool Activities',
    'Top Priority Swimming Pool Activity',
    'Expected Shower Room Items (For Swimming Participants)',
    'Preferred Beach Activities',
    'Top Priority Beach Activity',
    'Preferred Recreational Room Activities',
    'Top Priority Recreational Room Activity',
    'Preferred Spa and Wellness Activities',
    'Top Priority Spa and Wellness Activity',
    'Preferred Gym and Training Activities',
    'Top Priority Gym and Training Activity',
    'Preferred Sports Center Activities',
    'Top Priority Sports Center Activity',
    'Preferred Check-in Days',
    'Number of Companions',
    'Willingness to Pay Separate Entrance Fees',
    'Maximum Wait Time for Swimming Pool',
    'Maximum Wait Time for Beach',
    'Maximum Wait Time for Recreational Room',
    'Maximum Wait Time for Spa and Wellness',
    'Maximum Wait Time for Gym and Training Area',
    'Maximum Wait Time for Sports Center',
    'Post-Recreational Activity',
    'Preferred Time for Swimming Pool Activities',
    'Preferred Time for Beach Activities',
    'Preferred Time for Recreational Room Activities',
    'Preferred Time for Spa and Wellness Activities',
    'Preferred Time for Gym and Training Activities',
    'Preferred Time for Sports Center Activities',
    'Snack and Beverage Consumption During Activities',
    'Preferred Snacks During Activities',
    'Preferred Beverage During Activities',
]

# Position of the first column to rename; later names follow consecutively.
first_renamed_position = 6

# Indexing data.columns by position raises IndexError if the frame has fewer
# columns than expected, so a truncated CSV fails loudly here.
data.rename(
    columns={
        data.columns[position]: header
        for position, header in enumerate(renamed_headers, start=first_renamed_position)
    },
    inplace=True,
)
data

In [None]:
# Columns whose text values get the "12AM" -> "12PM" correction below
time_columns = [
    'Expected Time at Swimming Pool',
    'Expected Time at Beach',
    'Expected Time in Recreational Room',
    'Expected Time at Spa and Wellness',
    'Expected Time at Gym and Training Area',
    'Expected Time at Sports Center',
    'Preferred Time for Swimming Pool Activities',
    'Preferred Time for Beach Activities',
    'Preferred Time for Recreational Room Activities',
    'Preferred Time for Spa and Wellness Activities',
    'Preferred Time for Gym and Training Activities',
    'Preferred Time for Sports Center Activities',
]


def _swap_12am_for_12pm(column):
    """Return `column` with every occurrence of "12AM" replaced by "12PM".

    The replacement is a substring match, so it also applies when "12AM"
    appears inside a longer value such as a time range.
    """
    return column.str.replace("12AM", "12PM", regex=True)


# Apply the correction column by column and write the results back in one assignment
data[time_columns] = data[time_columns].apply(_swap_12am_for_12pm)

data

In [None]:
# Organize the columns into logical groups. Any column of `data` that is not
# listed here is dropped by the selection at the end of this cell.
column_order = [
    # Demographics
    'Gender', 'Occupational Status', 'Marital Status', 'Age Group', 'Monthly Income (Php)',

    # Booking Preferences
    'First Preferred Activity Upon Resort Booking', 'Chosen Activities',

    # Expected Time at Facilities
    'Expected Time at Swimming Pool', 'Expected Time at Beach', 'Expected Time in Recreational Room',
    'Expected Time at Spa and Wellness', 'Expected Time at Gym and Training Area', 'Expected Time at Sports Center',

    # Preferred Activities by Facility
    'Preferred Swimming Pool Activities', 'Top Priority Swimming Pool Activity',
    'Preferred Beach Activities', 'Top Priority Beach Activity',
    'Preferred Recreational Room Activities', 'Top Priority Recreational Room Activity',
    'Preferred Spa and Wellness Activities', 'Top Priority Spa and Wellness Activity',
    'Preferred Gym and Training Activities', 'Top Priority Gym and Training Activity',
    'Preferred Sports Center Activities', 'Top Priority Sports Center Activity',

    # Facility Usage Preferences
    'Preferred Check-in Days', 'Number of Companions', 'Willingness to Pay Separate Entrance Fees',

    # Maximum Wait Times
    'Maximum Wait Time for Swimming Pool', 'Maximum Wait Time for Beach', 'Maximum Wait Time for Recreational Room',
    'Maximum Wait Time for Spa and Wellness', 'Maximum Wait Time for Gym and Training Area', 'Maximum Wait Time for Sports Center',

    # Post-Activity Preferences
    'Post-Recreational Activity',

    # Preferred Time for Activities
    'Preferred Time for Swimming Pool Activities', 'Preferred Time for Beach Activities', 'Preferred Time for Recreational Room Activities',
    'Preferred Time for Spa and Wellness Activities', 'Preferred Time for Gym and Training Activities', 'Preferred Time for Sports Center Activities',

    # Snacks and Beverages
    'Snack and Beverage Consumption During Activities', 'Preferred Snacks During Activities', 'Preferred Beverage During Activities',

    # Additional Preferences
    'Expected Shower Room Items (For Swimming Participants)'
]

# Reorder the dataset columns. A KeyError here means one of the names above
# is missing from the frame.
# The explicit .copy() makes the result an independent frame: later cells
# modify `data` in place (.loc assignment, fillna/drop with inplace=True),
# and doing that on a bare column selection triggers SettingWithCopyWarning
# on pandas versions without Copy-on-Write.
data = data[column_order].copy()

data

In [None]:
# Shorten the Age Group labels: the literal phrases "and below" / "and above"
# become "<" / ">" (plain substring replacement, no regex).
age_group_labels = (
    data['Age Group']
    .str.replace("and below", "<", regex=False)
    .str.replace("and above", ">", regex=False)
)
data.loc[:, 'Age Group'] = age_group_labels

data

In [None]:
# Shorten the Monthly Income (Php) labels: the literal phrase "Less than"
# becomes "<" (plain substring replacement, no regex).
income_labels = data['Monthly Income (Php)'].str.replace("Less than", "<", regex=False)
data.loc[:, 'Monthly Income (Php)'] = income_labels

data

In [None]:
# Flag, per column, whether it holds at least one missing value (NaN) or
# at least one empty string.
has_missing = data.isna().any(axis=0)
has_empty_string = data.eq('').any(axis=0)
columns_with_empty_values = has_missing | has_empty_string

# One boolean per column: True means the column needs attention
print("Columns with empty values:", columns_with_empty_values, sep="\n")

In [None]:
# Impute the string 'None' for missing values in the activity and item columns
none_fill_columns = [
    'Preferred Swimming Pool Activities',
    'Top Priority Swimming Pool Activity',
    'Preferred Beach Activities',
    'Top Priority Beach Activity',
    'Preferred Recreational Room Activities',
    'Top Priority Recreational Room Activity',
    'Preferred Spa and Wellness Activities',
    'Top Priority Spa and Wellness Activity',
    'Preferred Gym and Training Activities',
    'Top Priority Gym and Training Activity',
    'Preferred Sports Center Activities',
    'Top Priority Sports Center Activity',
    'Post-Recreational Activity',
    'Preferred Snacks During Activities',
    'Preferred Beverage During Activities',
    'Expected Shower Room Items (For Swimming Participants)',
]

# Same fill value for every listed column; columns not listed are left untouched
data.fillna(dict.fromkeys(none_fill_columns, 'None'), inplace=True)
data

In [None]:
# Respondents who answered 'No' to consuming snacks and beverages during
# activities get 'N/A' in both preference columns, overwriting whatever
# value was there.
no_snack_mask = data['Snack and Beverage Consumption During Activities'].eq('No')
data.loc[
    no_snack_mask,
    ['Preferred Snacks During Activities', 'Preferred Beverage During Activities'],
] = 'N/A'
data



In [None]:
# Impute 0 for missing values in the expected-time and maximum-wait-time columns.
# The fill value is the integer 0, not the string '0'.
zero_fill_columns = [
    'Expected Time at Swimming Pool',
    'Expected Time at Beach',
    'Expected Time in Recreational Room',
    'Expected Time at Spa and Wellness',
    'Expected Time at Gym and Training Area',
    'Expected Time at Sports Center',
    'Maximum Wait Time for Swimming Pool',
    'Maximum Wait Time for Beach',
    'Maximum Wait Time for Recreational Room',
    'Maximum Wait Time for Spa and Wellness',
    'Maximum Wait Time for Gym and Training Area',
    'Maximum Wait Time for Sports Center',
]

data.fillna({column: 0 for column in zero_fill_columns}, inplace=True)
data

In [None]:
# Columns excluded from the descriptive and predictive analysis.
# NOTE(review): "Recreational Room", "Gym and Training Area" and
# "Sports Center" are not among the names assigned in the rename cell;
# presumably leftovers from an earlier schema — confirm and prune.
unused_columns = [
    "Occupational Status", "Marital Status", "Recreational Room",
    "Gym and Training Area", "Sports Center",
    "Preferred Recreational Room Activities", "Top Priority Recreational Room Activity",
    "Preferred Gym and Training Activities", "Top Priority Gym and Training Activity",
    "Preferred Sports Center Activities", "Top Priority Sports Center Activity",
    "Post-Recreational Activity",
    "Preferred Time for Recreational Room Activities", "Preferred Time for Gym and Training Activities",
    "Preferred Time for Sports Center Activities",
    "Maximum Wait Time for Recreational Room", "Maximum Wait Time for Gym and Training Area",
    "Maximum Wait Time for Sports Center",
    "Expected Time in Recreational Room", "Expected Time at Gym and Training Area",
    "Expected Time at Sports Center",
]

# errors='ignore' skips any listed name that is absent, so re-running this
# cell after the columns are already gone does not raise.
data.drop(columns=unused_columns, errors='ignore', inplace=True)
data

In [None]:
# Write the preprocessed frame to a new CSV file, without the index column.
# NOTE(review): the 'N/A' sentinel written by an earlier cell is one of
# pandas.read_csv's default missing-value markers (and 'None' is too in
# recent pandas versions), so a reader must pass keep_default_na=False to
# get these strings back verbatim — confirm how downstream loads this file.
data.to_csv(path_or_buf='Team_Sports_Survey_Corrected.csv', index=False)

In [38]:
# Due to inaccuracy in the survey, rows answering 'No' in
# 'Snack and Beverage Consumption During Activities' get 'N/A' in both
# 'Preferred Snacks During Activities' and 'Preferred Beverage During Activities'.
# This repeats the overwrite and the save already performed by earlier cells,
# then rewrites the same output file.
declined_snacks = data['Snack and Beverage Consumption During Activities'] == 'No'
snack_preference_columns = ['Preferred Snacks During Activities', 'Preferred Beverage During Activities']
data.loc[declined_snacks, snack_preference_columns] = 'N/A'

corrected_csv_path = 'Team_Sports_Survey_Corrected.csv'
data.to_csv(corrected_csv_path, index=False)