In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the gym members dataset from a CSV file in the working directory
df = pd.read_csv("Gym_Members_Exercise_Dataset.csv")

# Display the first few rows of the dataset
print("Dataset Preview:")
print(df.head())

# Check basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
df.info()

# Check for missing values (null count per column)
print("\nMissing Values:")
print(df.isnull().sum())

# Statistical summary of the dataset
print("\nStatistical Summary:")
print(df.describe())

# Remove exact duplicate rows, modifying the DataFrame in place
df.drop_duplicates(inplace=True)

# Cast columns to their intended dtypes: integer ages, categorical gender
for column_name, target_dtype in {'Age': int, 'Gender': 'category'}.items():
    df[column_name] = df[column_name].astype(target_dtype)

# Outlier check: one box per body-measurement column, drawn on shared axes
body_metric_columns = [
    'Weight (kg)',
    'Height (m)',
    'BMI',
    'Fat_Percentage',
    'Muscle_Mass_Percentage',
]
_, outlier_ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=df[body_metric_columns], ax=outlier_ax)
outlier_ax.set_title("Boxplot of Numerical Columns")
plt.xticks(rotation=45)  # tilt the column names along the x axis
plt.show()

# Distribution of Age: histogram with a kernel density estimate overlaid
plt.figure(figsize=(12, 6))
sns.histplot(df['Age'], kde=True, color='skyblue')
plt.title("Age Distribution")
plt.show()

# Distribution of Gender: one bar per category.
# seaborn 0.13 deprecates passing `palette` without `hue`, so the x variable
# is also assigned to `hue`; dodge=False keeps each bar centred on its tick.
plt.figure(figsize=(8, 4))
gender_ax = sns.countplot(data=df, x='Gender', hue='Gender', palette='Set2', dodge=False)
# Some seaborn releases draw a legend for the redundant hue; drop it if present.
gender_legend = gender_ax.get_legend()
if gender_legend is not None:
    gender_legend.remove()
plt.title("Gender Distribution")
plt.show()

# Distribution of Workout Type: one bar per workout type.
# seaborn 0.13 deprecates passing `palette` without `hue`, so the x variable
# is also assigned to `hue`; dodge=False keeps each bar centred on its tick.
plt.figure(figsize=(12, 6))
workout_ax = sns.countplot(
    data=df,
    x='Workout_Type',
    hue='Workout_Type',
    palette='viridis',
    dodge=False,
)
# Some seaborn releases draw a legend for the redundant hue; drop it if present.
workout_legend = workout_ax.get_legend()
if workout_legend is not None:
    workout_legend.remove()
plt.title("Distribution of Workout Types")
plt.xticks(rotation=45)
plt.show()

# Fat percentage spread for each weekly workout frequency, split by gender
_, frequency_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df,
    x='Workout_Frequency (days/week)',
    y='Fat_Percentage',
    hue='Gender',
    ax=frequency_ax,
)
frequency_ax.set_title("Effect of Workout Frequency on Fat Percentage")
plt.show()

# Correlation heatmap.
# Only numeric columns can be correlated: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises on string/category
# columns, so the frame is restricted to numeric dtypes first.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Heatmap")
plt.show()

# Calories burned against session length, one colour per workout type
_, calories_ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='Session_Duration (hours)',
    y='Calories_Burned',
    hue='Workout_Type',
    palette='Set1',
    ax=calories_ax,
)
calories_ax.set_title("Session Duration vs Calories Burned by Workout Type")
plt.show()

# Fat percentage against BMI, coloured by gender
_, bmi_ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='BMI',
    y='Fat_Percentage',
    hue='Gender',
    palette='Set2',
    ax=bmi_ax,
)
bmi_ax.set_title("BMI vs Fat Percentage")
plt.show()

# Histogram of muscle mass percentage with a kernel density estimate overlaid
_, muscle_ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Muscle_Mass_Percentage', kde=True, color='green', ax=muscle_ax)
muscle_ax.set_title("Distribution of Muscle Mass Percentage")
plt.show()

# The ten rows with the largest BMI values
top_10_bmi = df.nlargest(n=10, columns='BMI')
print("\nTop 10 Highest BMI Records:", top_10_bmi, sep="\n")

# The ten rows with the smallest BMI values
bottom_10_bmi = df.nsmallest(n=10, columns='BMI')
print("\nTop 10 Lowest BMI Records:", bottom_10_bmi, sep="\n")

# Per-workout-type means of session length, calories and body composition
averaged_columns = [
    'Session_Duration (hours)',
    'Calories_Burned',
    'Fat_Percentage',
    'Muscle_Mass_Percentage',
]
mean_by_column = {column: 'mean' for column in averaged_columns}
# reset_index() turns the Workout_Type group key back into a regular column
grouped_df = df.groupby('Workout_Type').agg(mean_by_column).reset_index()

print("\nSummary Statistics by Workout Type:", grouped_df, sep="\n")

# Bars show mean fat percentage per workout type; the overlaid line shows
# mean muscle mass percentage. Both are drawn on the same axes.
_, composition_ax = plt.subplots(figsize=(14, 6))
sns.barplot(
    data=grouped_df,
    x='Workout_Type',
    y='Fat_Percentage',
    color='lightcoral',
    label='Average Fat Percentage',
    ax=composition_ax,
)
sns.lineplot(
    data=grouped_df,
    x='Workout_Type',
    y='Muscle_Mass_Percentage',
    color='blue',
    marker='o',
    label='Average Muscle Mass Percentage',
    ax=composition_ax,
)
composition_ax.set_title("Average Fat Percentage and Muscle Mass Percentage by Workout Type")
plt.xticks(rotation=45)
composition_ax.legend()
plt.show()
