In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Import fuzzy matching library
from rapidfuzz import process, fuzz

# Step 2: Read both source tables from CSV files in the working directory
# (change the file names here if the data lives elsewhere).
gym_members_df = pd.read_csv(filepath_or_buffer='gym_members_exercise_tracking.csv')
workout_tracker_df = pd.read_csv(filepath_or_buffer='workout_fitness_tracker_data.csv')

# Step 3: Clean text function for merge keys (improved with extra normalization)
def clean_text(s):
    """Normalize a free-text label into a comparable merge key.

    The text is lower-cased and trimmed, hyphens and underscores become
    spaces, every character that is neither alphanumeric nor whitespace is
    dropped, and the words 'the', 'and', 'of', 'in' are removed. Remaining
    words are joined with single spaces.

    Returns an empty string for any non-string input.
    """
    if not isinstance(s, str):
        return ''

    ignored_words = {'the', 'and', 'of', 'in'}  # remove common stopwords
    normalized = s.lower().strip().replace('-', ' ').replace('_', ' ')
    kept_chars = [ch for ch in normalized if ch.isalnum() or ch.isspace()]
    tokens = ''.join(kept_chars).split()
    return ' '.join(tok for tok in tokens if tok not in ignored_words)

# Step 4: Derive a normalized 'workout_clean' key on both tables
gym_members_df['workout_clean'] = gym_members_df['Workout_Type'].map(clean_text)
workout_tracker_df['workout_clean'] = workout_tracker_df['Workout Type'].map(clean_text)

# Step 5: Fuzzy matching - candidate list for the matcher

# Distinct cleaned tracker workout names, in order of first appearance
tracker_workouts = workout_tracker_df['workout_clean'].drop_duplicates().tolist()

def get_best_match(name):
    """Return the tracker workout name that best matches `name`.

    Candidates come from the module-level `tracker_workouts` list and are
    scored with rapidfuzz's `fuzz.token_sort_ratio`.

    Parameters
    ----------
    name : str
        Cleaned workout name to look up. Falsy values (e.g. '') are not
        matched.

    Returns
    -------
    str or None
        The best-scoring candidate when its score is at least 80,
        otherwise None. None is also returned when there is nothing to
        match against.
    """
    if not name:
        return None

    best = process.extractOne(name, tracker_workouts, scorer=fuzz.token_sort_ratio)
    # extractOne yields None (not a tuple) when no candidate is available,
    # e.g. an empty choices list; unpacking it directly would raise TypeError.
    if best is None:
        return None

    match, score, _ = best
    return match if score >= 80 else None  # threshold can be adjusted

# Look up the best tracker-side name for every gym row's cleaned workout name
gym_members_df['fuzzy_match'] = gym_members_df['workout_clean'].map(get_best_match)

# Keep only the gym rows for which a match was found (independent copy)
gym_members_filtered = gym_members_df.dropna(subset=['fuzzy_match']).copy()
print(f"Number of fuzzy matched workouts: {len(gym_members_filtered)}")

# Step 6 (Optional): Merge on broader category if available
# Uncomment & adjust if you want to try merge on a broader feature like 'Type' or 'Category'
# Example:
# if 'Type' in gym_members_df.columns and 'Type' in workout_tracker_df.columns:
#     combined_df = pd.merge(gym_members_df, workout_tracker_df, left_on='Type', right_on='Type', how='inner')
#     print(f"Shape after merge on Type: {combined_df.shape}")
# else:
#     # Use fuzzy match key for merging (Step 7 below)

# Step 7: Left-join matched gym rows to tracker rows on the workout name.
# NOTE(review): the key is a workout name, not a person, so every gym row is
# paired with every tracker row sharing that name (the recorded run grew from
# 973 matched rows to 1,630,286 combined rows) - confirm this is intended.
combined_df = gym_members_filtered.merge(
    workout_tracker_df,
    how='left',
    left_on='fuzzy_match',
    right_on='workout_clean',
    suffixes=('_gym', '_tracker'),
)
print(f"Combined shape after fuzzy merge: {combined_df.shape}")

# Step 8: Impute missing values by plain column re-assignment
# Numeric columns: fill gaps with the column median
for col in combined_df.select_dtypes(include='number').columns:
    column_median = combined_df[col].median()
    combined_df[col] = combined_df[col].fillna(column_median)

# Text (object) columns: fill gaps with the literal 'missing'
for col in combined_df.select_dtypes(include=['object']).columns:
    combined_df[col] = combined_df[col].fillna('missing')

# Step 9: Resolve the target first so it can never end up among the features
# (previously 'Workout Intensity' was both a feature and the fallback target).
if 'Experience_Level' in combined_df.columns:
    target_col = 'Experience_Level'
elif 'Workout Intensity' in combined_df.columns:
    target_col = 'Workout Intensity'
else:
    raise ValueError("No suitable target column found")

# The merge only adds a suffix to names present in BOTH tables. The gym table's
# 'Calories_Burned' has no identically named tracker column, so it keeps its
# plain name; fall back to it when the suffixed name is absent.
calories_col = (
    'Calories_Burned_gym' if 'Calories_Burned_gym' in combined_df.columns
    else 'Calories_Burned'
)

feature_cols = [
    'Age_gym', 'Gender_gym', 'Weight (kg)_gym', 'Height (m)', calories_col,
    'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)',
    'Workout Duration (mins)', 'Calories Burned', 'Heart Rate (bpm)', 'Steps Taken',
    'Distance (km)', 'Workout Intensity'
]
feature_cols = [
    col for col in feature_cols
    if col in combined_df.columns and col != target_col
]

# Step 10: Prepare X, y for model (X is a copy; combined_df is left untouched
# so re-running this cell gives the same result)
X = combined_df[feature_cols].copy()
y = combined_df[target_col]

# Identify categorical and numeric features
cat_features = [col for col in feature_cols if X[col].dtype == 'object']
num_features = [col for col in feature_cols if col not in cat_features]

# Label encode categorical features (integer codes only; no target information)
for col in cat_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Numeric features become floats up front so scaled values can be written back
X = X.astype({col: 'float64' for col in num_features})

# Step 11: Train-test split BEFORE scaling; stratify keeps the class mix equal
# NOTE(review): the merged frame repeats each source row many times (973
# matched gym rows became 1,630,286 rows in the recorded run), so a random
# split can place copies of the same record on both sides. Treat the reported
# accuracy as optimistic until the join/split strategy is revisited.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train = X_train.copy()
X_test = X_test.copy()

# Step 12: Scale numeric features using statistics from the training rows only,
# so nothing about the test rows leaks into preprocessing
scaler = StandardScaler()
if num_features:
    X_train[num_features] = scaler.fit_transform(X_train[num_features])
    X_test[num_features] = scaler.transform(X_test[num_features])

# Step 13: Train Random Forest model (n_jobs=-1 uses all cores; results are
# unchanged because random_state is fixed)
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Step 14: Predict and evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Number of fuzzy matched workouts: 973
Combined shape after fuzzy merge: (1630286, 38)


In [2]:
import pandas as pd
import numpy as np

# Read both source tables again so this cell starts from unmodified data
gym_members_df = pd.read_csv(filepath_or_buffer='gym_members_exercise_tracking.csv')
workout_tracker_df = pd.read_csv(filepath_or_buffer='workout_fitness_tracker_data.csv')

# Vectorized text cleaning function for workout type columns
def fast_clean_text(series):
    """Vectorized normalization of a string Series into merge keys.

    Steps, applied with pandas `.str` methods: lower-case; turn hyphens and
    underscores into spaces; drop every character that is not a-z, 0-9 or
    whitespace; collapse whitespace runs into one space; trim both ends.

    Trimming is done last on purpose: removing punctuation can expose new
    leading/trailing spaces (e.g. 'yoga-' -> 'yoga '), which would otherwise
    make equal labels compare unequal in a merge.

    Parameters
    ----------
    series : pandas.Series
        Series of strings; missing values are passed through as missing.

    Returns
    -------
    pandas.Series
        Cleaned strings, same index as `series`.
    """
    return (series.str.lower()
                  .str.replace(r'[-_]', ' ', regex=True)         # hyphens/underscores act as separators
                  .str.replace(r'[^a-z0-9\s]', '', regex=True)   # keep only alphanumerics and whitespace
                  .str.replace(r'\s+', ' ', regex=True)          # collapse any whitespace run into one space
                  .str.strip())                                  # trim last so no edge spaces survive

# Build the normalized join key on both tables with vectorized string methods
gym_members_df['workout_clean'] = fast_clean_text(gym_members_df['Workout_Type'])
workout_tracker_df['workout_clean'] = fast_clean_text(workout_tracker_df['Workout Type'])

# Optional: store the low-cardinality key as category dtype to save memory
gym_members_df['workout_clean'] = gym_members_df['workout_clean'].astype('category')
workout_tracker_df['workout_clean'] = workout_tracker_df['workout_clean'].astype('category')

# Inner join on the cleaned workout name.
# The `copy=False` argument was dropped: the keyword is deprecated in current
# pandas and does not change the merged result.
# NOTE(review): the key is a workout name, so each gym row is paired with every
# tracker row of the same workout (1,630,286 rows in the recorded run) -
# confirm that this many-to-many expansion is intended.
combined_df = pd.merge(
    gym_members_df,
    workout_tracker_df,
    how='inner',
    on='workout_clean',
    suffixes=('_gym', '_tracker'),
)

print(f"Combined df shape: {combined_df.shape}")


Combined df shape: (1630286, 36)


In [1]:
# Scratch cell: prints a fixed string and defines nothing used by other cells.
print("hello")

hello


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- Expects `combined_df` (the merged frame) to exist already ---

# 1. Data Summary and Validation
print(f"Data shape: {combined_df.shape}")
print(f"Columns: {combined_df.columns.tolist()}")

# Report row count and null count for each key column, or warn if it is absent
required_cols = ['workout_clean', 'Experience_Level', 'Workout Intensity']
for col in required_cols:
    if col not in combined_df.columns:
        print(f"Warning: Column '{col}' NOT FOUND in dataset")
        continue
    print(f"Column '{col}': {combined_df[col].shape[0]} values, nulls={combined_df[col].isnull().sum()}")

# Show up to ten distinct non-null join-key values
if 'workout_clean' in combined_df.columns:
    print("Sample unique workout_clean values:", combined_df['workout_clean'].dropna().unique()[:10])

# 2. Impute missing values, column by column
num_cols = combined_df.select_dtypes(include='number').columns.tolist()
cat_cols = combined_df.select_dtypes(include='object').columns.tolist()

print(f"Numerical cols count: {len(num_cols)}; Categorical cols count: {len(cat_cols)}")

# Numeric columns get their own median; warn if anything is still null
# (which happens when the median itself is NaN, i.e. an all-null column)
for col in num_cols:
    median_val = combined_df[col].median()
    combined_df[col] = combined_df[col].fillna(median_val)
    if combined_df[col].isnull().any():
        print(f"Warning: Nulls remain in numerical column '{col}' after fillna")

# Object columns get the literal placeholder 'missing'
for col in cat_cols:
    combined_df[col] = combined_df[col].fillna('missing')
    if combined_df[col].isnull().any():
        print(f"Warning: Nulls remain in categorical column '{col}' after fillna")

# 3. Feature and Target Selection

# Resolve the target name first so it can be excluded from the feature matrix
# (previously 'Workout Intensity' was both a feature and the fallback target).
if 'Experience_Level' in combined_df.columns:
    target_col = 'Experience_Level'
elif 'Workout Intensity' in combined_df.columns:
    target_col = 'Workout Intensity'
else:
    raise ValueError("No target column found! Please check your dataset.")

# The merge only suffixes names present in BOTH tables; the gym table's
# 'Calories_Burned' is unique, so it appears unsuffixed in combined_df.
calories_col = (
    'Calories_Burned_gym' if 'Calories_Burned_gym' in combined_df.columns
    else 'Calories_Burned'
)

feature_cols = [
    'Age_gym', 'Gender_gym', 'Weight (kg)_gym', 'Height (m)', calories_col,
    'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)',
    'Workout Duration (mins)', 'Calories Burned', 'Heart Rate (bpm)', 'Steps Taken',
    'Distance (km)', 'Workout Intensity'
]

feature_cols = [f for f in feature_cols if f in combined_df.columns and f != target_col]
X = combined_df[feature_cols].copy()


# Identify categorical and numerical features in X
cat_features = [c for c in feature_cols if X[c].dtype == 'object']
num_features = [c for c in feature_cols if c not in cat_features]
print(f"Categorical features: {cat_features}")
print(f"Numerical features: {num_features}")

# Encode categorical features as integer codes (uses no target information)
for col in cat_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Numeric features become floats up front so scaled values can be written back
X = X.astype({c: 'float64' for c in num_features})

# Select target column
y = combined_df[target_col]
print(f"Using '{target_col}' as target")

# 4. Train-Test Split (stratify to maintain target distribution)
# NOTE(review): the merged frame repeats each source row many times (the
# recorded run has 1,630,286 rows), so a random split can place copies of the
# same record on both sides. Treat the reported accuracy as optimistic until
# the join/split strategy is revisited.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train = X_train.copy()
X_test = X_test.copy()
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# 5. Scale numerical features using training-row statistics only, so nothing
# about the test rows leaks into preprocessing. X itself stays unscaled.
scaler = StandardScaler()
if num_features:
    X_train[num_features] = scaler.fit_transform(X_train[num_features])
    X_test[num_features] = scaler.transform(X_test[num_features])

# 6. Model Training & Evaluation (n_jobs=-1 uses all cores; results are
# unchanged because random_state is fixed)
model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Model accuracy:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

Data shape: (1630286, 36)
Columns: ['Age_gym', 'Gender_gym', 'Weight (kg)_gym', 'Height (m)', 'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)', 'Calories_Burned', 'Workout_Type', 'Fat_Percentage', 'Water_Intake (liters)', 'Workout_Frequency (days/week)', 'Experience_Level', 'BMI', 'workout_clean', 'User ID', 'Age_tracker', 'Gender_tracker', 'Height (cm)', 'Weight (kg)_tracker', 'Workout Type', 'Workout Duration (mins)', 'Calories Burned', 'Heart Rate (bpm)', 'Steps Taken', 'Distance (km)', 'Workout Intensity', 'Sleep Hours', 'Water Intake (liters)', 'Daily Calories Intake', 'Resting Heart Rate (bpm)', 'VO2 Max', 'Body Fat (%)', 'Mood Before Workout', 'Mood After Workout']
Column 'workout_clean': 1630286 values, nulls=0
Column 'Experience_Level': 1630286 values, nulls=0
Column 'Workout Intensity': 1630286 values, nulls=0
Sample unique workout_clean values: ['yoga' 'hiit' 'cardio' 'strength']
Numerical cols count: 28; Categorical cols count: 8
Categorical features: ['Gende

In [1]:
# Requires the feature-preparation cell to have run in this kernel: `X` is
# defined there, and this cell raises NameError on a fresh kernel.
print(f"Shape of X: {X.shape}")
print("Memory usage by columns (MB):")
print(X.memory_usage(deep=True).div(1024**2))


NameError: name 'X' is not defined

In [None]:
# Cell 4: Integer-encode categorical feature columns via pandas category codes

# Work on a copy so `X` itself is left as it was
X_encoded = X.copy()

# Columns treated as categorical; names absent from X_encoded are skipped
categorical_cols = ['Gender_gym', 'Workout Intensity']

for col in categorical_cols:
    if col not in X_encoded.columns:
        continue
    # category dtype exposes one integer code per distinct value
    X_encoded[col] = X_encoded[col].astype('category').cat.codes

# Summarize how many distinct codes each encoded column ended up with
print("Categorical columns encoded:")
for col in categorical_cols:
    if col not in X_encoded.columns:
        continue
    print(f"{col}: {X_encoded[col].nunique()} unique categories encoded")


In [None]:
from sklearn.preprocessing import StandardScaler

# Every column not listed in categorical_cols is treated as numerical
# (original column order is kept)
numerical_cols = X_encoded.columns.difference(categorical_cols, sort=False).tolist()

# Standardize the numerical columns.
# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed in a later cell - confirm that is acceptable.
scaler = StandardScaler().fit(X_encoded[numerical_cols])
X_encoded[numerical_cols] = scaler.transform(X_encoded[numerical_cols])

# Show the first rows of the scaled columns
print("Preview scaled numerical features:")
print(X_encoded[numerical_cols].head())


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Numeric targets are used as-is; object/category targets are label-encoded
if y.dtype != 'object' and str(y.dtype) != 'category':
    y_encoded = y
else:
    target_le = LabelEncoder()
    y_encoded = target_le.fit_transform(y.astype(str))

# Hold out 20% of the rows, stratified so class proportions match in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Fit a 200-tree random forest using all available cores
rf_model = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out rows
y_pred = rf_model.predict(X_test)

print(f"Accuracy on test set: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
