In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Import fuzzy matching library
from rapidfuzz import process, fuzz

# Step 2: Read the two source tables from CSV (adjust the paths below if needed)
gym_csv_path = 'gym_members_exercise_tracking.csv'
tracker_csv_path = 'workout_fitness_tracker_data.csv'

gym_members_df = pd.read_csv(gym_csv_path)
workout_tracker_df = pd.read_csv(tracker_csv_path)

# Step 3: Clean text function for merge keys (improved with extra normalization)
def clean_text(s):
    """Normalize a workout name into a merge key.

    String input is lower-cased and trimmed, hyphens and underscores are
    turned into spaces, every character that is neither alphanumeric nor
    whitespace is dropped, and the words 'the', 'and', 'of', 'in' are
    removed. The remaining words are joined with single spaces.

    Any non-string input (e.g. NaN or None) yields the empty string.
    """
    if not isinstance(s, str):
        return ''

    lowered = s.lower().strip()
    spaced = lowered.replace('-', ' ').replace('_', ' ')
    kept_chars = [ch for ch in spaced if ch.isalnum() or ch.isspace()]
    tokens = ''.join(kept_chars).split()

    stopwords = {'the', 'and', 'of', 'in'}  # common filler words to ignore
    return ' '.join(tok for tok in tokens if tok not in stopwords)

# Step 4: Derive a normalized merge-key column on each dataset
gym_members_df['workout_clean'] = gym_members_df['Workout_Type'].map(clean_text)
workout_tracker_df['workout_clean'] = workout_tracker_df['Workout Type'].map(clean_text)

# Step 5: Fuzzy matching - find best matches for each gym workout in tracker dataset

# Candidate pool: the distinct cleaned workout names of the tracker dataset,
# in order of first appearance
tracker_workouts = list(workout_tracker_df['workout_clean'].drop_duplicates())

def get_best_match(name, threshold=80):
    """Return the tracker workout name that best matches ``name``.

    Parameters
    ----------
    name : str
        Cleaned workout name to look up. Falsy values (e.g. '') are not
        matched.
    threshold : int or float, default 80
        Minimum ``fuzz.token_sort_ratio`` score a candidate must reach.

    Returns
    -------
    str or None
        The best-scoring entry of ``tracker_workouts``, or None when ``name``
        is falsy, there are no candidates, or no candidate reaches
        ``threshold``.
    """
    if not name:
        return None
    # extractOne returns None (not a tuple) when the choice list is empty or
    # nothing reaches score_cutoff, so the result must not be unpacked blindly.
    best = process.extractOne(
        name,
        tracker_workouts,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=threshold,
    )
    return best[0] if best is not None else None

# Look up the best tracker workout name for every cleaned gym workout name
gym_members_df['fuzzy_match'] = gym_members_df['workout_clean'].apply(get_best_match)

# Keep an independent copy of only those gym rows that found a match
has_match = gym_members_df['fuzzy_match'].notna()
gym_members_filtered = gym_members_df.loc[has_match].copy()
print(f"Number of fuzzy matched workouts: {gym_members_filtered.shape[0]}")

# Step 6 (Optional): Merge on broader category if available
# Uncomment & adjust if you want to try merge on a broader feature like 'Type' or 'Category'
# Example:
# if 'Type' in gym_members_df.columns and 'Type' in workout_tracker_df.columns:
#     combined_df = pd.merge(gym_members_df, workout_tracker_df, left_on='Type', right_on='Type', how='inner')
#     print(f"Shape after merge on Type: {combined_df.shape}")
# else:
#     # Use fuzzy match key for merging (Step 7 below)

# Step 7: Merge on fuzzy matched workout names
#
# The tracker table has many rows per workout type, so joining it row-for-row
# pairs every gym member with every tracker row of that type (973 gym rows
# previously grew to 1,630,286 rows). Those duplicated gym rows then end up on
# both sides of the train/test split. Collapse the tracker data to a single
# profile row per workout type first: mean for numeric columns, most frequent
# value for everything else.
_numeric_tracker_cols = set(
    workout_tracker_df.select_dtypes(include=[np.number]).columns
)


def _most_frequent(values):
    """Return the most common non-null value of a Series, or NaN if it has none."""
    modes = values.mode(dropna=True)
    return modes.iloc[0] if not modes.empty else np.nan


tracker_profile_df = (
    workout_tracker_df
    .groupby('workout_clean', as_index=False)
    .agg({
        col: ('mean' if col in _numeric_tracker_cols else _most_frequent)
        for col in workout_tracker_df.columns
        if col != 'workout_clean'
    })
)

combined_df = pd.merge(
    gym_members_filtered, tracker_profile_df,
    left_on='fuzzy_match', right_on='workout_clean',
    how='left', suffixes=('_gym', '_tracker'),
    validate='many_to_one',  # fail loudly if the join could multiply gym rows again
)
print(f"Combined shape after fuzzy merge: {combined_df.shape}")

# Step 8: Impute missing values column by column (plain reassignment, no inplace)
numeric_columns = combined_df.select_dtypes(include=[np.number]).columns
for col_name in numeric_columns:
    column_median = combined_df[col_name].median()
    combined_df[col_name] = combined_df[col_name].fillna(column_median)

# Object (text) columns get the literal placeholder 'missing'
text_columns = combined_df.select_dtypes(include='object').columns
for col_name in text_columns:
    combined_df[col_name] = combined_df[col_name].fillna('missing')

# Step 9: Define feature columns as before (ensure names exist)
requested_features = [
    'Age_gym', 'Gender_gym', 'Weight (kg)_gym', 'Height (m)', 'Calories_Burned_gym',
    'Max_BPM', 'Avg_BPM', 'Resting_BPM', 'Session_Duration (hours)',
    'Workout Duration (mins)', 'Calories Burned', 'Heart Rate (bpm)', 'Steps Taken',
    'Distance (km)', 'Workout Intensity'
]
feature_cols = [col for col in requested_features if col in combined_df.columns]

# pandas appends the '_gym' / '_tracker' suffix only to column names that exist
# in BOTH merged frames, so a name spelled with the wrong suffix would be
# dropped here without any hint. Report what was skipped instead.
missing_features = [col for col in requested_features if col not in combined_df.columns]
if missing_features:
    print(f"Warning: requested feature columns not found and skipped: {missing_features}")
if not feature_cols:
    raise ValueError("None of the requested feature columns exist in combined_df")

# Identify categorical and numeric features
cat_features = [col for col in feature_cols if combined_df[col].dtype == 'object']
num_features = [col for col in feature_cols if col not in cat_features]

# Label encode categorical features (values are cast to str first so mixed
# types sort consistently)
for col in cat_features:
    le = LabelEncoder()
    combined_df[col] = le.fit_transform(combined_df[col].astype(str))

# Step 10: Prepare X, y for model
if 'Experience_Level' in combined_df.columns:
    target_col = 'Experience_Level'
elif 'Workout Intensity' in combined_df.columns:
    target_col = 'Workout Intensity'
else:
    raise ValueError("No suitable target column found")
y = combined_df[target_col]

# The target must never be one of the inputs: 'Workout Intensity' is listed as
# a feature and is also the fallback target, which would let the model read
# the answer directly.
model_features = [col for col in feature_cols if col != target_col]
scaled_features = [col for col in num_features if col != target_col]
X = combined_df[model_features].copy()  # avoid SettingWithCopyWarning

# Step 11: Train-test split (done BEFORE scaling so that no statistic of the
# test rows leaks into the preprocessing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = X_train.copy(), X_test.copy()

# Step 12: Scale numeric features - fit on the training rows only, then apply
# the same transformation to the test rows
scaler = StandardScaler()
if scaled_features:  # StandardScaler raises on a frame with zero columns
    X_train[scaled_features] = scaler.fit_transform(X_train[scaled_features])
    X_test[scaled_features] = scaler.transform(X_test[scaled_features])

# Step 13: Train Random Forest model
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Step 14: Predict and evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Number of fuzzy matched workouts: 973
Combined shape after fuzzy merge: (1630286, 38)


In [None]:
import pandas as pd
import numpy as np

# Read both source tables from their CSV files
csv_sources = {
    'gym': 'gym_members_exercise_tracking.csv',
    'tracker': 'workout_fitness_tracker_data.csv',
}
gym_members_df = pd.read_csv(csv_sources['gym'])
workout_tracker_df = pd.read_csv(csv_sources['tracker'])

# Clean workout names for merge keys, fuzzy matching etc.
def clean_text(s):
    """Normalize a workout name into the key used for merging / fuzzy matching.

    This definition shadows the Step 3 ``clean_text`` from the earlier cell, so
    it applies the same rules; otherwise the two cells would build different
    keys for the same workout name.

    String input is lower-cased and trimmed, hyphens and underscores become
    spaces, characters that are neither alphanumeric nor whitespace are
    dropped, the words 'the', 'and', 'of', 'in' are removed, and the remaining
    words are joined with single spaces. Non-string input returns ''.
    """
    if not isinstance(s, str):
        return ''
    s = s.lower().strip()
    s = s.replace('-', ' ').replace('_', ' ')
    s = ''.join(c for c in s if c.isalnum() or c.isspace())
    # split()/join collapses the runs of spaces left behind by replaced or
    # removed punctuation (e.g. 'push - up' and 'push-up' now give the same key)
    return ' '.join(word for word in s.split() if word not in {'the', 'and', 'of', 'in'})

gym_members_df['workout_clean'] = gym_members_df['Workout_Type'].apply(clean_text)
workout_tracker_df['workout_clean'] = workout_tracker_df['Workout Type'].apply(clean_text)

# Here you would do fuzzy matching and merge (as in your original code)
# For brevity, simple inner merge on workout_clean shown
#
# The tracker table has many rows per workout type, so merging it row-for-row
# repeats each gym member once per tracker row of that type. Reduce the tracker
# data to one row per workout type first (mean of numeric columns, most
# frequent value otherwise) so that every gym member appears at most once.
_tracker_number_cols = set(
    workout_tracker_df.select_dtypes(include=[np.number]).columns
)


def _mode_or_nan(values):
    """Return the most common non-null value of a Series, or NaN if it has none."""
    modes = values.mode(dropna=True)
    return modes.iloc[0] if not modes.empty else np.nan


tracker_by_workout_df = (
    workout_tracker_df
    .groupby('workout_clean', as_index=False)
    .agg({
        col: ('mean' if col in _tracker_number_cols else _mode_or_nan)
        for col in workout_tracker_df.columns
        if col != 'workout_clean'
    })
)

combined_df = pd.merge(
    gym_members_df, tracker_by_workout_df,
    on='workout_clean', how='inner', suffixes=('_gym', '_tracker'),
    validate='many_to_one',  # raises if the right side is not unique per key
)

print(f"Combined df shape: {combined_df.shape}")