In [1]:
import pandas as pd
import sqlalchemy, os, re, pymysql
from datetime import datetime, timedelta
from urllib.parse import quote_plus
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Read the database settings from the local env file into the process
# environment, then pull out the four values the connection URL needs.
load_dotenv('config.env')
username = os.getenv("DB_USERNAME")
password = os.getenv("DB_PASSWORD")
host = os.getenv("DB_HOST")
database_name = os.getenv("DB_NAME")

# Fail early with a readable message: os.getenv returns None for an unset
# variable, which would otherwise end up as the literal text "None" in the URL.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("DB_USERNAME", username),
        ("DB_PASSWORD", password),
        ("DB_HOST", host),
        ("DB_NAME", database_name),
    )
    if setting_value is None
]
if missing_settings:
    raise RuntimeError(
        "Missing database settings (expected in config.env or the environment): "
        + ", ".join(missing_settings)
    )

# Credentials are percent-escaped so characters such as '@', ':' or '/' in the
# username/password cannot be mistaken for URL delimiters.
connect = f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}@{host}/{database_name}"
engine = sqlalchemy.create_engine(connect)
# NOTE(review): this connection is not used by the cells visible here; whoever
# uses it later is responsible for closing it (conn.close()).
conn = engine.connect()

In [3]:
def sql_to_dataframe(table, engine):
    """Load every row and column of a database table into a DataFrame.

    Parameters
    ----------
    table : str
        Name of the table to read. It is interpolated directly into the SQL
        text, so only pass trusted identifiers.
    engine
        Connectable handed straight to ``pandas.read_sql``.

    Returns
    -------
    pandas.DataFrame
        Result of ``SELECT * FROM <table>``.
    """
    return pd.read_sql(f'SELECT * FROM {table}', engine)

# Pull both raw tables from the database.
user_df = sql_to_dataframe('user', engine=engine)
exercise_set_df = sql_to_dataframe('exercise_set', engine=engine)

# Ids that appear both as a user and as the owner of at least one exercise set.
common_user_ids = pd.Index(user_df['id']).intersection(exercise_set_df['user_id'])

# Keep only rows belonging to those ids. The explicit .copy() makes the
# filtered frames independent of the raw ones, so later column assignments on
# them neither warn (SettingWithCopyWarning) nor risk touching the raw data.
filtered_user_df = user_df[user_df['id'].isin(common_user_ids)].copy()
filtered_exercise_set_df = exercise_set_df[exercise_set_df['user_id'].isin(common_user_ids)].copy()


In [4]:
class PumpX_Transform():
    """Build weekly engagement and demographic summary tables.

    The frames passed in are stored by reference but never modified: every
    method works on its own copy, so results do not depend on the order in
    which the methods are called.

    Parameters
    ----------
    user_df : pandas.DataFrame
        User table; the methods read its ``id``, ``goal`` and ``age`` columns.
    exercise_set_df : pandas.DataFrame
        Exercise-set table; the methods read its ``user_id`` and
        ``approach_time`` columns.
    """

    def __init__(self, user_df, exercise_set_df) -> None:
        self.user_df = user_df
        self.exercise_set_df = exercise_set_df

    @staticmethod
    def _iso_week_bounds(now=None):
        """Return ``(start, end)`` of the ISO week (Monday-based) containing ``now``.

        ``start`` is Monday 00:00 (inclusive) and ``end`` is the following
        Monday 00:00 (exclusive). ``now`` defaults to the current local time.
        """
        reference = pd.Timestamp.now() if now is None else pd.Timestamp(now)
        # weekday() is 0 for Monday, so this steps back to the week's Monday.
        week_start = reference.normalize() - pd.Timedelta(days=reference.weekday())
        return week_start, week_start + pd.Timedelta(days=7)

    def _parsed_logs(self):
        """Return a private copy of the log columns with ``approach_time`` parsed.

        Values that cannot be parsed become ``NaT`` instead of raising.
        """
        logs = self.exercise_set_df[['user_id', 'approach_time']].copy()
        logs['approach_time'] = pd.to_datetime(logs['approach_time'], errors='coerce')
        return logs

    def get_total_loggings_week(self, now=None):
        """Count distinct users who logged on each day of the current ISO week.

        Parameters
        ----------
        now : datetime-like, optional
            Reference moment that selects the week; defaults to the current
            time. Mainly useful for reproducible runs and tests.

        Returns
        -------
        pandas.DataFrame
            Columns ``logging_date`` and ``total_loggings``, one row per day
            of the week that has at least one log, e.g.::

                  logging_date  total_loggings
                0   2024-05-13              30
                1   2024-05-14              39
                2   2024-05-15              36
                3   2024-05-16              37
                4   2024-05-17              29
                5   2024-05-18              26
                6   2024-05-19              26
        """
        week_start, week_end = self._iso_week_bounds(now)
        logs = self._parsed_logs()
        # A single range check replaces the separate week-number / year tests;
        # it stays correct when the ISO week straddles New Year, and NaT rows
        # drop out because comparisons with NaT are False.
        in_week = logs[(logs['approach_time'] >= week_start) & (logs['approach_time'] < week_end)]
        result = in_week.groupby(in_week['approach_time'].dt.date)['user_id'].nunique().reset_index()
        result.columns = ['logging_date', 'total_loggings']
        return result

    def get_goal_and_age_distribution(self):
        """Count distinct users per goal, split into age brackets.

        Users without a goal or without an age are ignored. Brackets are
        left-closed (``[20, 30)`` is ``age_20_29``) and the last bracket is
        open-ended, so every age of 60 or more lands in ``age_60_above``.

        Returns
        -------
        pandas.DataFrame
            One row per goal with one column per bracket, e.g.::

                age_group               goal  under_20  age_20_29  age_30_39  age_40_49  age_50_59  age_60_above
                0           Gain Muscle Mass        67        143         66         24          6             1
                1          Improve Endurance         0          7          1          0          0             0
                2                Lose Weight        12         45         29         10          7             0
        """
        labels = ['under_20', 'age_20_29', 'age_30_39', 'age_40_49', 'age_50_59', 'age_60_above']
        bins = [0, 20, 30, 40, 50, 60, float('inf')]

        has_goal_and_age = self.user_df['goal'].notna() & self.user_df['age'].notna()
        # Copy so that adding the bracket column never touches self.user_df.
        users = self.user_df.loc[has_goal_and_age, ['id', 'goal', 'age']].copy()
        users['age_group'] = pd.cut(users['age'], bins=bins, labels=labels, right=False)

        # observed=False keeps a column for every bracket, even empty ones,
        # independent of the pandas default for categorical groupers.
        table = (
            users.groupby(['goal', 'age_group'], observed=False)['id']
            .nunique()
            .unstack(fill_value=0)
        )
        # Plain string labels let reset_index insert the 'goal' column even on
        # pandas versions that refuse new values in a categorical column index.
        table.columns = table.columns.astype(str)
        return table.reset_index()

    def categorize_logging_usage(self, count):
        """Map a user's total number of logs to a usage category label."""
        if (count == 1): return "First_time_logging"
        elif (count == 2): return "Second_time_logging"
        return "Three_or_more_loggings"

    def get_logging_usage_distribution(self, now=None):
        """Break down this week's active users by how often they have logged.

        A user is active when their most recent log falls inside the current
        ISO week (Monday 00:00 up to, but excluding, the next Monday 00:00).
        Each active user is placed on the date of that most recent log and in
        the category returned by ``categorize_logging_usage`` for their total
        number of rows in the exercise-set table.

        Parameters
        ----------
        now : datetime-like, optional
            Reference moment that selects the week; defaults to the current
            time.

        Returns
        -------
        pandas.DataFrame
            One row per date, a ``date`` column, and one column per category
            present in the data holding the number of users.
        """
        week_start, week_end = self._iso_week_bounds(now)
        logs = self._parsed_logs()

        # 'size' counts every row of the user, including rows whose timestamp
        # could not be parsed; 'max' ignores those NaT values.
        per_user = logs.groupby('user_id').agg(
            last_date_logged=('approach_time', 'max'),
            count_logged=('approach_time', 'size'),
        ).reset_index()

        # Users whose every timestamp is NaT fail both comparisons and drop out.
        is_active = (per_user['last_date_logged'] >= week_start) & (per_user['last_date_logged'] < week_end)
        active = per_user[is_active].copy()

        active['log_category'] = active['count_logged'].apply(self.categorize_logging_usage)
        active['date'] = active['last_date_logged'].dt.date
        grouped_daily_df = (
            active.groupby(['date', 'log_category'])['count_logged']
            .count()
            .unstack(fill_value=0)
            .reset_index()
        )

        return grouped_daily_df


In [None]:
class batch_processing:
    