In [None]:
import os
import re
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from xgboost import XGBClassifier

# Suppress every warning category for the whole session to keep cell output clean.
# NOTE(review): this also hides deprecation notices from the plotting/ML libraries —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Location of the TikTok "Watch History.txt" export.
# The fallback is a machine-specific Windows path; set the
# TIKTOK_WATCH_HISTORY_PATH environment variable to point the notebook at a
# different export without editing this cell.
file_path = os.environ.get(
    "TIKTOK_WATCH_HISTORY_PATH",
    r"C:\Users\David\PREDICT TIKTOK\TikTok\Your Activity\Watch History.txt",
)
# Timezone name the UTC timestamps are converted to before hours/days are extracted.
TARGET_TIMEZONE = 'Asia/Kuala_Lumpur'

In [None]:
def load_and_process_data(file_path, timezone=None):
    """
    Parse a watch-history text export into a DataFrame of view timestamps.

    Parameters
    ----------
    file_path : str or path-like
        UTF-8 text file containing entries of the form
        ``Date: YYYY-MM-DD HH:MM:SS UTC``. Everything else in the file
        (e.g. the links) is ignored.
    timezone : str, optional
        Timezone name the UTC timestamps are converted to. When omitted, the
        notebook-level ``TARGET_TIMEZONE`` constant is used.

    Returns
    -------
    pandas.DataFrame or None
        A single tz-aware ``timestamp`` column with one row per matched entry,
        in file order. ``None`` when the file does not exist or contains no
        matching entries (a message is printed in both cases).
    """
    # Resolve the default lazily so the constant is only needed when no
    # explicit timezone is supplied.
    if timezone is None:
        timezone = TARGET_TIMEZONE

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None

    # Only the timestamp is captured; the pattern requires the literal "UTC" suffix.
    regex_pattern = r"Date:\s*(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})\s*UTC"
    matches = re.findall(regex_pattern, content)

    if not matches:
        print("No valid timestamps found.")
        return None

    df = pd.DataFrame(matches, columns=['timestamp'])
    # The source strings are UTC by construction, so parse them as UTC directly
    # and convert to the requested local timezone in one chain.
    df['timestamp'] = (
        pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S", utc=True)
        .dt.tz_convert(timezone)
    )
    return df

In [None]:
def generate_addiction_clock(df, title='The "Addiction Clock": Activity Heatmap of 26 weeks'):
    """
    Plot a day-of-week by hour-of-day heatmap of view counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``timestamp`` column; hour and weekday are
        derived from it. The frame is not modified.
    title : str, optional
        Figure title. Defaults to the original notebook title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Order days logically, not alphabetically
    days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # Derive the grouping keys in a scratch frame so the caller's DataFrame
    # is left untouched.
    keys = pd.DataFrame({
        'day_of_week': pd.Categorical(
            df['timestamp'].dt.day_name(), categories=days_order, ordered=True
        ),
        'hour': df['timestamp'].dt.hour.to_numpy(),
    })

    # Count views per (day, hour). `observed` is set explicitly because the
    # default for categorical groupers differs between pandas versions.
    heatmap_data = (
        keys.groupby(['day_of_week', 'hour'], observed=False)
        .size()
        .unstack(fill_value=0)
        # Force the full 7x24 grid; fill_value keeps absent days/hours at 0
        # instead of NaN.
        .reindex(index=days_order, columns=range(24), fill_value=0)
    )

    # Plotting
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.heatmap(heatmap_data, cmap='magma', linewidths=.5, annot=False, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Hour of Day (0-23)')
    ax.set_ylabel('Day of Week')
    plt.show()

In [None]:
if __name__ == "__main__":
    # Load the view timestamps; the loader signals failure by returning None.
    df = load_and_process_data(file_path)
    if df is None:
        print("Could not load data. Check file path.")
    else:
        print(f"Data Loaded: {len(df)} records found.")
        generate_addiction_clock(df)

## MINUTES WATCHED IN 26 WEEKS

In [None]:
GAP_THRESHOLD_MINUTES = 10
DEFAULT_WATCH_SECONDS = 30 # Duration assumed if gap is too large

def parse_and_process_history(file_path, gap_threshold_minutes=None,
                              default_watch_seconds=None, timezone=None):
    """
    Estimate per-view watch time from the gaps between consecutive timestamps.

    Parameters
    ----------
    file_path : str or path-like
        UTF-8 text file containing ``Date: YYYY-MM-DD HH:MM:SS UTC`` entries.
    gap_threshold_minutes : float, optional
        Gaps longer than this are not treated as watch time. Defaults to
        ``GAP_THRESHOLD_MINUTES``.
    default_watch_seconds : float, optional
        Duration assigned when the gap is missing (last view) or exceeds the
        threshold. Defaults to ``DEFAULT_WATCH_SECONDS``.
    timezone : str, optional
        Timezone name used for ``local_time``/``hour``/``day_of_week``.
        Defaults to the notebook-level ``TARGET_TIMEZONE``.

    Returns
    -------
    pandas.DataFrame or None
        Rows sorted by ascending time with columns ``timestamp`` (naive, UTC
        clock), ``next_timestamp``, ``duration_seconds``, ``minutes_watched``,
        ``local_time``, ``hour`` and ``day_of_week``. ``None`` when the file is
        missing or holds no matching entries (a message is printed).
    """
    if gap_threshold_minutes is None:
        gap_threshold_minutes = GAP_THRESHOLD_MINUTES
    if default_watch_seconds is None:
        default_watch_seconds = DEFAULT_WATCH_SECONDS
    if timezone is None:
        timezone = TARGET_TIMEZONE

    # 1. Load Data
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print("File not found.")
        return None

    # 2. Extract Timestamps
    regex_pattern = r"Date:\s*(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})\s*UTC"
    matches = re.findall(regex_pattern, content)

    if not matches:
        print("No valid data found.")
        return None

    df = pd.DataFrame(matches, columns=['timestamp'])
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # 3. Sort ascending: the gap logic below needs each row followed by the
    # next view in time, regardless of the order used in the file.
    df = df.sort_values('timestamp').reset_index(drop=True)

    # 4. Gap to the following view, in seconds (NaN on the final row).
    df['next_timestamp'] = df['timestamp'].shift(-1)
    df['duration_seconds'] = (df['next_timestamp'] - df['timestamp']).dt.total_seconds()

    # 5. Keep the measured gap only when it exists and is within the
    # threshold; otherwise fall back to the default duration. Vectorized
    # equivalent of a per-row if/else.
    threshold_seconds = gap_threshold_minutes * 60
    gaps = df['duration_seconds']
    usable_gap = gaps.notna() & (gaps <= threshold_seconds)
    df['minutes_watched'] = gaps.where(usable_gap, default_watch_seconds) / 60

    # 6. Localize Timezone (for Day/Hour extraction)
    df['local_time'] = df['timestamp'].dt.tz_localize('UTC').dt.tz_convert(timezone)
    df['hour'] = df['local_time'].dt.hour
    df['day_of_week'] = df['local_time'].dt.day_name()

    return df



In [None]:
def plot_minutes_heatmap(df):
    """
    Plot a day-of-week by hour-of-day heatmap of total estimated minutes watched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``day_of_week`` (English day names), ``hour`` and
        ``minutes_watched`` columns. The frame is not modified.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Order days correctly
    days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    ordered_days = pd.Categorical(df['day_of_week'], categories=days_order, ordered=True)

    # Sum minutes per Day/Hour on a derived frame so the caller's column keeps
    # its original dtype. `observed` is explicit because the default for
    # categorical groupers differs between pandas versions.
    heatmap_data = (
        df.assign(day_of_week=ordered_days)
        .groupby(['day_of_week', 'hour'], observed=False)['minutes_watched']
        .sum()
        .unstack(fill_value=0)
        # Force the full 7x24 grid; fill_value keeps absent days/hours at 0
        # instead of NaN.
        .reindex(index=days_order, columns=range(24), fill_value=0)
    )

    # Plot
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.heatmap(
        heatmap_data,
        cmap='magma',
        linewidths=0.5,
        linecolor='white',
        annot=False,
        cbar_kws={'label': 'Total Minutes Watched'},
        ax=ax,
    )

    ax.set_title('True Usage: Minutes Watched')
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel('Day of Week')
    fig.tight_layout()
    plt.show()

In [None]:
if __name__ == "__main__":
    # Build the per-view duration table; the loader prints the reason and
    # returns None when the file is missing or has no usable entries.
    df = parse_and_process_history(file_path)
    if df is not None:
        # minutes_watched is per view, so sum / 60 gives the total in hours.
        total_hours = df['minutes_watched'].sum() / 60
        print(f"Total Estimated Watch Time: {total_hours:.2f} hours")
        plot_minutes_heatmap(df)

# DATA PREPROCESSING

In [None]:
def build_dataset_robust(file_path, timezone=None, bad_habit_quantile=0.4):
    """
    Build the daily feature matrix and "bad habit" labels from a watch-history export.

    Pipeline: parse timestamps -> convert to local time -> aggregate clicks per
    local calendar day -> weighted raw score -> cap at the 95th percentile ->
    exponentially smooth (span=3) -> label days whose smoothed score reaches
    the chosen quantile.

    Parameters
    ----------
    file_path : str or path-like
        UTF-8 text file containing ``Date: YYYY-MM-DD HH:MM:SS UTC`` entries.
    timezone : str, optional
        Timezone name used to assign views to local days/hours. Defaults to the
        notebook-level ``TARGET_TIMEZONE``.
    bad_habit_quantile : float, optional
        Quantile of the smoothed score at or above which a day is labelled 1.
        Default 0.4, i.e. roughly the top 60% of days are labelled as bad.

    Returns
    -------
    X : pandas.DataFrame
        Columns ``day_of_week``, ``prev_score``, ``morning_clicks``, ``volatility``.
    y : pandas.Series
        0/1 ``is_bad_habit`` labels aligned with ``X``.
    daily_clean : pandas.DataFrame
        The full per-day table after dropping rows with missing lag features.
    ROBUST_THRESHOLD : float
        The smoothed-score value used as the labelling threshold.

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    ValueError
        If the file contains no matching timestamp entries.
    """
    if timezone is None:
        timezone = TARGET_TIMEZONE

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    matches = re.findall(r"Date:\s*(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2})\s*UTC", content)
    if not matches:
        # Fail early with a clear message instead of an obscure pandas error downstream.
        raise ValueError(f"No valid timestamps found in {file_path!r}.")

    df = pd.DataFrame(matches, columns=['timestamp'])
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['local_time'] = df['timestamp'].dt.tz_localize('UTC').dt.tz_convert(timezone)
    df = df.sort_values('local_time').reset_index(drop=True)

    # Features
    df['date'] = df['local_time'].dt.date
    df['hour'] = df['local_time'].dt.hour
    df['is_weekend'] = df['local_time'].dt.dayofweek >= 5

    # Sabotage Definitions (local hours; `between` is inclusive on both ends)
    df['is_sleep_sabotage'] = df['hour'].isin([2, 3, 4, 5, 6, 7])
    df['is_work_sabotage'] = (~df['is_weekend']) & (df['hour'].between(9, 18))
    df['is_morning_trigger'] = df['hour'].isin([7, 8, 9, 10])

    # Aggregation: one row per local date that has at least one view.
    # NOTE(review): dates with no views produce no row, so the lag/rolling
    # features below step over such gaps — confirm this is intended.
    daily = df.groupby('date').agg(
        total_clicks=('timestamp', 'count'),
        late_night_clicks=('is_sleep_sabotage', 'sum'),
        work_hour_clicks=('is_work_sabotage', 'sum'),
        morning_clicks=('is_morning_trigger', 'sum')
    ).reset_index()

    daily['day_of_week'] = pd.to_datetime(daily['date']).dt.dayofweek

    # Scoring Logic (Multipliers), vectorized:
    #   weekend: 3.0 * late-night + 0.2  * total
    #   weekday: 3.0 * late-night + 2.0 * work-hour + 0.05 * total
    sleep_penalty = daily['late_night_clicks'] * 3.0
    weekend_score = sleep_penalty + (daily['total_clicks'] * 0.2)
    weekday_score = sleep_penalty + (daily['work_hour_clicks'] * 2.0) + (daily['total_clicks'] * 0.05)
    daily['raw_score'] = np.where(daily['day_of_week'] >= 5, weekend_score, weekday_score)

    # 1. Cap Outliers (upper-tail winsorization at the 95th percentile)
    # so a single extreme day cannot dominate the threshold.
    cap_value = daily['raw_score'].quantile(0.95)
    daily['capped_score'] = daily['raw_score'].clip(upper=cap_value)

    # 2. Exponential Smoothing (Span=3 days) to emphasise the habit over the spike.
    daily['smoothed_score'] = daily['capped_score'].ewm(span=3).mean()

    # 3. Define the "bad habit" threshold as a quantile of the SMOOTHED score.
    # With the default of 0.4, days at or above the 40th percentile are labelled 1.
    ROBUST_THRESHOLD = daily['smoothed_score'].quantile(bad_habit_quantile)
    daily['is_bad_habit'] = (daily['smoothed_score'] >= ROBUST_THRESHOLD).astype(int)

    print(f"Outlier Cap Value: {cap_value:.1f}")
    print(f"Robust Threshold (Smoothed): {ROBUST_THRESHOLD:.1f}")

    # Feature Engineering: both lag features are shifted by one row so they
    # only use information from earlier rows.
    daily['prev_score'] = daily['smoothed_score'].shift(1)
    daily['volatility'] = daily['smoothed_score'].rolling(window=5).std().shift(1)
    daily['days_since_peak'] = 0 # Placeholder for advanced logic if needed

    # Drops the leading rows where the lag/rolling features are undefined.
    daily_clean = daily.dropna().copy()

    # Select Features
    features = ['day_of_week', 'prev_score', 'morning_clicks', 'volatility']
    X = daily_clean[features]
    y = daily_clean['is_bad_habit']

    return X, y, daily_clean, ROBUST_THRESHOLD

# --- 2. EXECUTE PIPELINE ---
# Build the feature matrix (X), 0/1 labels (y), the cleaned per-day table and
# the smoothed-score threshold that defines a "bad habit" day.
X, y, daily_data, FINAL_THRESHOLD = build_dataset_robust(file_path)

# Check Class Balance
# y holds 0/1 labels, so its mean is the share of days labelled as bad-habit days.
print(f"Class Balance (Bad Habit %): {y.mean():.1%}")

# EXPLORATORY DATA ANALYSIS

In [None]:
# Three side-by-side diagnostic panels built on the cleaned daily table.
fig, (ax_dist, ax_corr, ax_dow) = plt.subplots(1, 3, figsize=(18, 5))

# Plot 1: Distribution of the SMOOTHED score, with the labelling threshold
# drawn as a vertical line (is_bad_habit is derived from this score).
sns.histplot(daily_data['smoothed_score'], bins=20, kde=True, color='purple', ax=ax_dist)
ax_dist.axvline(FINAL_THRESHOLD, color='red', linestyle='--', label=f'Threshold ({FINAL_THRESHOLD:.0f})')
ax_dist.set_title('Distribution of Robust Usage Scores (Smoothed)')
ax_dist.set_xlabel('Smoothed Habit Score')
ax_dist.legend()

# Plot 2: Correlation Heatmap
# 'late_night_clicks' is included to see how strongly it tracks the score.
corr_cols = ['smoothed_score', 'prev_score', 'morning_clicks', 'volatility', 'late_night_clicks']
sns.heatmap(daily_data[corr_cols].corr(), annot=True, cmap='magma', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Feature Correlation Matrix')

# Plot 3: Share of bad-habit days per weekday.
# `order` pins all seven weekday codes (0=Mon .. 6=Sun) to fixed bar positions,
# so the tick labels below stay correct even when a weekday has no rows.
day_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
sns.barplot(
    x='day_of_week',
    y='is_bad_habit',
    data=daily_data,
    order=list(range(7)),
    palette='magma',
    ci=None,
    ax=ax_dow,
)
ax_dow.set_xticks(range(7))
ax_dow.set_xticklabels(day_labels)
ax_dow.set_title('Probability of "Bad Habit" by Day')
ax_dow.set_ylabel('Risk Probability')

fig.tight_layout()
plt.show()

# MODEL TRAINING AND VISUALIZATION

In [None]:
# --- 3. TRAIN ENSEMBLE ---
# Split BEFORE scaling so the scaler's statistics come from the training rows
# only; fitting it on the full dataset would leak test-set information.
# The split itself depends only on the row count, labels and random_state.
# NOTE(review): rows are consecutive days with lagged/smoothed features, so a
# shuffled split may still leak temporal information — consider a
# chronological hold-out.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full dataset in scaled form, kept under its original name for any later use.
X_scaled = scaler.transform(X)

clf1 = LogisticRegression(class_weight='balanced', random_state=42)
clf2 = RandomForestClassifier(n_estimators=200, max_depth=5, class_weight='balanced', random_state=42)
# Adjusted XGBoost for Smoothed Target
clf3 = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.05, eval_metric='logloss', random_state=42)

# Soft voting averages the three models' predicted class probabilities.
voting_clf = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('xgb', clf3)],
    voting='soft'
)

voting_clf.fit(X_train, y_train)

In [None]:
# --- 4. EVALUATION ---
# Probability of the positive ("bad habit") class for each held-out day.
y_proba = voting_clf.predict_proba(X_test)[:, 1]

# Optimize Threshold
# NOTE(review): the cut-off is tuned on the same test set it is then scored on,
# so the metrics below are optimistic — a separate validation split would be cleaner.
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# F1 per operating point; points where precision + recall == 0 get F1 = 0
# instead of NaN (np.argmax would otherwise pick the NaN entry).
pr_sum = precisions + recalls
f1_scores = np.divide(
    2 * precisions * recalls,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision_recall_curve returns one more precision/recall point than
# thresholds (the final point has no threshold), so drop it before argmax
# to keep the index valid for `thresholds`.
best_thresh = thresholds[np.argmax(f1_scores[:-1])]

y_pred = (y_proba >= best_thresh).astype(int)

print("\n" + "="*40)
print("FINAL ROBUST METRICS")
print("="*40)
print(classification_report(y_test, y_pred))

# Confusion Matrix
plt.figure(figsize=(5, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Purples')
plt.title('Confusion Matrix (Smoothed Target)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Persist the artifacts produced above into the current working directory:
# the fitted ensemble, the fitted feature scaler, the smoothed-score labelling
# threshold, and the probability cut-off chosen during evaluation.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(voting_clf, 'tiktok_voting_model.pkl')
joblib.dump(scaler, 'tiktok_scaler.pkl')
joblib.dump(FINAL_THRESHOLD, 'robust_threshold.pkl')
joblib.dump(best_thresh, 'decision_threshold.pkl')
print("Robust Model Saved.")