In [None]:
# Setup
import sys
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd

# Extend the module search path before the project-local imports below
# (presumably ../src is where load_data and cleaning live; verify the layout)
sys.path.append('../src')

# Cleaning pipeline imports (project-local modules)
from load_data import load_data
from cleaning import (
    clean_dataframe,
    make_default_config,
    filter_by_text_quality,
    filter_stop_tags,
    filter_by_user_density,
    add_spatial_density_flag,
)

# Confirmation banner; the check mark was stored as mojibake and is restored here
print("✓ Imports OK")

## 1. Chargement et nettoyage de base

In [None]:
# Read the raw CSV; the result is unpacked as (dataframe, load report)
df_raw, load_report = load_data(
    '../data/flickr_data2.csv'
)
print(f"Dataset brut: {len(df_raw):,} lignes")

In [None]:
# Baseline (conservative) cleaning
config = make_default_config()
config.sample_n = 50000  # subsample so the demo runs quickly

# clean_dataframe returns the cleaned frame together with a cleaning report
df_clean, report = clean_dataframe(df_raw, config)

# Accented characters in the printed labels were mojibake; restored to proper UTF-8
print(f"\nDataset nettoyé: {len(df_clean):,} lignes")
print(f"Colonnes: {list(df_clean.columns)}")

## 2. Statistiques de base

Analyse du dataset nettoyé (baseline)

In [None]:
# Baseline statistics on the conservatively cleaned dataset
print("=" * 60)
print("BASELINE - Dataset nettoyé conservatif")
print("=" * 60)

# A row has valid GPS when neither lat nor lon is missing
gps_valid = df_clean[['lat', 'lon']].notna().all(axis=1)
# fillna('') first: a missing value compares as != '' and would otherwise
# be counted as non-empty text/tags
text_non_empty = df_clean['text_merged'].fillna('').ne('')
tags_non_empty = df_clean['tags_clean'].fillna('').ne('')

print(f"\nTaille: {len(df_clean):,} photos")
print(f"GPS valides: {gps_valid.sum():,}")
print(f"Dates valides: {df_clean['has_valid_date'].sum():,}")
print(f"Texte non-vide: {text_non_empty.sum():,}")
print(f"Tags non-vides: {tags_non_empty.sum():,}")

# Users: number of photos contributed by each user_id
print(f"\nUtilisateurs uniques: {df_clean['user_id'].nunique():,}")
user_counts = df_clean['user_id'].value_counts()
print(f"Photos par user - médiane: {user_counts.median():.0f}")
print(f"Photos par user - top 3: {user_counts.head(3).to_dict()}")

# Text: whitespace-separated word count per photo
word_counts = df_clean['text_merged'].str.split().str.len()
print(f"\nMots par photo - moyenne: {word_counts.mean():.1f}")
print(f"Mots par photo - médiane: {word_counts.median():.0f}")

## 3. NIVEAU 1 - Filtrage qualité texte

**Cas d'usage**: Description sémantique des clusters

**Justification**: Photos sans tags/titre n'apportent rien au text mining

In [None]:
# Build the "text_ready" view: photos that carry semantic information
df_text_ready = filter_by_text_quality(
    df_clean,
    min_words=3,
    require_tags=False
)

n_baseline = len(df_clean)
n_text_ready = len(df_text_ready)
# Guard the ratio so an empty baseline reports 0% instead of raising ZeroDivisionError
loss_pct = (1 - n_text_ready / n_baseline) * 100 if n_baseline else 0.0

print("\nRésultat:")
print(f"  Baseline: {n_baseline:,} photos")
print(f"  Text-ready: {n_text_ready:,} photos")
print(f"  Perte: {n_baseline - n_text_ready:,} ({loss_pct:.1f}%)")

print("\n💡 Usage: Garder df_clean pour clustering, df_text_ready pour TF-IDF")

## 4. NIVEAU 1 - Filtrage stop-tags

**Cas d'usage**: TF-IDF plus discriminant

**Justification**: 'lyon', 'france', 'photo' n'aident pas à différencier les zones

In [None]:
# Top tags BEFORE stop-tag filtering.
# Counter is imported in the notebook's top import cell rather than mid-notebook.
# Tags are split row by row, which yields the same token sequence as joining
# everything into one string first, without building that large string.
all_tags_before = [
    tag
    for tags in df_text_ready['tags_clean'].dropna()
    for tag in tags.split()
]
top_before = Counter(all_tags_before).most_common(20)

print("Top 20 tags AVANT filtrage:")
for i, (tag, count) in enumerate(top_before, 1):
    print(f"{i:2}. {tag:20} : {count:,}")

In [None]:
# Apply the stop-tag filter on a copy so df_text_ready itself is left untouched
df_filtered_tags = filter_stop_tags(
    df_text_ready.copy(),
    stop_tags=['lyon', 'france', 'photo', 'photos', 'flickr', 'city', 'ville']
)

# Top tags AFTER filtering (same counting as the "before" cell)
all_tags_after = ' '.join(df_filtered_tags['tags_clean'].dropna()).split()
top_after = Counter(all_tags_after).most_common(20)

# Accents, check mark and arrow in the printed text were mojibake; restored here
print("\nTop 20 tags APRÈS filtrage:")
for i, (tag, count) in enumerate(top_after, 1):
    print(f"{i:2}. {tag:20} : {count:,}")

print("\n✅ Tags plus discriminants → meilleur TF-IDF")

## 5. NIVEAU 2 - Gestion utilisateurs hyper-actifs

**Cas d'usage**: Éviter biais densité (1 user = 5000 photos même lieu)

**Justification**: Représentativité vs sur-représentation individuelle

In [None]:
# Distribution of photos per user in the text-ready view
user_dist = df_text_ready['user_id'].value_counts()

print("Distribution photos/user:")
print(f"  Min: {user_dist.min()}")
print(f"  Médiane: {user_dist.median():.0f}")
print(f"  Moyenne: {user_dist.mean():.1f}")
print(f"  Max: {user_dist.max()}")
print(f"  P90: {user_dist.quantile(0.9):.0f}")
print(f"  P95: {user_dist.quantile(0.95):.0f}")
print(f"  P99: {user_dist.quantile(0.99):.0f}")

# Heavy users: strictly more than heavy_threshold photos
heavy_threshold = 500
heavy = user_dist[user_dist > heavy_threshold]
n_text_ready = len(df_text_ready)
# Guard the share so an empty frame reports 0% instead of a NaN / division warning
heavy_share = heavy.sum() / n_text_ready * 100 if n_text_ready else 0.0
print(f"\n{len(heavy)} utilisateurs avec > {heavy_threshold} photos")
print(f"Représentent {heavy.sum():,} photos ({heavy_share:.1f}% du dataset)")

In [None]:
# Cap heavy users. Reuse heavy_threshold from the distribution analysis so the
# cap and the "heavy user" statistics can never drift apart (was a duplicated 500).
# NOTE(review): strategy='sample' presumably draws at random; confirm whether
# filter_by_user_density exposes a seed so the result is reproducible.
df_balanced = filter_by_user_density(
    df_text_ready.copy(),
    max_photos_per_user=heavy_threshold,
    strategy='sample'  # or 'limit'
)

# Emoji and accents in the printed text were mojibake; restored to proper UTF-8
print("\n⚖️  Dataset rééquilibré:")
print(f"  Avant: {len(df_text_ready):,}")
print(f"  Après: {len(df_balanced):,}")
print(f"  Perte: {len(df_text_ready) - len(df_balanced):,}")

print("\n⚠️  À documenter: impact sur représentativité spatiale")

## 6. NIVEAU 2 - Densité spatiale (flag isolés)

**Cas d'usage**: Identifier photos isolées vs en cluster

**Important**: Ne supprime PAS, juste flagge pour analyse différenciée

In [None]:
# Check that scikit-learn is importable before running the density section
try:
    import sklearn
except ImportError:
    # Best-effort notice only: the notebook keeps going, the reader skips the section
    print("⚠️  sklearn non installé, skipper cette section")
    print("   Installation: pip install scikit-learn")
else:
    print("✓ sklearn disponible")

In [None]:
# Add the spatial density flag on a copy of the balanced dataset
df_with_density = add_spatial_density_flag(
    df_balanced.copy(),
    eps_km=0.5,  # 500 m
    min_samples=5
)

# Cast to bool before counting: `~` on a non-boolean column (e.g. 0/1 integers)
# is a bitwise NOT and would give a wrong isolated count. The isolated count is
# derived by subtraction so the two numbers always add up to the total.
is_dense = df_with_density['is_dense'].astype(bool)
n_total = len(df_with_density)
dense_count = is_dense.sum()
isolated_count = n_total - dense_count

# Guard the percentages so an empty frame reports 0% instead of dividing by zero
dense_pct = dense_count / n_total * 100 if n_total else 0.0
isolated_pct = isolated_count / n_total * 100 if n_total else 0.0

print("\nRésultat:")
print(f"  Photos denses (en cluster): {dense_count:,} ({dense_pct:.1f}%)")
print(f"  Photos isolées (noise): {isolated_count:,} ({isolated_pct:.1f}%)")

print("\n💡 Usage:")
print("  - Garder tout pour analyse globale")
print("  - Filtrer isolées pour focus sur POI majeurs")
print("  - Analyser isolées séparément (événements ponctuels?)")

## 7. Synthèse - Choix méthodologiques

### Tableau récapitulatif

In [None]:
def _summarize_version(version, df, usage):
    """Build one row of the summary table for a dataset version.

    Args:
        version: Label shown in the 'Version' column.
        df: Dataframe for that version; must have 'lat', 'lon',
            'has_valid_date' and 'text_merged' columns.
        usage: Recommended usage shown in the 'Usage' column.

    Returns:
        dict with the keys Version, Photos, GPS, Dates, Texte, Usage.
    """
    return {
        'Version': version,
        'Photos': len(df),
        # Rows where neither lat nor lon is missing
        'GPS': df[['lat', 'lon']].notna().all(axis=1).sum(),
        'Dates': df['has_valid_date'].sum(),
        # fillna('') so missing text is not counted as non-empty
        'Texte': df['text_merged'].fillna('').ne('').sum(),
        'Usage': usage,
    }


# Summary table: one row per dataset version, computed by the same helper
# instead of four copy-pasted dict literals
synthese = pd.DataFrame([
    _summarize_version('Baseline (conservatif)', df_clean, 'Clustering spatial'),
    _summarize_version('+ Filtre texte', df_text_ready, 'Description zones'),
    _summarize_version('+ Stop-tags', df_filtered_tags, 'TF-IDF optimisé'),
    _summarize_version('+ Équilibrage users', df_balanced, 'Densité non-biaisée'),
])

print("\n" + "="*80)
print("SYNTHÈSE - Versions du dataset")
print("="*80)
print(synthese.to_string(index=False))

print("\n" + "="*80)
print("RECOMMANDATION")
print("="*80)
print("""
1. Clustering spatial (KMeans/DBSCAN/Hierarchical):
   → Utiliser BASELINE (conservatif, max données)

2. Text mining (TF-IDF, association rules):
   → Utiliser + Stop-tags (tags discriminants)

3. Analyse temporelle:
   → Utiliser BASELINE + filtrer sur 'has_valid_date'

4. Validation milestones:
   → Tester AVEC et SANS filtres avancés
   → Documenter impact sur résultats
   → Justifier choix selon objectif

✅ Approche défendable à l'oral:
   "Nettoyage conservatif + filtres optionnels documentés"
""")

## 8. Export des versions

Sauvegarder les différentes versions pour analyse ultérieure

In [None]:
# Output directory for the dataset versions; parents=True so the cell also
# works when an intermediate directory is missing
versions_dir = Path('../data/versions')
versions_dir.mkdir(parents=True, exist_ok=True)

# Export is optional and off by default (it used to be commented-out code).
# Set to True to write every version as parquet.
SAVE_VERSIONS = False

# (display label, file stem, dataframe) for each version
dataset_versions = [
    ('Baseline', 'baseline', df_clean),
    ('Text-ready', 'text_ready', df_text_ready),
    ('Filtered-tags', 'filtered_tags', df_filtered_tags),
    ('Balanced', 'balanced', df_balanced),
]

if SAVE_VERSIONS:
    for _, stem, frame in dataset_versions:
        frame.to_parquet(versions_dir / f'{stem}.parquet')

print("✓ Versions prêtes pour analyse")
for label, _, frame in dataset_versions:
    print(f"  {label}: {len(frame):,} photos")

---

## Conclusion

### Ce notebook démontre:

1. ✅ **Cleaning conservatif de base** = optimal pour projet académique
2. ✅ **Filtres optionnels** = adaptables selon objectif analyse
3. ✅ **Traçabilité complète** = justification méthodologique
4. ✅ **Flexibilité** = tester plusieurs approches

### Pour les milestones suivants:

- **Milestone 1** (Exploration): Utiliser baseline + visualisations
- **Milestone 2** (Clustering): Tester baseline vs balanced
- **Milestone 3** (Text mining): Utiliser filtered_tags
- **Milestone 4** (Temporel): Filtrer sur has_valid_date

### Réponse jury:

> *"Peut-on nettoyer davantage?"*

**→** Oui, mais nous avons volontairement choisi un cleaning conservatif pour ne pas perdre d'information utile. Des nettoyages plus agressifs (filtrage sémantique, pondération utilisateurs, filtrage par densité) ont été identifiés comme pistes d'amélioration et peuvent être activés selon l'objectif (zones touristiques vs événements).