In [2]:
import pandas as pd
df=pd.read_csv('filtered_maids_pingsink.csv')

# Create bins for pingsink scores
bins = [0, 0.2, 0.5, 0.75, 0.9, 1.0]
labels = ['0-20%', '20-50%', '50-75%', '75-90%', '90-100%']

# Add a new column with binned pingsink scores
df['pingsink_bin'] = pd.cut(df['pingsink'], bins=bins, labels=labels, right=True)

# Filter to show only home, work, leisure categories
categories_of_interest = ['home', 'work', 'leisure']
df_filtered = df[df['category'].isin(categories_of_interest)]

# Create a pivot table to count geohashes for each category and pingsink bin
pingsink_stats = pd.pivot_table(
    data=df_filtered,
    index='category',
    columns='pingsink_bin',
    values='geohash',
    aggfunc='nunique',
    fill_value=0
)

# Reindex to ensure all categories are present
pingsink_stats = pingsink_stats.reindex(categories_of_interest)

# Calculate percentage distribution across pingsink bins for each category
pingsink_pct = pingsink_stats.div(pingsink_stats.sum(axis=1), axis=0) * 100

# Display the statistics tables
print("Count of unique geohashes by category and pingsink:")
display(pingsink_stats)

print("\nPercentage distribution by category and pingsink:")
display(pingsink_pct.round(2))

Count of unique geohashes by category and pingsink:


  pingsink_stats = pd.pivot_table(


pingsink_bin,0-20%,20-50%,50-75%,75-90%,90-100%
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
home,21,53,69,9,49
work,8,34,53,6,46
leisure,2,49,127,20,76



Percentage distribution by category and pingsink:


pingsink_bin,0-20%,20-50%,50-75%,75-90%,90-100%
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
home,10.45,26.37,34.33,4.48,24.38
work,5.44,23.13,36.05,4.08,31.29
leisure,0.73,17.88,46.35,7.3,27.74


In [6]:
pd.set_option('display.max_columns', None)
s=0.6
e=0.9
sugges_pingsink=df[(df['pingsink']>s) & (df['pingsink']<e)].sort_values(by=['pingsink','confidence'],ascending=False)

In [8]:
import plotly.graph_objects as go
sample_pings=sugges_pingsink.sample(min(10,len(sugges_pingsink)))

for i in range(len(sample_pings)):
    p=sample_pings.reset_index()[i:i+1]
    fig = go.Figure()

    # Add scatter points for pingsink locations
    fig.add_trace(go.Scattermapbox(
        lat=p['lat'],
        lon=p['lon'],
        mode='markers',
        marker=dict(
            size=10,
            color=p['pingsink'],
            colorscale='Reds',
            showscale=False,
            colorbar=dict(title="Pingsink Score")
        ),
        text=[f"Category: {cat}<br>Confidence: {conf:.3f}<br>Pingsink: {ps:.3f}<br>Spread: {spread:.3f}<br>Pings: {pings}" 
              for cat, conf, ps, spread, pings in zip(p['category'], 
                                         p['confidence'], 
                                         p['pingsink'],
                                         p['spread'],
                                         p['pings'])],
        hoverinfo='text'
    ))

    fig.update_layout(
        mapbox=dict(
            style="open-street-map",
            center=dict(
                lat=p['lat'].mean(),
                lon=p['lon'].mean()
            ),
            zoom=17
        ),
        title=f"Suggested Pingsink Location {i+1}",
        height=600
    )

    fig.show()



*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [9]:
import pandas as pd
sample=pd.read_pickle('/home/hieu/Work/new_casacom/data/processed_all/__1TxI62aHPiUk7txktnoGVAmLbn8eja_sdW5GW17Mie3B_g0n78C8LBDA9_YTQ7.pkl')

In [18]:
from pymongo import MongoClient
client = MongoClient('mongodb://localhost:27017/')

In [23]:
db=client['casacom']
db['maids'].drop()
collection = db['maids']

In [None]:
import json
from datetime import datetime, date
import pytz
try:
    import pandas as pd
except ImportError:
    pd = None

from pymongo import MongoClient
from bson import json_util 

def clean_data(obj):
    """
    Recursively clean for MongoDB:
    - Convert non-string dict keys to str(k).
    - Convert set to list.
    - Convert date to ISO string.
    - Convert pd.Timestamp to datetime.
    """
    if isinstance(obj, dict):
        new_dict = {}
        for k, v in obj.items():
            str_key = str(k) if not isinstance(k, str) else k
            new_dict[str_key] = clean_data(v)
        return new_dict
    elif isinstance(obj, list):
        return [clean_data(item) for item in obj]
    elif isinstance(obj, set):
        return list(obj)    
    elif isinstance(obj, date):
        return obj.isoformat()
    elif pd and isinstance(obj, pd.Timestamp):
        return obj.to_pydatetime()
    elif isinstance(obj, datetime):
        if obj.tzinfo is None:
            obj = obj.replace(tzinfo=pytz.UTC)
        return obj
    else:
        return obj
sample_copy = sample

cleaned_doc = clean_data(sample_copy)

result = collection.replace_one(
    {'_id': cleaned_doc['maid']},
    cleaned_doc,
    upsert=True
)

In [None]:
#data phai la list,phai la datetime, toàn bộ phải là json,key phai la str    

In [25]:
import glob
ds=glob.glob('/home/hieu/Work/new_casacom/data/processed_all/*')
docs=[pd.read_pickle(d) for d in ds]

In [27]:
client = MongoClient('mongodb://localhost:27017/')
db = client['store_db']
collection = db['stores']

In [None]:
from pymongo import UpdateOne
ops = []
for d in docs:
    cleaned_doc = clean_data(d)
    op = UpdateOne(
        {'_id': cleaned_doc['maid']},
        {'$set': cleaned_doc},
        upsert=True
    )
    ops.append(op)
collection.bulk_write(ops)
client.close()
