In [39]:
!pip install "dask[complete]"
!pip install pyarrow pandas



In [40]:
import pandas as pd
import dask.dataframe as dd
import glob
from google.colab import drive


In [41]:

# Attach Google Drive to the Colab filesystem so the data files can be read by path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:

# Directory containing the data files (assumed to be parquet, one file per day — TODO confirm)
data_dir = '/content/drive/MyDrive/Data_hasking'

# How many files to load; adjust to widen or narrow the analysis window.
n_files = 10

# Take the first `n_files` parquet files in lexicographic filename order.
# NOTE(review): presumably the filenames sort chronologically — confirm.
file_list = sorted(glob.glob(f"{data_dir}/*.parquet"))[:n_files]

# glob returns an empty list (not an error) for a wrong or unmounted path, so
# fail here with a clear message instead of handing an empty list to the reader.
if not file_list:
    raise FileNotFoundError(
        f"No parquet files found in {data_dir!r}; is Google Drive mounted and the path correct?"
    )

# Build a lazy Dask DataFrame over the selected files
ddf = dd.read_parquet(file_list)

# Materialise the whole dataset as one in-memory pandas DataFrame.
# Every column of every selected file has to fit in RAM at this point.
df = ddf.compute()


In [43]:

# Convert VALUEDATE to datetime. errors='coerce' turns unparseable values into NaT
# instead of raising, so count how many values were lost that way and report it
# rather than discarding them silently.
valuedate_parsed = pd.to_datetime(df['VALUEDATE'], errors='coerce')

# Values that were present before parsing but are NaT afterwards.
# Must be computed before the column is overwritten below.
n_unparseable = int(valuedate_parsed.isna().sum() - df['VALUEDATE'].isna().sum())
if n_unparseable:
    print(f"Warning: {n_unparseable} VALUEDATE value(s) could not be parsed and were set to NaT")

df['VALUEDATE'] = valuedate_parsed


In [44]:
# Bucket every transaction into a 3-day window, then count transactions per
# (customer, window) pair.
# NOTE(review): floor('3D') yields fixed bins that are not aligned to the first
# day of the data, so the first and last windows may cover fewer than 3 days
# of data — confirm this is intended.
df['transaction_period'] = df['VALUEDATE'].dt.floor('3D')
transactions_by_customer = (
    df.groupby(['CUST_CUSTNO', 'transaction_period'])['VALUEDATE']
    .count()
    .reset_index(name='transaction_count')
)


In [46]:
# Show every (customer, 3-day window) row with at least 300 transactions
transactions_by_customer.loc[transactions_by_customer['transaction_count'].ge(300)]

Unnamed: 0,CUST_CUSTNO,transaction_period,transaction_count
226,000f99e1d656961a9b6c86837647f0b7aae4a67b2e321d...,2023-07-31,3102
227,000f99e1d656961a9b6c86837647f0b7aae4a67b2e321d...,2023-08-03,4802
228,000f99e1d656961a9b6c86837647f0b7aae4a67b2e321d...,2023-08-06,3987
229,000f99e1d656961a9b6c86837647f0b7aae4a67b2e321d...,2023-08-09,3891
13383,038e1ec61bdafa613e469cd6dfa768d6c4e85820adfdb1...,2023-08-03,497
...,...,...,...
950296,fbfecc17f7ad27f762e1675b71fd332cd4d90d07998785...,2023-08-06,1490
950297,fbfecc17f7ad27f762e1675b71fd332cd4d90d07998785...,2023-08-09,1237
960478,febc0044001bed1df6ccdba70e19d51170c757ff9b5912...,2023-08-03,632
960479,febc0044001bed1df6ccdba70e19d51170c757ff9b5912...,2023-08-06,605


In [45]:
# Distinct customers having at least one 3-day window with 300 or more transactions
frequent_high_transactors = (
    transactions_by_customer
    .loc[transactions_by_customer['transaction_count'].ge(300), 'CUST_CUSTNO']
    .unique()
)


In [47]:
# Display the array of flagged customer identifiers (CUST_CUSTNO values)
frequent_high_transactors


<ArrowStringArray>
['000f99e1d656961a9b6c86837647f0b7aae4a67b2e321d8a39bd6d7a4122b5ef',
 '038e1ec61bdafa613e469cd6dfa768d6c4e85820adfdb14a8c05507904de63d4',
 '03b89f003aa2d9fddb0e651fc1703eca0bc8dd2446e9b7e78e3738325b64bea0',
 '045f9f9223bc1ae42cfcd950a93c5d4f645c25b2370f41cf934881e3aeb3f8a4',
 '06820916cbc2b5288b1a4bab9c33353ae2d4663412d649fdd94093531be9fb46',
 '0817ad1db4e6f92db5eaaf1c35a3c8d74d7aee2f472c87c269390266dc53e4bb',
 '090b24e00336e8d1b7b46c91dd73a1af61a50f7a0bd6e7b0693592a61d99e5d8',
 '097585d1fe09d306fdd731023196eee61025c5c0011e844473ccd6766f79c28e',
 '0abeba6f01699c3f1be4a2718130acf6376fde4fa2460dc40eef62a72ce2783a',
 '0d687c176172c6450adcc1f064f30eabb27ff3b2af80490a23c42c2922e45d59',
 ...
 'f407a5d8bb03c8c52d5be5935a443ccfee8326ba09c1dd9f162f97c4e92e35fd',
 'f6edf4c393ff362944659129046bfc0c8742ea4f81d32d22e3096a3c4b00ee4e',
 'f88658460c792c72605b63e3176c6c6e42fbe7cb85cf29ad218829b659075376',
 'f8fa376e6a86248748ff3c17be48f61bfa014bacbdf87400c4cb1dd73763e853',
 'f904437c