In [None]:
import pandas as pd
import datetime
import glob
import os


input_dir = "Path for the raw dataset directory"
output_dir = "Path for the CSVs directory"
os.makedirs(output_dir, exist_ok=True)

# Column names, in file order; the raw TXT files are read without a header row.
col_list = ['squareid', 'datetime', 'countryCode', 'smsin', 'smsout', 'callin', 'callout', 'internet']


def split_file_by_square(file, output_dir, written_paths):
    """Read one raw TXT file and write its rows into one CSV per square.

    Parameters
    ----------
    file : str
        Path of a tab-separated raw file whose columns follow ``col_list``.
    output_dir : str
        Directory that receives the ``square<id>.csv`` files.
    written_paths : set
        Output paths already written during the current run; updated in place.
        A path that is not yet in the set is (re)created with a header row,
        a path that is already in the set is appended to without a header.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with ``datetime`` converted from epoch milliseconds
        to timezone-naive CET wall-clock time.
    """
    df = pd.read_csv(file, sep='\t', header=None, names=col_list)

    # Epoch milliseconds -> UTC -> CET, then drop the tz info so the CSVs
    # hold plain local timestamps.
    df['datetime'] = (
        pd.to_datetime(df['datetime'], unit='ms', utc=True)
        .dt.tz_convert('CET')
        .dt.tz_localize(None)
    )

    # Write each square's rows directly to its own CSV.
    for square_id, group in df.groupby('squareid'):
        out_path = f"{output_dir}/square{square_id}.csv"
        # The first write of a run truncates the file. Checking for the file
        # on disk instead would make a re-run of this cell append a second
        # copy of every row to the CSVs left over from the previous run.
        first_write = out_path not in written_paths
        group.to_csv(out_path, mode='w' if first_write else 'a', header=first_write, index=False)
        written_paths.add(out_path)

    return df


# glob returns paths in arbitrary order; sorting keeps the processing order
# (and therefore the row order inside each square CSV) deterministic.
filenames = sorted(glob.glob(f"{input_dir}/*.txt"))
print(f"Found {len(filenames)} TXT files.")

# Process each file one by one so only a single raw file is in memory at a time.
written_paths = set()
for file in filenames:
    print(f"Processing {file} ...")
    df = split_file_by_square(file, output_dir, written_paths)

print("✅ Done! All squares saved.")


Found 62 TXT files.
Processing /content/drive/MyDrive/Eventum Solutions/Telecom_TN_Data/sms-call-internet-tn-2013-11-01.txt ...
Processing /content/drive/MyDrive/Eventum Solutions/Telecom_TN_Data/sms-call-internet-tn-2013-11-02.txt ...
Processing /content/drive/MyDrive/Eventum Solutions/Telecom_TN_Data/sms-call-internet-tn-2013-11-03.txt ...
Processing /content/drive/MyDrive/Eventum Solutions/Telecom_TN_Data/sms-call-internet-tn-2013-11-04.txt ...
Processing /content/drive/MyDrive/Eventum Solutions/Telecom_TN_Data/sms-call-internet-tn-2013-11-05.txt ...
Processing /content/drive/MyDrive/Eventum Solutions/Telecom_TN_Data/sms-call-internet-tn-2013-11-06.txt ...
Processing /content/drive/MyDrive/Eventum Solutions/Telecom_TN_Data/sms-call-internet-tn-2013-11-07.txt ...
Processing /content/drive/MyDrive/Eventum Solutions/Telecom_TN_Data/sms-call-internet-tn-2013-11-08.txt ...
Processing /content/drive/MyDrive/Eventum Solutions/Telecom_TN_Data/sms-call-internet-tn-2013-11-09.txt ...
Processi

# Checking if all the squares are included

In [None]:
import os
import glob

folder_path = "Path of the CSVs directory"

# Gather every CSV in the folder and report how many there are
csv_pattern = os.path.join(folder_path, "*.csv")
csv_files = glob.glob(csv_pattern)
csv_count = len(csv_files)
print(f"Number of CSV files: {csv_count}")

Number of CSV files: 6259


# Checking with a random square CSV file

In [None]:
# Load one square's CSV as a spot check; the path is relative to the current working directory.
# No parse_dates is passed, so the datetime column is not converted to timestamps here.
sq = pd.read_csv("square38.csv")

In [None]:
# Smallest value of the datetime column for this square (displayed as the cell output).
sq['datetime'].min()

'2013-11-01 00:00:00'

In [None]:
# Largest value of the datetime column for this square (displayed as the cell output).
sq['datetime'].max()

'2014-01-01 23:50:00'

# Check whether all the squares cover the same date and time range, or whether some squares have missing data.

In [None]:
import pandas as pd
import glob
import os


output_dir = "Path of csvs"
expected_start = pd.Timestamp("2013-11-01 00:00:00")
expected_end   = pd.Timestamp("2014-01-01 23:50:00")


def check_datetime_ranges(csv_files, expected_start, expected_end):
    """Compare the first and last timestamp of each CSV with the expected range.

    Parameters
    ----------
    csv_files : list of str
        Paths of CSV files that contain a ``datetime`` column.
    expected_start, expected_end : pandas.Timestamp
        The minimum and maximum ``datetime`` every file is expected to have.

    Returns
    -------
    problems : list of tuple
        ``(file name, min datetime, max datetime)`` for every file whose
        minimum or maximum differs from the expected value.
    unreadable : list of tuple
        ``(file path, error message)`` for every file that could not be
        read or compared.
    """
    problems = []
    unreadable = []

    for file in csv_files:
        try:
            # Only the datetime column is needed for a min/max check.
            df = pd.read_csv(file, usecols=['datetime'], parse_dates=['datetime'])
            min_dt = df['datetime'].min()
            max_dt = df['datetime'].max()

            if (min_dt != expected_start) or (max_dt != expected_end):
                problems.append((os.path.basename(file), min_dt, max_dt))

        except Exception as e:
            # Keep going, but remember the failure so that the final report
            # cannot claim success for files that were never checked.
            print(f"❌ Error reading {file}: {e}")
            unreadable.append((file, str(e)))

    return problems, unreadable


# Check each CSV (sorted so the report order is deterministic)
csv_files = sorted(glob.glob(os.path.join(output_dir, "*.csv")))
print(f"Found {len(csv_files)} CSV files.")

problems, unreadable = check_datetime_ranges(csv_files, expected_start, expected_end)

# ==== Report ====
if problems:
    print("\n⚠️ Files with unexpected datetime ranges:")
    for name, min_dt, max_dt in problems:
        print(f"{name}: min={min_dt}, max={max_dt}")

if unreadable:
    print("\n❌ Files that could not be checked:")
    for file, reason in unreadable:
        print(f"{file}: {reason}")

if not csv_files:
    print("\n⚠️ No CSV files found; nothing was checked.")
elif not problems and not unreadable:
    print("\n✅ All files match the expected datetime range.")


Found 6259 CSV files.

⚠️ Files with unexpected datetime ranges:
square6403.csv: min=2013-11-01 00:20:00, max=2014-01-01 23:40:00
square6993.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square6995.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7110.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7111.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7112.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7232.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7233.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7235.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7467.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7585.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7586.csv: min=2013-11-01 08:00:00, max=2014-01-01 23:50:00
square7644.csv: min=2013-11-01 01:30:00, max=2014-01-01 23:40:00
square7849.csv: min=2013-11-01 05:00:00, max=2014-01-01 23:50:00
square8546.csv: min=2013-