In [63]:
import sys
import os
from google.colab import drive

# Make the Drive-hosted project importable and run the kernel from its root.

# 1. Attach Google Drive to the Colab filesystem
drive.mount('/content/drive')

# 2. Absolute location of the project root on Drive
project_root = "/content/drive/MyDrive/OmniScale-Optimizer"

# 3. Register the root on sys.path only when it is not listed yet
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# 4. Work from the project root so relative paths resolve against it
os.chdir(project_root)

print(
    f"Current Working Directory: {os.getcwd()}",
    "Python Path updated. You can now import from 'src'.",
    sep="\n",
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current Working Directory: /content/drive/MyDrive/OmniScale-Optimizer
Python Path updated. You can now import from 'src'.


In [None]:
from google.colab import drive
import os
from google.colab import auth
# Authenticate the Colab session, then (re)mount Google Drive.
auth.authenticate_user()

drive.mount('/content/drive', force_remount=True)

# Create the main project folder in Drive (announced only when it is new)
project_path = "/content/drive/MyDrive/OmniScale-Optimizer"
if not os.path.exists(project_path):
    os.makedirs(project_path)
    print(f"Created project folder at {project_path}")

# Change the current working directory to the project folder
os.chdir(project_path)

# Directory skeleton, relative to the project folder
folders = [
    "data/raw",
    "data/processed",
    "notebooks",
    "src/parser",
    "src/models",
    "src/optimizer/cpp_core",
    "src/distributed",
    "tests",
    "scripts"
]

for folder in folders:
    path = os.path.join(project_path, folder)
    os.makedirs(path, exist_ok=True)

# Package markers that make the src/ sub-folders importable Python packages
init_files = [
    "src/__init__.py",

    "src/parser/__init__.py",
    "src/models/__init__.py",
    "src/optimizer/__init__.py",
    "src/distributed/__init__.py"
]

for file in init_files:
    # Append mode creates the file when it is missing but never truncates an
    # existing __init__.py, so re-running this cell cannot wipe package code.
    with open(os.path.join(project_path, file), 'a') as f:
        pass

print("Project structure created successfully!")

In [22]:
%%writefile src/parser/stream_parser.py
import json

def stream_amazon_data(file_path):
    """Lazily yield one parsed record per line of a JSON-lines file (Phase 1: Parsing).

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Yields:
        The object decoded by ``json.loads`` for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps decoding independent of the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a stray newline at EOF) carry no record; feeding
            # them to json.loads would raise, so skip them.
            if not line.strip():
                continue
            yield json.loads(line)

if __name__ == "__main__":
    print("Stream Parser Module Initialized")

Overwriting src/parser/stream_parser.py


In [24]:

# Report what currently lives in the project folder on Drive, if it exists
project_path = "/content/drive/MyDrive/OmniScale-Optimizer"
if not os.path.exists(project_path):
    print("Folder doesn't exist yet")
else:
    print(f"Contents of {project_path}:")
    for item in os.listdir(project_path):
        print(f"  {item}")


Contents of /content/drive/MyDrive/OmniScale-Optimizer:
  data
  notebooks
  src
  tests
  scripts


In [27]:
# Long-format listing (hidden entries included) of the project root on Drive.
!ls -la /content/drive/MyDrive/OmniScale-Optimizer/

total 20
drwx------ 2 root root 4096 Feb 20 03:48 data
drwx------ 2 root root 4096 Feb 20 03:48 notebooks
drwx------ 2 root root 4096 Feb 20 03:48 scripts
drwx------ 3 root root 4096 Feb 20 03:49 src
drwx------ 2 root root 4096 Feb 20 03:48 tests


In [29]:
import os
# Sanity check: show the kernel's working directory and its entries
print("Current directory:", os.getcwd())
print("Contents:", os.listdir(os.curdir))


Current directory: /content/drive/MyDrive/OmniScale-Optimizer
Contents: ['data', 'notebooks', 'src', 'tests', 'scripts']


In [30]:
!pip install pybind11

Collecting pybind11
  Downloading pybind11-3.0.2-py3-none-any.whl.metadata (10 kB)
Downloading pybind11-3.0.2-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.2/310.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pybind11
Successfully installed pybind11-3.0.2


In [31]:
from google.colab import drive
import sys
import os

# 1. Mount Drive (look for a notification/popup in your browser window if it hangs)
drive.mount('/content/drive')

# 2. Define the project path and make it importable.
#    Guarded so that re-running this cell does not pile up duplicate
#    sys.path entries.
project_root = '/content/drive/MyDrive/OmniScale-Optimizer'
if project_root not in sys.path:
    sys.path.append(project_root)

# 3. Change directory so relative paths resolve against the project root
os.chdir(project_root)

print(f"Current Working Directory: {os.getcwd()}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current Working Directory: /content/drive/MyDrive/OmniScale-Optimizer


In [33]:
import os
# Switch the working directory to the raw-data folder.
# Drive must already be mounted, otherwise this path does not exist.
raw_data_path = "/content/drive/MyDrive/OmniScale-Optimizer/data/raw"
os.chdir(raw_data_path)
print("Current directory: " + os.getcwd())

Current directory: /content/drive/MyDrive/OmniScale-Optimizer/data/raw


In [35]:
# Download the 5-core Electronics dataset (approx 400MB compressed, 1.5GB uncompressed)
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz

--2026-02-20 04:28:25--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 495854086 (473M) [application/x-gzip]
Saving to: ‘reviews_Electronics_5.json.gz.1’


2026-02-20 04:28:42 (29.1 MB/s) - ‘reviews_Electronics_5.json.gz.1’ saved [495854086/495854086]



In [36]:
# Decompress the archive in the current directory, producing
# reviews_Electronics_5.json (the .gz itself is no longer present afterwards).
!gunzip reviews_Electronics_5.json.gz

In [37]:
# Human-readable listing of the current directory to confirm the extraction.
!ls -lh

total 1.9G
-rw------- 1 root root 1.4G Apr 26  2016 reviews_Electronics_5.json
-rw------- 1 root root 473M Apr 26  2016 reviews_Electronics_5.json.gz.1


In [39]:

import json

# Count reviews and distinct reviewers in the raw JSON-lines dump.
count = 0        # records that parsed as JSON objects
malformed = 0    # non-blank lines that were not a JSON object
seen = set()     # reviewerID values encountered so far
duplicates = 0   # reviews whose reviewerID had already been seen

with open('/content/drive/MyDrive/OmniScale-Optimizer/data/raw/reviews_Electronics_5.json', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # a blank line is not a review
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            # Count bad lines instead of hiding every error (including
            # KeyboardInterrupt) behind a bare `except: pass`.
            malformed += 1
            continue
        if not isinstance(data, dict):
            malformed += 1
            continue
        count += 1
        reviewer_id = data.get('reviewerID', '')
        if reviewer_id in seen:
            duplicates += 1
        seen.add(reviewer_id)

print(f"Total reviews: {count}")
print(f"Unique reviewers: {len(seen)}")
print(f"Potential duplicates: {duplicates}")
if malformed:
    print(f"Malformed lines skipped: {malformed}")


Total reviews: 1689188
Unique reviewers: 192403
Potential duplicates: 1496785


In [40]:

import json

# Check for true duplicates (same reviewerID + asin)
seen_pairs = set()      # (reviewerID, asin) pairs encountered so far
true_duplicates = 0     # reviews repeating an already-seen pair

with open('/content/drive/MyDrive/OmniScale-Optimizer/data/raw/reviews_Electronics_5.json', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            # Unparseable lines are still skipped, but only JSON errors are
            # tolerated now; a bare `except` would also hide real bugs.
            continue
        if not isinstance(data, dict):
            continue
        pair = (data.get('reviewerID', ''), data.get('asin', ''))
        if pair[0] and pair[1]:  # both identifiers are present
            if pair in seen_pairs:
                true_duplicates += 1
            seen_pairs.add(pair)

print(f"True duplicates (same user + same product): {true_duplicates}")


True duplicates (same user + same product): 0


In [53]:
# Put the project root at the front of the import path, then load the parser module
import sys
sys.path.insert(0, '/content/drive/MyDrive/OmniScale-Optimizer')

import src.parser.stream_parser as sp_module

# Show the module's public names (those without a leading underscore)
print("Available in stream_parser.py:")
print(list(filter(lambda name: not name.startswith('_'), dir(sp_module))))

Available in stream_parser.py:
['json', 'stream_amazon_data']


In [69]:
%%writefile src/parser/feature_miner.py
import numpy as np
import pandas as pd
from src.parser.stream_parser import stream_amazon_data

class FeatureMiner:
    """Builds an interaction table from the raw review stream and clusters points."""

    def __init__(self, file_path):
        """Store the path that is later handed to ``stream_amazon_data``.

        Args:
            file_path: Location of the raw JSON-lines review file.
        """
        self.file_path = file_path

    def extract_interactions(self, limit=100000):
        """Extract user/item/rating rows and attach a synthetic lat/lon to each.

        The coordinates are not read from the data: every ROW gets its own
        uniform draw (lat in [30, 45), lon in [-120, -70)) from NumPy's global
        random state, so the same user can appear at different locations.
        Call ``np.random.seed`` beforehand for a reproducible table.

        Args:
            limit: Maximum number of records to read; ``None`` reads them all.

        Returns:
            pandas.DataFrame with columns user_id, item_id, rating, lat, lon
            (an empty frame without columns when no record was read).
        """
        data = []
        gen = stream_amazon_data(self.file_path)

        for i, record in enumerate(gen):
            if limit is not None and i >= limit:
                break
            data.append({
                'user_id': record.get('reviewerID'),
                'item_id': record.get('asin'),
                'rating': record.get('overall'),
                'lat': np.random.uniform(30, 45),
                'lon': np.random.uniform(-120, -70)
            })
        return pd.DataFrame(data)

    @staticmethod
    def _nearest_centroid(points, centroids):
        """Return, for each point, the index of its closest centroid (Euclidean)."""
        distances = np.linalg.norm(points[:, np.newaxis] - centroids, axis=2)
        return np.argmin(distances, axis=1)

    def manual_kmeans(self, points, k, max_iters=100):
        """K-means (Lloyd's algorithm) implemented from scratch (Data Mining).

        The initial centroids are ``k`` distinct rows of ``points`` chosen
        uniformly at random via NumPy's global random state. This is plain
        random seeding, not the distance-weighted k-means++ scheme.

        Args:
            points: Array of shape (n_samples, n_features).
            k: Number of clusters; ``np.random.choice`` raises ValueError when
                it exceeds n_samples.
            max_iters: Upper bound on the number of centroid updates.

        Returns:
            Tuple ``(centroids, labels)``: centroids has shape (k, n_features)
            and ``labels[i]`` is the index of the returned centroid nearest to
            ``points[i]``.
        """
        centroids = points[np.random.choice(points.shape[0], k, replace=False)]
        for _ in range(max_iters):
            labels = self._nearest_centroid(points, centroids)
            updated = []
            for i in range(k):
                members = points[labels == i]
                # A cluster that lost all its points keeps its previous centroid.
                updated.append(members.mean(axis=0) if len(members) > 0 else centroids[i])
            new_centroids = np.array(updated)
            if np.allclose(centroids, new_centroids):
                break
            centroids = new_centroids
        else:
            # Iterations ran out (or max_iters == 0): the centroids were moved
            # after the last labelling, so label once more against the
            # centroids that are actually returned.
            labels = self._nearest_centroid(points, centroids)
        return centroids, labels

Writing src/parser/feature_miner.py


In [70]:
from src.parser.feature_miner import FeatureMiner

# Location of the raw review dump on Drive
raw_json = "/content/drive/MyDrive/OmniScale-Optimizer/data/raw/reviews_Electronics_5.json"

# Stream the first 100k interactions into a DataFrame
miner = FeatureMiner(raw_json)
df_interactions = miner.extract_interactions(limit=100000)

print("Extracted {} interactions.".format(len(df_interactions)))

# Group the (lat, lon) points into 10 delivery zones
points = df_interactions[['lat', 'lon']].values
centroids, labels = miner.manual_kmeans(points, k=10)

# Record the zone each row was assigned to
df_interactions['delivery_zone'] = labels

# Persist the processed table on Drive
df_interactions.to_csv(
    "/content/drive/MyDrive/OmniScale-Optimizer/data/processed/clean_data.csv",
    index=False,
)
print("Data saved to Drive!")

Extracted 100000 interactions.
Data saved to Drive!


In [72]:
from src.parser.feature_miner import FeatureMiner

# Smoke test: build a miner on the raw dump and pull a small sample
raw_json = "/content/drive/MyDrive/OmniScale-Optimizer/data/raw/reviews_Electronics_5.json"
miner = FeatureMiner(raw_json)

df = miner.extract_interactions(limit=100)
print("Worked! Extracted {} rows.".format(len(df)))

Worked! Extracted 100 rows.


In [77]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

# 1. Load the interactions mined in Phase 1
df = pd.read_csv("/content/drive/MyDrive/OmniScale-Optimizer/data/processed/clean_data.csv")

# 2. Encode the string IDs as integer category codes
for id_col, idx_col in (('user_id', 'user_idx'), ('item_id', 'item_idx')):
    df[idx_col] = df[id_col].astype('category').cat.codes

# 3. Vocabulary sizes (these size the model's Embedding layers)
num_users = df['user_idx'].nunique()
num_items = df['item_idx'].nunique()

print("Dataset has {} users and {} items.".format(num_users, num_items))

# 4. 80/20 train/test split with a fixed seed
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Persist the two counts as "users,items" for later use
with open("/content/drive/MyDrive/OmniScale-Optimizer/data/processed/metadata.txt", "w") as f:
    f.write(",".join(str(n) for n in (num_users, num_items)))

Dataset has 60814 users and 4181 items.


In [78]:
%%writefile src/models/ncf_model.py
import torch
import torch.nn as nn

class NCFModel(nn.Module):
    """Neural collaborative filtering scorer: user/item embeddings fed to an MLP."""

    def __init__(self, num_users, num_items, embed_size=32):
        """Create the embedding tables and the scoring network.

        Args:
            num_users: Number of rows in the user embedding table.
            num_items: Number of rows in the item embedding table.
            embed_size: Width of each embedding vector.
        """
        super().__init__()

        # 1. Embedding Layers (lookup tables indexed by integer id)
        self.user_embed = nn.Embedding(num_users, embed_size)
        self.item_embed = nn.Embedding(num_items, embed_size)

        # 2. Neural Network Layers (MLP): 2*embed_size -> 64 -> 32 -> 1
        self.fc_layers = nn.Sequential(
            nn.Linear(embed_size * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Output is a single "Prediction Score"
        )

    def forward(self, user_indices, item_indices):
        """Score each (user, item) pair.

        Args:
            user_indices: Integer tensor of user ids.
            item_indices: Integer tensor of item ids, same shape as user_indices.

        Returns:
            Float tensor with the same shape as the index tensors.
        """
        # Lookup embeddings
        u_emb = self.user_embed(user_indices)
        i_emb = self.item_embed(item_indices)

        # Concatenate user and item vectors along the feature axis
        x = torch.cat([u_emb, i_emb], dim=-1)

        # Pass through the neural network
        prediction = self.fc_layers(x)
        # Drop only the trailing size-1 feature axis. A bare squeeze() would
        # also remove the batch axis for a batch of one, returning a 0-d
        # tensor whose shape no longer matches the target.
        return prediction.squeeze(-1)

Writing src/models/ncf_model.py


In [79]:
from src.models.ncf_model import NCFModel
import torch.optim as optim
import torch.nn as nn

# 1. Choose the compute device (GPU when one is visible, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Build the model on that device
model = NCFModel(num_users, num_items).to(device)

# 3. Adam optimizer paired with a mean-squared-error loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

# 4. Turn the whole training split into tensors on the device
user_train, item_train = (
    torch.LongTensor(train[col].values).to(device) for col in ('user_idx', 'item_idx')
)
ratings_train = torch.FloatTensor(train['rating'].values).to(device)

# 5. Training loop: every epoch is one gradient step over the full training set
model.train()
for epoch in range(5):  # 5 epochs for testing
    optimizer.zero_grad()

    # Forward pass and loss
    outputs = model(user_train, item_train)
    loss = criterion(outputs, ratings_train)

    # Backward pass followed by the parameter update
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# 6. Persist the learned weights on Drive
torch.save(
    model.state_dict(),
    "/content/drive/MyDrive/OmniScale-Optimizer/data/processed/ncf_model.pth",
)
print("Model saved to Drive!")

Using device: cpu
Epoch 1, Loss: 18.2124
Epoch 2, Loss: 17.9507
Epoch 3, Loss: 17.6857
Epoch 4, Loss: 17.4165
Epoch 5, Loss: 17.1423
Model saved to Drive!


In [80]:
%%writefile src/optimizer/solver.py
import numpy as np

class LogisticsOptimizer:
    """Greedy nearest-warehouse assignment under per-warehouse capacity limits."""

    def __init__(self, warehouse_locations, warehouse_capacities):
        """
        warehouse_locations: one coordinate row per warehouse (array or nested
            list), e.g. the Lat/Lon centroids from Phase 1
        warehouse_capacities: max number of orders each warehouse can take,
            one entry per warehouse

        Raises ValueError when the two inputs differ in length.
        """
        # Accept plain lists as well as arrays; list - list is not defined.
        self.warehouses = np.asarray(warehouse_locations, dtype=float)
        self.capacities = warehouse_capacities
        if len(self.warehouses) != len(self.capacities):
            raise ValueError(
                "warehouse_locations and warehouse_capacities must have the same length"
            )

    def calculate_distance(self, p1, p2):
        """Euclidean distance between two points (HPC Optimization target later!)"""
        return np.linalg.norm(np.asarray(p1, dtype=float) - np.asarray(p2, dtype=float))

    def assign_orders(self, user_locations):
        """
        Numerical Optimization: assign users to warehouses to minimize distance
        under capacity constraints.

        Users are processed in input order; each takes the closest warehouse
        that still has spare capacity (greedy, so the result depends on the
        order). Equal distances resolve to the lower warehouse index.

        Returns (assignments, current_usage): assignments[i] is the warehouse
        index for user i, or -1 when every warehouse was full; current_usage[j]
        is the number of users placed in warehouse j.
        """
        users = np.asarray(user_locations, dtype=float)
        num_users = len(users)
        assignments = np.full(num_users, -1)
        current_usage = np.zeros(len(self.warehouses))

        # Without warehouses nobody can be assigned.
        if len(self.warehouses) == 0:
            return assignments, current_usage

        # O(N*M) overall: one distance per (user, warehouse) pair.
        for i in range(num_users):
            # Distances from this user to all warehouses in a single NumPy call
            costs = np.linalg.norm(self.warehouses - users[i], axis=1)

            # Closest first; a stable sort keeps tie-breaking deterministic
            preferred_warehouses = np.argsort(costs, kind="stable")

            # Take the first preferred warehouse that is not yet full
            for w_idx in preferred_warehouses:
                if current_usage[w_idx] < self.capacities[w_idx]:
                    assignments[i] = w_idx
                    current_usage[w_idx] += 1
                    break

        return assignments, current_usage

Writing src/optimizer/solver.py


In [81]:
from src.optimizer.solver import LogisticsOptimizer
import pandas as pd
import numpy as np

# 1. Load the processed interactions
df = pd.read_csv("/content/drive/MyDrive/OmniScale-Optimizer/data/processed/clean_data.csv")

# 2. Warehouse locations: mean lat/lon of the rows in each delivery zone
warehouse_locations = df.groupby('delivery_zone')[['lat', 'lon']].mean().values
num_warehouses = len(warehouse_locations)

# 3. Capacity constraint: each warehouse may take an even share of the
#    orders plus 10% headroom
total_orders = len(df)
capacity_per_warehouse = int((total_orders / num_warehouses) * 1.1)
capacities = [capacity_per_warehouse for _ in range(num_warehouses)]

print(f"Total Orders: {total_orders}")
print(f"Capacity per Warehouse: {capacity_per_warehouse}")

# 4. Build the optimizer and run it on every order's coordinates
optimizer = LogisticsOptimizer(warehouse_locations, capacities)
user_coords = df[['lat', 'lon']].values

print("Solving assignment optimization...")
assignments, final_usage = optimizer.assign_orders(user_coords)

# 5. Attach the result and count orders that could not be placed (-1)
df['assigned_warehouse'] = assignments
unassigned = (assignments == -1).sum()

print("Optimization Complete!")
print(f"Orders unassigned (due to capacity): {unassigned}")
print(f"Warehouse Usage: {final_usage}")

# Save the final plan next to the other processed data
df.to_csv(
    "/content/drive/MyDrive/OmniScale-Optimizer/data/processed/final_shipping_plan.csv",
    index=False,
)

Total Orders: 100000
Capacity per Warehouse: 11000
Solving assignment optimization...
Optimization Complete!
Orders unassigned (due to capacity): 0
Warehouse Usage: [10129.  9803. 10362.  8851. 10026. 10144. 10774.  9049. 10625. 10237.]


In [6]:
import os
from google.colab import drive

# 1. Remount Drive from scratch
drive.mount('/content/drive', force_remount=True)

# 2. Switch to the project root
project_root = "/content/drive/MyDrive/OmniScale-Optimizer"
os.chdir(project_root)

# 3. Make sure the C++ source folder exists (no-op when it already does)
os.makedirs(os.path.join("src", "optimizer", "cpp_core"), exist_ok=True)

print("Current Directory: {}".format(os.getcwd()))
# Expected output: /content/drive/MyDrive/OmniScale-Optimizer

Mounted at /content/drive
Current Directory: /content/drive/MyDrive/OmniScale-Optimizer


In [19]:
%%writefile src/optimizer/cpp_core/optimizer.cpp
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <vector>
#include <cmath>
#include <algorithm>
#include <omp.h>

namespace py = pybind11;

// Assign every user to its nearest warehouse by straight-line distance over
// the first two coordinate columns. Users are processed in parallel (OpenMP).
//
// user_locs:      2-D array, one row per user; columns 0 and 1 are read.
// warehouse_locs: 2-D array, one row per warehouse; columns 0 and 1 are read.
// capacities:     accepted so the signature stays stable, but NOT consulted.
//                 NOTE(review): capacity limits are not enforced here, unlike
//                 a capacity-aware greedy assignment - confirm this is intended.
//
// Returns a 1-D int array with one warehouse index per user. An entry is -1
// when no warehouse distance compared below the 1e18 sentinel (for instance
// when there are no warehouses at all).
// Throws py::value_error (ValueError in Python) when either array has fewer
// than two columns; unchecked<2>() itself rejects arrays that are not 2-D.
py::array_t<int> fast_assign(py::array_t<double> user_locs,
                            py::array_t<double> warehouse_locs,
                            py::array_t<int> capacities) {
    (void)capacities;  // intentionally unused, see note above

    auto users = user_locs.unchecked<2>();
    auto warehouses = warehouse_locs.unchecked<2>();

    // The loop below reads columns 0 and 1 without bounds checks, so make
    // sure they exist before touching the data.
    if (users.shape(1) < 2 || warehouses.shape(1) < 2) {
        throw py::value_error(
            "user_locs and warehouse_locs must each have at least 2 columns");
    }

    // Keep the row count in the array's own index type instead of narrowing to int.
    const py::ssize_t n_users = users.shape(0);
    const int n_warehouses = static_cast<int>(warehouses.shape(0));

    py::array_t<int> assignments(n_users);
    auto assign_ptr = assignments.mutable_unchecked<1>();

    // Each iteration writes only its own output slot, so iterations are independent.
    #pragma omp parallel for
    for (py::ssize_t i = 0; i < n_users; i++) {
        double min_dist = 1e18;
        int best_w = -1;

        for (int j = 0; j < n_warehouses; j++) {
            double dx = users(i, 0) - warehouses(j, 0);
            double dy = users(i, 1) - warehouses(j, 1);
            double dist = std::sqrt(dx*dx + dy*dy);

            // Strict '<' keeps the lowest warehouse index on equal distances.
            if (dist < min_dist) {
                min_dist = dist;
                best_w = j;
            }
        }
        assign_ptr(i) = best_w;
    }
    return assignments;
}

// Python extension entry point. The module name given here has to match the
// base name of the compiled file (fast_optimizer*.so) for
// `import fast_optimizer` to locate its init function.
PYBIND11_MODULE(fast_optimizer, m) {
    m.doc() = "OpenMP-accelerated nearest-warehouse order assignment";
    // Naming the arguments lets Python callers pass them by keyword and makes
    // help(fast_optimizer.fast_assign) show real parameter names.
    m.def("fast_assign", &fast_assign,
          py::arg("user_locs"), py::arg("warehouse_locs"), py::arg("capacities"),
          "High-performance order assignment");
}

Overwriting src/optimizer/cpp_core/optimizer.cpp


In [20]:
# Confirm that the C++ source file is present on disk
cpp_source_found = os.path.exists("src/optimizer/cpp_core/optimizer.cpp")
print(
    "✅ File created successfully!"
    if cpp_source_found
    else "❌ File still missing. Check if your Drive has space or if the folder name has a typo."
)

✅ File created successfully!


In [21]:
import pybind11

# Get include path
inc = pybind11.get_include()

# Compile the .so file
!g++ -O3 -fopenmp -Wall -shared -std=c++11 -fPIC \
    $(python3 -m pybind11 --includes) \
    src/optimizer/cpp_core/optimizer.cpp \
    -o optimizer$(python3-config --extension-suffix)   

In [22]:
%%writefile src/distributed/map_reduce_ops.py
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import sys
import os

# Ensure the C++ library can be found by workers.
# The directory is derived from this file's own location
# (<root>/src/distributed/ -> <root>/src/optimizer/cpp_core) rather than one
# hard-coded Drive path, and is added to sys.path only once.
_CPP_CORE_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    "optimizer",
    "cpp_core",
)
if _CPP_CORE_DIR not in sys.path:
    sys.path.append(_CPP_CORE_DIR)
import fast_optimizer

def worker_task(chunk_data, warehouse_coords, caps):
    """
    The 'Map' Step: Each worker processes a slice of the data.
    In a real system, this would be a separate EC2 instance.

    Args:
        chunk_data: Coordinate rows of the users in this shard.
        warehouse_coords: Coordinate rows of the warehouses.
        caps: Per-warehouse capacities, forwarded to the C++ kernel.

    Returns:
        dict with "assignments" (warehouse index per user in the shard, -1 when
        none was found), "usage" (users per warehouse in this shard) and
        "count" (number of users in the shard).
    """
    # Use our HPC C++ kernel inside the worker
    assignments = np.asarray(
        fast_optimizer.fast_assign(chunk_data, warehouse_coords, caps)
    )

    # Calculate local metrics to return to the master
    local_count = len(assignments)
    # -1 marks "no warehouse"; np.bincount raises on negative values, so count
    # only the users that actually received a warehouse.
    assigned = assignments[assignments >= 0]
    local_usage = np.bincount(assigned, minlength=len(warehouse_coords))

    return {
        "assignments": assignments,
        "usage": local_usage,
        "count": local_count
    }

def run_distributed_optimizer(df, warehouse_coords, caps, num_workers=4):
    """
    The 'Master' logic: shard the users, fan the shards out to worker
    processes, then merge what the workers return.

    Args:
        df: DataFrame with 'lat' and 'lon' columns, one row per order.
        warehouse_coords: Coordinate rows of the warehouses.
        caps: Per-warehouse capacities, passed through to every worker.
        num_workers: Number of shards and of worker processes.

    Returns:
        (all_assignments, total_usage): a list with one warehouse index per
        row of df, in row order, and an array of per-warehouse totals.
    """
    # 1. Partition the coordinates into one shard per worker
    coords = df[['lat', 'lon']].values.astype(np.float64)
    shards = np.array_split(coords, num_workers)

    # 2. Run the shards in parallel; results are collected in submission order
    with ProcessPoolExecutor(max_workers=num_workers) as pool:
        pending = [
            pool.submit(worker_task, shard, warehouse_coords, caps)
            for shard in shards
        ]
        results = [job.result() for job in pending]

    # 3. Reduce: sum the per-shard usage and concatenate the assignments
    total_usage = np.zeros(len(warehouse_coords))
    all_assignments = []
    for partial in results:
        total_usage += partial["usage"]
        all_assignments.extend(partial["assignments"])

    return all_assignments, total_usage

Overwriting src/distributed/map_reduce_ops.py


In [24]:
import pybind11
import sys

# Get the include paths for both pybind11 and Python itself
pybind_inc = pybind11.get_include()
python_inc = !python3-config --includes

# Join the list into a string
python_inc_str = " ".join(python_inc)

print("Compiling with paths...")
!g++ -O3 -Wall -shared -std=c++11 -fPIC -fopenmp \
    {python_inc_str} \
    -I{pybind_inc} \
    src/optimizer/cpp_core/optimizer.cpp \
    -o src/optimizer/cpp_core/fast_optimizer.so

import os
if os.path.exists("src/optimizer/cpp_core/fast_optimizer.so"):
    print("✅ Compilation Successful! .so file created.")

Compiling with paths...
✅ Compilation Successful! .so file created.


In [25]:
import pandas as pd
import numpy as np
import os

# 1. Reload the data from Phase 1
data_path = "/content/drive/MyDrive/OmniScale-Optimizer/data/processed/clean_data.csv"

if not os.path.exists(data_path):
    # Stop here. Carrying on would either fail with a NameError on `df` or,
    # worse, silently reuse a stale `df` left in the kernel by an earlier cell.
    raise FileNotFoundError("❌ Error: clean_data.csv not found in Drive. Did you run Phase 1?")

df = pd.read_csv(data_path)
print(f"✅ Data loaded: {len(df)} rows.")

# 2. Re-calculate the warehouse locations as the mean lat/lon of each
#    delivery zone stored in the file
warehouse_locations = df.groupby('delivery_zone')[['lat', 'lon']].mean().values
num_warehouses = len(warehouse_locations)
total_orders = len(df)
# Each warehouse may take an even share of the orders plus 10% headroom
capacity_per_warehouse = int((total_orders / num_warehouses) * 1.1)
capacities = [capacity_per_warehouse] * num_warehouses

✅ Data loaded: 100000 rows.


In [26]:
from src.distributed.map_reduce_ops import run_distributed_optimizer
import time

# Cast the inputs to the dtypes the C++ kernel's signature declares
warehouse_coords = warehouse_locations.astype(np.float64)
caps = np.array(capacities, dtype=np.int32)

print("Starting Distributed MapReduce Simulation...")
start = time.time()

# Two workers, chosen to match Colab's standard CPU count
dist_assignments, dist_usage = run_distributed_optimizer(
    df, warehouse_coords, caps, num_workers=2
)

dist_time = time.time() - start
print("✅ Success! Distributed Execution Time: {:.4f} seconds".format(dist_time))
print("Total Orders: {}".format(len(dist_assignments)))

Starting Distributed MapReduce Simulation...
✅ Success! Distributed Execution Time: 0.0606 seconds
Total Orders: 100000
