In [24]:
# !pip install pandas faker

In [3]:
import pandas as pd
import numpy as np
import random
from faker import Faker
import datetime

fake = Faker()

# Number of records to generate
num_records = 500

# --- Sales & NPD Data ---
sales_data = {
    'SalesID': [fake.uuid4() for _ in range(num_records)],
    'CustomerID': [fake.uuid4() for _ in range(num_records)],
    'ProductID': [f'P{random.randint(100, 999)}' for _ in range(num_records)],
    'ProductName': [fake.word().capitalize() + " " + fake.word().capitalize() for _ in range(num_records)],
    'QuotationID': [f'Q{random.randint(1000, 9999)}' if random.random() < 0.8 else None for _ in range(num_records)],
    'SalesOrderDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'InvoiceID': [f'INV{random.randint(10000, 99999)}' for _ in range(num_records)],
    'InvoiceDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'DispatchDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'Quantity': [random.randint(1, 100) for _ in range(num_records)],
    'UnitPrice': [round(random.uniform(10, 1000), 2) for _ in range(num_records)],
    'NPD_Stage': [random.choice(['Idea', 'Prototype', 'Testing', 'Launch', None]) for _ in range(num_records)],
    'NPD_LaunchDate': [fake.date_between(start_date='-1y', end_date='today') if random.random() < 0.2 else None for _ in range(num_records)],
}
sales_df = pd.DataFrame(sales_data)
sales_df.head()

Unnamed: 0,SalesID,CustomerID,ProductID,ProductName,QuotationID,SalesOrderDate,InvoiceID,InvoiceDate,DispatchDate,Quantity,UnitPrice,NPD_Stage,NPD_LaunchDate
0,d8fe705e-6958-40b8-975e-72cacfd5fd34,9e80d4eb-f4f8-4025-bd86-4e4a2ffc18b5,P458,Road Son,Q7428,2024-03-28,INV92139,2024-03-23,2025-03-02,13,965.27,Prototype,
1,fdeff2b2-c963-42c1-b810-46ca2bd16cea,6750d0e7-b6fd-4eab-bf88-69b46b53ca91,P897,Relationship Human,Q6866,2024-12-21,INV41085,2025-03-18,2024-05-13,56,642.29,,
2,493cace6-8556-4c77-9bbd-81c43025efb0,577db725-8c36-4729-b111-602fc1d7ab3c,P650,Someone Star,Q8393,2025-01-31,INV82094,2024-07-06,2024-06-04,33,816.87,,
3,ac6f6b66-083d-4638-8ee7-28955ead3fde,004043b2-32be-449d-889e-fd59fa55cee2,P719,Choose Rich,Q7698,2024-09-26,INV55715,2024-03-25,2024-09-11,33,843.52,,
4,1485f54d-ccf0-4b37-ba4c-efd154597afc,c5064f3c-f484-46d4-9872-5531725b8490,P602,Use Rule,Q4960,2024-06-16,INV31132,2025-02-06,2025-02-04,62,570.48,Testing,


In [4]:
# --- Planning Data ---
planning_data = {
    'PlanID': [fake.uuid4() for _ in range(num_records)],
    'ProductID': [f'P{random.randint(100, 999)}' for _ in range(num_records)],
    'ForecastDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'ForecastQuantity': [random.randint(50, 500) for _ in range(num_records)],
    'MaterialPlanDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'ProductionScheduleDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
}
planning_df = pd.DataFrame(planning_data)
planning_df.head()

Unnamed: 0,PlanID,ProductID,ForecastDate,ForecastQuantity,MaterialPlanDate,ProductionScheduleDate
0,81ae78f0-4a43-4078-8436-cc4a693ba274,P796,2024-10-31,308,2025-01-08,2024-11-05
1,249513c0-c9d8-47af-8311-bf4e889ac0e6,P181,2024-04-12,127,2025-03-07,2024-04-11
2,f30e81d5-79aa-472e-920b-41abcbce6e72,P606,2025-03-16,273,2025-03-08,2024-05-18
3,7ebf2ac6-d29a-466b-aeaa-b7534e7dc8ea,P825,2025-01-03,175,2024-06-24,2024-04-18
4,c767cbf6-6257-491a-88fd-35517d8a4460,P996,2024-12-23,100,2024-10-16,2024-09-11


In [7]:
# set(sales_df['ProductID']).intersection(planning_df['ProductID'])

In [5]:
# --- Purchase Data ---
purchase_data = {
    'PurchaseOrderID': [fake.uuid4() for _ in range(num_records)],
    'SupplierID': [fake.uuid4() for _ in range(num_records)],
    'MaterialID': [f'M{random.randint(100, 999)}' for _ in range(num_records)],
    'PurchaseOrderDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'Quantity': [random.randint(100, 1000) for _ in range(num_records)],
    'UnitPrice': [round(random.uniform(5, 500), 2) for _ in range(num_records)],
}
purchase_df = pd.DataFrame(purchase_data)
purchase_df.head()

Unnamed: 0,PurchaseOrderID,SupplierID,MaterialID,PurchaseOrderDate,Quantity,UnitPrice
0,3c579356-27c6-4ec7-bbd0-d8d31d3d8e1f,659d3eec-13f7-46a8-9f9d-dc2ea5c4165e,M980,2025-03-21,507,327.48
1,427f0e65-09a7-4712-bb3e-9101a12a61e9,5a9ba53b-fcbb-4e46-9eff-ee1357e708d9,M198,2024-06-27,987,415.4
2,9f27f907-475b-4009-b98c-af1ad2983246,25a5ad4d-1cf3-483a-a2ff-9e86e01b4803,M893,2025-03-07,480,49.98
3,1656fe13-7a10-467e-a1aa-2ea122b18822,5a690d03-ae2e-489f-a262-ef1fb19810b5,M576,2024-08-05,260,489.33
4,744e91cb-5369-4e16-9d5f-f447fbd0b61b,38cb0c1d-feef-4fc6-9471-ea1f4b0824c3,M390,2024-10-14,705,391.84


In [8]:
# --- Stores Data ---
stores_data = {
    'StoreID': [fake.uuid4() for _ in range(num_records)],
    'MaterialID': [f'M{random.randint(100, 999)}' for _ in range(num_records)],
    'StockLevel': [random.randint(0, 5000) for _ in range(num_records)],
    'MaterialMovementDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'MovementType': [random.choice(['In', 'Out']) for _ in range(num_records)],
    'MovementQuantity': [random.randint(1, 1000) for _ in range(num_records)],
}
stores_df = pd.DataFrame(stores_data)
stores_df.head()

Unnamed: 0,StoreID,MaterialID,StockLevel,MaterialMovementDate,MovementType,MovementQuantity
0,c5b9be5d-bc10-4262-9708-42a3fa5f2e4f,M780,3354,2024-10-12,Out,294
1,8bdae318-344c-4fd1-b610-72aa200e882c,M138,1131,2024-10-23,In,467
2,40c82d1a-d72c-4680-900f-61b29b1c7492,M159,277,2024-09-01,In,769
3,6ad64c48-8740-4623-91a4-82f652c33d4c,M865,130,2024-06-06,Out,398
4,e70679f4-8302-4939-92ed-b0982e685ef3,M673,807,2024-05-22,In,236


In [10]:
# set(purchase_df['MaterialID']).intersection(set(stores_df['MaterialID']))

In [11]:
# --- Production Data ---
production_data = {
    'ProductionID': [fake.uuid4() for _ in range(num_records)],
    'ProductID': [f'P{random.randint(100, 999)}' for _ in range(num_records)],
    'MaterialID': [f'M{random.randint(100, 999)}' for _ in range(num_records)],
    'ProductionDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'JobWorkID': [f'JW{random.randint(1000, 9999)}' if random.random() < 0.5 else None for _ in range(num_records)],
    'RawMaterialConsumption': [random.randint(1, 50) for _ in range(num_records)],
    'ProductionQuantity': [random.randint(1, 200) for _ in range(num_records)]
}
production_df = pd.DataFrame(production_data)
production_df.head()

Unnamed: 0,ProductionID,ProductID,MaterialID,ProductionDate,JobWorkID,RawMaterialConsumption,ProductionQuantity
0,a2dddb54-9f5d-4d4c-bccf-1cd129f422bc,P536,M382,2024-03-27,,10,174
1,91ed0191-9dbe-4742-ab2f-f3b499627f2c,P809,M407,2025-02-21,,40,157
2,54bec732-e8f6-4d87-8acc-adde12e66ca5,P683,M494,2024-12-09,,16,4
3,83145984-2888-4680-972a-ec3b470534ed,P702,M494,2024-05-08,,40,135
4,5cadcab6-4cf3-4044-95de-1056bb934833,P715,M961,2024-05-30,JW3624,41,17


In [14]:
# --- Maintenance Data ---
maintenance_data = {
    'MaintenanceID': [fake.uuid4() for _ in range(num_records)],
    'MachineID': [f'MC{random.randint(100, 999)}' for _ in range(num_records)],
    'MaintenanceDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'MaintenanceType': [random.choice(['Preventive', 'Corrective']) for _ in range(num_records)],
    'MaintenanceCost': [round(random.uniform(100, 5000), 2) for _ in range(num_records)],
}
maintenance_df = pd.DataFrame(maintenance_data)
maintenance_df.head()

Unnamed: 0,MaintenanceID,MachineID,MaintenanceDate,MaintenanceType,MaintenanceCost
0,90fe7a39-4362-45be-b847-8c175d0300f0,MC446,2024-05-07,Corrective,1746.11
1,742edd0d-6634-4810-a6b0-ae9345f0ebcf,MC222,2024-07-09,Preventive,1271.88
2,533d1980-64d7-405d-b4a2-bb75b05b4127,MC563,2024-11-12,Preventive,4866.47
3,6f600da4-f580-47b6-8ac2-8abd6f808c7b,MC546,2024-06-09,Preventive,4912.82
4,beefccbb-904e-406a-9fe3-0807f24e9849,MC404,2024-08-03,Corrective,2128.16


In [15]:
# --- Quality Data ---
quality_data = {
    'QualityID': [fake.uuid4() for _ in range(num_records)],
    'MaterialID': [f'M{random.randint(100, 999)}' for _ in range(num_records)],
    'InspectionDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'InspectionResult': [random.choice(['Pass', 'Fail']) for _ in range(num_records)],
    'ValidationDate': [fake.date_between(start_date='-1y', end_date='today') if random.random() < 0.7 else None for _ in range(num_records)]
}
quality_df = pd.DataFrame(quality_data)
quality_df.head()

Unnamed: 0,QualityID,MaterialID,InspectionDate,InspectionResult,ValidationDate
0,3b22f9a7-aa2f-4bbe-96e7-50a4f5f5c221,M346,2024-05-18,Fail,2024-10-16
1,52aa6c25-b04a-4153-85b3-501df6f9fc09,M892,2024-08-09,Pass,2025-02-04
2,2cf571ef-a872-4a84-9bc7-abf9e2150a7f,M850,2024-09-08,Pass,2024-12-23
3,fc377327-8f07-45fd-a1ea-f0b975e358e8,M724,2024-12-22,Pass,
4,8862bb93-db37-47dd-9034-bcf68da9af51,M887,2024-10-14,Fail,2024-06-21


In [16]:
# --- Dispatch & Logistics Data ---
dispatch_data = {
    'DispatchID': [fake.uuid4() for _ in range(num_records)],
    'SalesID': [sales_df['SalesID'].sample(n=1).iloc[0] for _ in range(num_records)],
    'TransportPartner': [fake.company() for _ in range(num_records)],
    'DeliveryDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'TrackingNumber': [f'TN{random.randint(100000, 999999)}' for _ in range(num_records)],
}
dispatch_df = pd.DataFrame(dispatch_data)
dispatch_df.head()

Unnamed: 0,DispatchID,SalesID,TransportPartner,DeliveryDate,TrackingNumber
0,2cb9467b-783e-458c-af34-1fee926bb699,4adbe0f8-e7fd-48df-a4f2-28b57e278cb1,Mccall PLC,2025-03-05,TN730052
1,0555c221-48be-4738-8a1c-216ad3101eb3,dc515de9-70ae-4c47-b80b-d5a979f5e184,Klein LLC,2024-11-07,TN140185
2,67dafed7-e88e-4caa-92be-44926239176d,24578744-7fbd-46df-a598-ade630b628e1,Lawrence-Stafford,2025-03-10,TN171248
3,2aec14c8-7ada-4a82-88cd-ea22fab79315,f22a5964-968c-442b-9d08-7c99816b7bb4,Flores Inc,2025-03-09,TN574454
4,f5b0ab73-e240-4a71-9b56-2ea1481b1ae8,ccb18f8c-6c25-4412-b944-96310b852d77,Mcfarland Inc,2024-05-16,TN450387


In [17]:
# --- HR & Admin Data ---
hr_data = {
    'EmployeeID': [fake.uuid4() for _ in range(num_records)],
    'EmployeeName': [fake.name() for _ in range(num_records)],
    'HireDate': [fake.date_between(start_date='-5y', end_date='-1y') for _ in range(num_records)],
    'Department': [random.choice(['Sales', 'Planning', 'Purchase', 'Stores', 'Production', 'Maintenance', 'Quality', 'Logistics', 'HR', 'Finance']) for _ in range(num_records)],
    'Salary': [round(random.uniform(30000, 150000), 2) for _ in range(num_records)],
    'PayrollDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
}
hr_df = pd.DataFrame(hr_data)
hr_df.head()

Unnamed: 0,EmployeeID,EmployeeName,HireDate,Department,Salary,PayrollDate
0,c0da5043-2270-4891-aec1-3a61dbe50d1f,Tammy Taylor,2023-09-25,Stores,147740.91,2024-12-02
1,29ec23ed-1d70-44c5-aeca-d06e775d14ea,Jennifer Skinner,2020-07-12,Purchase,123698.06,2024-11-06
2,bbc5ef78-c245-477e-9cf6-48c80b54ea5a,Emily Hart,2023-07-14,Stores,41618.35,2025-02-22
3,e2a61566-bc2f-4d44-830d-d3e0ad721a38,Katherine Riddle,2024-03-11,Finance,67994.6,2024-08-18
4,89c7bf24-d74e-45d0-a45c-9c6298175483,Tonya Pham,2022-08-31,Planning,123306.58,2024-07-14


In [18]:
# --- Accounts & Finance Data ---
finance_data = {
    'TransactionID': [fake.uuid4() for _ in range(num_records)],
    'AccountID': [fake.uuid4() for _ in range(num_records)],
    'TransactionDate': [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_records)],
    'TransactionType': [random.choice(['Credit', 'Debit']) for _ in range(num_records)],
    'Amount': [round(random.uniform(500, 100000), 2) for _ in range(num_records)],
    'GST_Compliance': [random.choice([True, False]) for _ in range(num_records)],
}
finance_df = pd.DataFrame(finance_data)
finance_df.head()

Unnamed: 0,TransactionID,AccountID,TransactionDate,TransactionType,Amount,GST_Compliance
0,6c9be132-6468-4245-b755-c97fb8dbc5c5,45d6f1cb-f9e7-41ad-b200-d4e252c894da,2024-06-02,Debit,70388.99,False
1,5c76db1d-422c-4d16-a5c3-63a4387e6b20,189ec596-4219-4080-be06-2c5b6500217a,2024-09-29,Debit,63972.29,False
2,0f8f263d-1768-402b-be2f-85dfb0866826,7b3ef9cd-b48b-49b2-9722-11582463e4b9,2024-07-08,Debit,20186.75,True
3,f91aa09c-f0c3-4b48-a661-b8d16b5d5c43,e23b32a1-3a0c-49dc-b940-beb78c5d9f83,2024-06-11,Credit,51241.44,True
4,838515ef-a09c-4ff7-9051-fd8c40ce19e5,d895b7fc-eaee-4867-9433-a3c4bf24504d,2024-08-13,Credit,4265.19,False


In [19]:
# --- Settings Data ---
settings_data = {
    'SettingID': [fake.uuid4() for _ in range(10)],
    'SettingName': [random.choice(['Tax Rate', 'Discount Policy', 'Currency', 'Payment Terms', 'Return Policy']) for _ in range(10)],
    'SettingValue': [random.choice(['5%', '10%', 'USD', 'Net 30', 'No Returns']) for _ in range(10)],
    'LastUpdated': [fake.date_between(start_date='-1y', end_date='today') for _ in range(10)],
}
settings_df = pd.DataFrame(settings_data)
settings_df.head()

Unnamed: 0,SettingID,SettingName,SettingValue,LastUpdated
0,fc2268ac-c3d7-43ad-a8e6-fd622f28790e,Payment Terms,No Returns,2024-11-19
1,92f197d8-800c-4dc0-a646-510043f14c89,Payment Terms,Net 30,2024-04-13
2,e89d6bd2-46b0-4461-a91f-218c928dd009,Discount Policy,10%,2024-10-05
3,6b190a14-cd93-45ed-86b0-df5bc62b9ba4,Discount Policy,No Returns,2024-10-07
4,8d5519c1-acd7-4204-ab05-975c72794714,Return Policy,USD,2025-03-04


In [22]:
# Number of employees
num_employees = 100

# Generate employee data
employee_data = {
    'EmployeeID': [fake.uuid4() for _ in range(num_employees)],
    'EmployeeName': [fake.name() for _ in range(num_employees)],
    'Department': [random.choice(['Sales', 'Planning', 'Purchase', 'Stores', 'Production', 'Maintenance', 'Quality', 'Dispatch', 'HR', 'Accounts', 'Settings']) for _ in range(num_employees)],
    'JobTitle': [fake.job() for _ in range(num_employees)],
    'AccessLevel': [random.choice(['Full', 'Limited', 'None']) for _ in range(num_employees)],
    'SalesAccess': [random.choice([True, False]) if random.choice(['Sales', 'Accounts']) == random.choice(['Sales', 'Accounts']) else False for _ in range(num_employees)],
    'PlanningAccess': [random.choice([True, False]) if random.choice(['Planning', 'Accounts']) == random.choice(['Planning', 'Accounts']) else False for _ in range(num_employees)],
    'PurchaseAccess': [random.choice([True, False]) if random.choice(['Purchase', 'Accounts']) == random.choice(['Purchase', 'Accounts']) else False for _ in range(num_employees)],
    'StoresAccess': [random.choice([True, False]) if random.choice(['Stores', 'Production']) == random.choice(['Stores', 'Production']) else False for _ in range(num_employees)],
    'ProductionAccess': [random.choice([True, False]) if random.choice(['Production', 'Stores']) == random.choice(['Production', 'Stores']) else False for _ in range(num_employees)],
    'MaintenanceAccess': [random.choice([True, False]) if random.choice(['Maintenance', 'Quality']) == random.choice(['Maintenance', 'Quality']) else False for _ in range(num_employees)],
    'QualityAccess': [random.choice([True, False]) if random.choice(['Quality', 'Maintenance']) == random.choice(['Quality', 'Maintenance']) else False for _ in range(num_employees)],
    'DispatchAccess': [random.choice([True, False]) if random.choice(['Dispatch', 'Sales']) == random.choice(['Dispatch', 'Sales']) else False for _ in range(num_employees)],
    'HRAccess': [random.choice([True, False]) if random.choice(['HR', 'Accounts', 'Settings']) == random.choice(['HR', 'Accounts', 'Settings']) else False for _ in range(num_employees)],
    'AccountsAccess': [random.choice([True, False]) if random.choice(['Accounts', 'Settings']) == random.choice(['Accounts', 'Settings']) else False for _ in range(num_employees)],
    'SettingsAccess': [random.choice([True, False]) if random.choice(['Settings', 'Accounts']) == random.choice(['Settings', 'Accounts']) else False for _ in range(num_employees)],
}

employee_df = pd.DataFrame(employee_data)
employee_df.head()

Unnamed: 0,EmployeeID,EmployeeName,Department,JobTitle,AccessLevel,SalesAccess,PlanningAccess,PurchaseAccess,StoresAccess,ProductionAccess,MaintenanceAccess,QualityAccess,DispatchAccess,HRAccess,AccountsAccess,SettingsAccess
0,af2af254-7906-4bcc-9f48-017d0272565e,Richard Davis,Accounts,"Programmer, systems",,True,True,False,False,False,False,False,False,True,True,True
1,1902f73b-5030-4843-b2c2-7222e6937975,Bianca Garrett,Settings,Physiological scientist,Full,False,False,False,False,True,False,True,False,False,False,True
2,e6215472-f44d-408f-843f-cecc24286407,Melinda Thomas,Dispatch,"Librarian, academic",,False,False,True,False,False,False,False,False,False,False,False
3,d629dacb-dccc-4b4a-b310-b2ce96ae03cc,Kenneth Lowe V,Planning,"Doctor, general practice",Full,False,False,False,False,True,False,False,False,False,True,True
4,a3237c33-229e-4d4b-9fe8-8b2f3f2b3d04,Jessica Johnson,Sales,Personal assistant,Limited,False,False,False,False,True,False,False,False,False,False,False


In [None]:
# Save to CSV files
sales_df.to_csv('../db/sales_data.csv', index=False)
planning_df.to_csv('../db/planning_data.csv', index=False)
purchase_df.to_csv('../db/purchase_data.csv', index=False)
stores_df.to_csv('../db/stores_data.csv', index=False)
production_df.to_csv('../db/production_data.csv', index=False)
maintenance_df.to_csv('../db/maintenance_data.csv', index=False)
quality_df.to_csv('../db/quality_data.csv', index=False)
dispatch_df.to_csv('../db/dispatch_data.csv', index=False)
hr_df.to_csv('../db/hr_data.csv', index=False)
finance_df.to_csv('../db/finance_data.csv', index=False)
settings_df.to_csv('../db/settings_data.csv', index=False)
employee_df.to_csv('../db/employees_data.csv', index=False)

print("Sample data generated and saved successfully.")

Sample data generated and saved successfully.


In [None]:
import os
import glob
import pymongo
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

# Connect to MongoDB
client = pymongo.MongoClient(os.environ['MONGO_URI'])
db = client["hack_ai_thon"]

# List of CSV files and their corresponding MongoDB collections
csv_files = {
    "sales_data.csv": "Sales_NPD",
    "planning_data.csv": "Planning",
    "purchase_data.csv": "Purchase",
    "stores_data.csv": "Stores",
    "production_data.csv": "Production",
    "maintenance_data.csv": "Maintenance",
    "quality_data.csv": "Quality",
    "dispatch_data.csv": "Dispatch_Logistics",
    "hr_data.csv": "HR_Admin",
    "finance_data.csv": "Accounts_Finance",
    "settings_data.csv": "Settings",
    "employees_data.csv" : "Employees"
}

In [None]:
# Function to upload CSV data to MongoDB
def upload_to_mongodb(csv_filename, collection_name):
    collection = db[collection_name] 
    df = pd.read_csv(csv_filename)
    data = df.to_dict(orient="records")
    collection.insert_many(data)
    print(f"Uploaded {len(data)} records to {collection_name}")

for path in glob.glob("db/*.csv"):
    upload_to_mongodb(path, csv_files[path.split('\\')[-1]])
print("All data uploaded successfully to MongoDB.")

Uploaded 500 records to Dispatch_Logistics
Uploaded 500 records to Accounts_Finance
Uploaded 500 records to HR_Admin
Uploaded 500 records to Maintenance
Uploaded 500 records to Planning
Uploaded 500 records to Production
Uploaded 500 records to Purchase
Uploaded 500 records to Quality
Uploaded 500 records to Sales_NPD
Uploaded 10 records to Settings
Uploaded 500 records to Stores
All data uploaded successfully to MongoDB.


In [None]:
# !pip install nltk pdfplumber faiss-cpu sentence-transformers

In [18]:
import re
import faiss
import pdfplumber
from nltk.stem import WordNetLemmatizer
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.docstore import InMemoryDocstore
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from langchain.text_splitter import SentenceTransformersTokenTextSplitter

model = SentenceTransformer('all-MiniLM-L6-v2')
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in ENGLISH_STOP_WORDS]
    return ' '.join(tokens)


def gen_embeddings(texts):
    return model.encode(texts)

In [20]:
pdf = pdfplumber.open("../db/Sample-Code-of-Conduct-for-Small-and-Medium-Enterprises.pdf")

text = ""
for page in pdf.pages:
    text += page.extract_text() + "\n"

text_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=50, tokens_per_chunk=384)
text_chunks = text_splitter.split_text(text)

embeddings = gen_embeddings(text_chunks)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

documents = [
    Document(page_content=text)
    for text in text_chunks
]
docstore_dict = {str(idx): doc for idx, doc in enumerate(documents)}
docstore = InMemoryDocstore(docstore_dict)
index_to_docstore_id = {i: str(i) for i in range(len(documents))}

vector_store = FAISS(embedding_function=gen_embeddings, index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id)
retriever = vector_store.as_retriever()

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [25]:
import pandas as pd

employee_df = pd.read_csv("../db/employees_data.csv")
employee_df.head()

Unnamed: 0,EmployeeID,EmployeeName,Department,JobTitle,AccessLevel,SalesAccess,PlanningAccess,PurchaseAccess,StoresAccess,ProductionAccess,MaintenanceAccess,QualityAccess,DispatchAccess,HRAccess,AccountsAccess,SettingsAccess
0,af2af254-7906-4bcc-9f48-017d0272565e,Richard Davis,Accounts,"Programmer, systems",,True,True,False,False,False,False,False,False,True,True,True
1,1902f73b-5030-4843-b2c2-7222e6937975,Bianca Garrett,Settings,Physiological scientist,Full,False,False,False,False,True,False,True,False,False,False,True
2,e6215472-f44d-408f-843f-cecc24286407,Melinda Thomas,Dispatch,"Librarian, academic",,False,False,True,False,False,False,False,False,False,False,False
3,d629dacb-dccc-4b4a-b310-b2ce96ae03cc,Kenneth Lowe V,Planning,"Doctor, general practice",Full,False,False,False,False,True,False,False,False,False,True,True
4,a3237c33-229e-4d4b-9fe8-8b2f3f2b3d04,Jessica Johnson,Sales,Personal assistant,Limited,False,False,False,False,True,False,False,False,False,False,False


In [26]:
import random

employees_split = employee_df['EmployeeName'].str.split()
emails = [".".join(split[:2]).lower() + "@gmail.com" for split in employees_split]
passwords = [str(random.randrange(100000, 999999)) for _ in range(len(employee_df))]
employee_df['EmployeeEmail'] = emails
employee_df['EmployeePassword'] = passwords

In [27]:
employee_df.head()

Unnamed: 0,EmployeeID,EmployeeName,Department,JobTitle,AccessLevel,SalesAccess,PlanningAccess,PurchaseAccess,StoresAccess,ProductionAccess,MaintenanceAccess,QualityAccess,DispatchAccess,HRAccess,AccountsAccess,SettingsAccess,EmployeeEmail,EmployeePassword
0,af2af254-7906-4bcc-9f48-017d0272565e,Richard Davis,Accounts,"Programmer, systems",,True,True,False,False,False,False,False,False,True,True,True,richard.davis@gmail.com,534199
1,1902f73b-5030-4843-b2c2-7222e6937975,Bianca Garrett,Settings,Physiological scientist,Full,False,False,False,False,True,False,True,False,False,False,True,bianca.garrett@gmail.com,251575
2,e6215472-f44d-408f-843f-cecc24286407,Melinda Thomas,Dispatch,"Librarian, academic",,False,False,True,False,False,False,False,False,False,False,False,melinda.thomas@gmail.com,442853
3,d629dacb-dccc-4b4a-b310-b2ce96ae03cc,Kenneth Lowe V,Planning,"Doctor, general practice",Full,False,False,False,False,True,False,False,False,False,True,True,kenneth.lowe@gmail.com,487852
4,a3237c33-229e-4d4b-9fe8-8b2f3f2b3d04,Jessica Johnson,Sales,Personal assistant,Limited,False,False,False,False,True,False,False,False,False,False,False,jessica.johnson@gmail.com,649565


In [28]:
employee_df.to_csv("../db/employees_data_with_password.csv", index=False)

In [12]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient

load_dotenv()

mongo_client = MongoClient(os.environ.get("MONGO_URI"))
mongo_db = mongo_client['hack_ai_thon']
collection = mongo_db['Employees']

In [30]:
import requests

BASE_URL = "http://127.0.0.1:5000"

def create_user(user):
    url = f"{BASE_URL}/sign-up"
    response = requests.post(url, json=user)
    if response.status_code == 200:
        return response.json()
    else:
        return {"error": f"Failed to create user: {response.text}"}


employee_df.fillna(value='None', inplace=True)
for idx, emp in employee_df.iterrows():
    user_response = create_user(emp.to_dict())
    print(user_response)

{'data': {'session': {'created_at': '2025-03-23 04:09:35.439103', 'expires_at': '2025-03-24 04:09:35.439103', 'session_token': '819d585c01ff6a2da056ac45d7c774fff55dd93ce0cb05ecfc18e264c42dd1c8'}}, 'message': 'Session created successfully'}
{'data': {'session': {'created_at': '2025-03-23 04:09:35.712172', 'expires_at': '2025-03-24 04:09:35.712172', 'session_token': '086fe49ec76f985d42f18e7c79197fbb46f9bc9d3cb7bf7bb08c37cf71f17ed7'}}, 'message': 'Session created successfully'}
{'data': {'session': {'created_at': '2025-03-23 04:09:35.980274', 'expires_at': '2025-03-24 04:09:35.980274', 'session_token': 'ce3711ad88ab528ff787614a7eede1a93026bff06201af30684c05fbf4eee37b'}}, 'message': 'Session created successfully'}
{'data': {'session': {'created_at': '2025-03-23 04:09:36.261296', 'expires_at': '2025-03-24 04:09:36.261296', 'session_token': '0309606ddcbf546af15587dd668c0c33253400d6ba35c2d6ed20bca4e0507411'}}, 'message': 'Session created successfully'}
{'data': {'session': {'created_at': '202