In [27]:
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2 #install psycopg2-binary if linux
from sqlalchemy import create_engine

import faker
from tqdm import tqdm
from datetime import datetime

In [28]:
db_url = "postgresql+psycopg2://hermann:1234@156.67.29.207:5432/promodis_row_data"
engine = create_engine(db_url)

### Générer les données des produits

In [29]:
name_product = ['Bridel lait instant', 'lait tino en boite', 'Bridel lait uth', 'Ndolo', 
               'Tampico citrus', 'Tampico island', 'Cafe pele', 'Cafe coa energy',
               'Cremica butter', 'Cuetara', 'Lays Saled', 'Pringles BBQ', 'cornichons 700g',
               'cornichons 300 g', 'Extra abricot', 'Extra fraise']

cat_products = {
    "produits frais":{
        "produits laitiers" : ["lait en poudre", "lait liquide"],
        "boissons": ["jus", "boissons chaudes"]
    },
    "produit secs":{
        "biscuits": ["cookies", "croustillants"],
        "produits conserve": ["Cornichons", "confiture"]
    }
}

data = []
productID = 100

for category, sub_level1 in cat_products.items():
    for sub_cat1, sub_level2 in sub_level1.items():
        for sub_cat2 in sub_level2:
            for i in range(2):
                product = name_product.pop(0)
                data.append(
                    {
                        "productID": f"pr{productID}",
                        "category": category,
                        "subCategory1": sub_cat1,
                        "subCategory2": sub_cat2, 
                        "productName": product,
                        "unitPrice": random.randint(100, 5000)
                    }
                )

                productID+=1

df_product = pd.DataFrame(data)
df_product

Unnamed: 0,productID,category,subCategory1,subCategory2,productName,unitPrice
0,pr100,produits frais,produits laitiers,lait en poudre,Bridel lait instant,576
1,pr101,produits frais,produits laitiers,lait en poudre,lait tino en boite,4155
2,pr102,produits frais,produits laitiers,lait liquide,Bridel lait uth,1061
3,pr103,produits frais,produits laitiers,lait liquide,Ndolo,3502
4,pr104,produits frais,boissons,jus,Tampico citrus,3240
5,pr105,produits frais,boissons,jus,Tampico island,2639
6,pr106,produits frais,boissons,boissons chaudes,Cafe pele,4020
7,pr107,produits frais,boissons,boissons chaudes,Cafe coa energy,1861
8,pr108,produit secs,biscuits,cookies,Cremica butter,3397
9,pr109,produit secs,biscuits,cookies,Cuetara,822


In [30]:
df_product.to_csv("../Data/product.csv", index=False)

### Localisation

In [31]:
location_data = []

location = {
    "littoral": {
        "Douala" : ["Bonapriso", "Akwa", "Bonanjo", "Deïdo", "Bali", "Bépanda", "Makepe", "Ndogbong", "Bonamoussadi", "New-Bell"],
    },
    "Ouest":{
        "Bafoussam": ["bafoussam"],
        "Bafang": ["bafang"],
        "Foumban": ["foumban"],
        "Dschang": ["dschang"],
        "Bamenda": ["bamenda"],
        "Bansoa": ["bansoa"],
        "MB'DA": ["mb'da"],
    },
    "Centre":{
        "Yaoundé": ["Bastos", "Etoudi", "Ngousso", "Mvog-Mbi", "Emana", "Essos", "Melen", "Nlongkak", "Ngoa-Ekelle","Mokolo"],
        "Makenene": ["makenene"],
    },
} 

locationID = 100

for region, elm1 in location.items():
    for city, elm2 in elm1.items():
        for district in elm2:
            location_data.append({
                "locationID": f"lc{locationID}",
                "region": region,
                "city": city,
                "district": district
            })
            locationID+=1

df_location = pd.DataFrame(location_data)

df_location.to_csv("../Data/location.csv", index=False)

In [32]:
df_location.head()

Unnamed: 0,locationID,region,city,district
0,lc100,littoral,Douala,Bonapriso
1,lc101,littoral,Douala,Akwa
2,lc102,littoral,Douala,Bonanjo
3,lc103,littoral,Douala,Deïdo
4,lc104,littoral,Douala,Bali


In [33]:
fake = faker.Faker()

### Génération clients

In [34]:
client_data = []
location_id = list(df_location.locationID)

cat_client = ["DEPOT", "Prestige", "porte à porte", "Ecole", "Grossiste", "Divers"]
id=100

for i in tqdm(range(500)):
    clientID = f"cl{id}"
    client_name = fake.company()
    category = random.choices(cat_client, weights=[0.25, 0.35, 0.025, 0.1, 0.175, 0.1])[0]
    client_location = random.choice(location_id)

    id+=1
    client_data.append({
        "clientID": clientID,
        "clientName": client_name,
        "category": category,
        "location": client_location
    })

df_client = pd.DataFrame(client_data)
df_client.head()

df_client.to_csv("../Data/client.csv", index=False)

100%|██████████| 500/500 [00:00<00:00, 2148.37it/s]


### time

In [35]:
time_data = []
timeID=100
start_date="2022-01-01"
end_date="2024-12-31"
date_range = pd.date_range(start=start_date, end=end_date, freq="D")

for date in date_range:
    month = date.month
    year = date.year

    time_data.append({
        "DateID": f"tt{timeID}",
        "date": date,
        "month": month,
        "year": year
    })

    timeID+=1

df_time = pd.DataFrame(time_data)
df_time.head()


Unnamed: 0,DateID,date,month,year
0,tt100,2022-01-01,1,2022
1,tt101,2022-01-02,1,2022
2,tt102,2022-01-03,1,2022
3,tt103,2022-01-04,1,2022
4,tt104,2022-01-05,1,2022


In [36]:
df_time.to_csv("../Data/time.csv", index=False)

### Usine

In [37]:
usine = {
    "usineID": ["us100", "us101"],
    "usineName": ["usine_deido", "usine_logbessou"],
    "location": ["Douala, deido", "Douala, logbessou"]
}

df_usine = pd.DataFrame(usine)
df_usine.to_csv("../Data/usine.csv", index=False)

### consommation matière prémière

In [38]:
MP_data = []

MPID = 100
MPNames = ["ACIDE ASCORBIQUE", "BASE CITRUS PUNCH", "BASE ISLAND PUNCH", "BASE KIWI GUAVA", 
        "BASE MANGO PUNCH", "BASE POMME PUNCH", "BASE TROPICAL BERRY / FRUIT ROUGE",
        "XANTHAN GUM", "POTASSIUM BENZOATE", "POTASSIUM SORBATE", "SUCRE", "SACS DE LAIT SANS CORPS"]

for MPName in MPNames:
    MP_data.append({
        "MPID": f"MP{MPID}",
        "MPName": MPName
    })
    MPID += 1
df_MP = pd.DataFrame(MP_data)
df_MP.head()

Unnamed: 0,MPID,MPName
0,MP100,ACIDE ASCORBIQUE
1,MP101,BASE CITRUS PUNCH
2,MP102,BASE ISLAND PUNCH
3,MP103,BASE KIWI GUAVA
4,MP104,BASE MANGO PUNCH


In [39]:
df_MP.to_csv("../Data/MP.csv", index=False)

### ventes

In [40]:
vente_data = []

venteID = 100
product_id = list(df_product.productID)
client_id = list(df_client.clientID)
dates = list(df_time.date)

for i in tqdm(range(1000000)):
    date = random.choice(dates)
    salesVolume = random.randint(10, 200)
    productID = random.choice(product_id)
    clientID = random.choice(client_id)
    salesDate = random.choice(dates)
    salesAmount = df_product[df_product["productID"] == productID].unitPrice.iloc[0] * salesVolume
    locationID = df_client[df_client["clientID"] == clientID].location.iloc[0]

    vente_data.append({
        "venteID": venteID,
        "productID": productID,
        "clientID": clientID,
        "salesDate": salesDate,
        "salesVolume": salesVolume,
        "salesAmount": salesAmount,
        "locationID": locationID
    })
    venteID+=1

df_ventes = pd.DataFrame(vente_data)
df_ventes.head()

  0%|          | 0/1000000 [00:00<?, ?it/s]

100%|██████████| 1000000/1000000 [27:51<00:00, 598.35it/s] 


Unnamed: 0,venteID,productID,clientID,salesDate,salesVolume,salesAmount,locationID
0,100,pr106,cl452,2023-09-21,17537,70498740,lc100
1,101,pr103,cl131,2022-08-15,8061,28229622,lc127
2,102,pr103,cl388,2023-04-05,17909,62717318,lc121
3,103,pr110,cl525,2024-01-26,11208,43442208,lc112
4,104,pr106,cl502,2022-12-14,3693,14845860,lc104


In [41]:
df_ventes.to_csv("../Data/ventes.csv", index=False)

### distribution

In [51]:
distribution_data = []

distributionID = 100

for i in tqdm(range(10000)):
    date = random.choice(dates)
    productID = random.choice(product_id)
    clientID = random.choice(client_id)
    quantity = random.randint(1, 500)
    locationID = df_client[df_client["clientID"] == clientID].location.iloc[0]

    distribution_data.append({
        "distributionID" : distributionID,
        "productID": productID,
        "clientID": clientID,
        "distributionDate": date,
        "quantity": quantity,
        "locationID": locationID,
    })
    distributionID += 1

df_distribution = pd.DataFrame(distribution_data)
df_distribution.head()

100%|██████████| 10000/10000 [00:05<00:00, 1885.45it/s]


Unnamed: 0,distributionID,productID,clientID,distributionDate,quantity,locationID
0,100,pr110,cl385,2024-09-29,462,lc110
1,101,pr114,cl102,2023-06-10,148,lc118
2,102,pr101,cl345,2022-10-14,413,lc113
3,103,pr110,cl134,2022-05-05,370,lc121
4,104,pr106,cl538,2024-12-28,307,lc108


In [52]:
df_distribution.to_csv("../Data/distribution.csv", index=False)

### Fabrication

In [44]:
# Initialisation des listes de base
fabrication_data = []

fabricationID = 100
produit_id = list(df_product.productID)
usine_id = list(df_usine.usineID)  # Exemple d'ID pour les usines
mp_id = list(df_MP.MPID) # Exemple d'ID pour les matières premières
dates = list(df_time.date)

# Génération des données de fabrication
for i in tqdm(range(100000)):
    productID = random.choice(produit_id)
    usineID = random.choice(usine_id)
    MPID = random.choice(mp_id)
    date_production = random.choice(dates)
    
    # Quantité produite et conditionnée avec écarts de production et de conditionnement
    quantite_produite = random.randint(1000, 20000)
    quantite_conditionnee = random.randint(quantite_produite - 500, quantite_produite)  # Ex: conditionnement legèrement inferieur ou egal
    ecart_fabrication = quantite_produite - quantite_conditionnee  # Difference entre production et conditionnement
    ecart_conditionnement = random.randint(0, 100)  # ecart possible de conditionnement
    quantite_mp = random.randint(500, 15000)  # Quantite de matière première utilisee

    fabrication_data.append({
        "fabricationID": fabricationID,
        "productID": productID,
        "usineID": usineID,
        "MPID": MPID,
        "dateProduction": date_production,
        "quantiteProduite": quantite_produite,
        "quantiteConditionnee": quantite_conditionnee,
        "ecartFabrication": ecart_fabrication,
        "ecartConditionnement": ecart_conditionnement,
        "quantiteMP": quantite_mp
    })
    fabricationID += 1

# Création du DataFrame final
df_fabrication = pd.DataFrame(fabrication_data)
df_fabrication.head()


  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [00:00<00:00, 102998.71it/s]


Unnamed: 0,fabricationID,productID,usineID,MPID,dateProduction,quantiteProduite,quantiteConditionnee,ecartFabrication,ecartConditionnement,quantiteMP
0,100,pr113,us100,MP101,2023-04-11,9455,9044,411,31,3465
1,101,pr115,us100,MP101,2022-12-27,15440,15128,312,94,1996
2,102,pr103,us101,MP105,2023-02-01,6838,6532,306,71,11839
3,103,pr108,us100,MP106,2023-02-19,2024,1554,470,38,14308
4,104,pr113,us101,MP106,2024-12-13,12626,12482,144,77,899


In [45]:
df_fabrication.to_csv("../Data/fabrication.csv", index=False)

In [58]:
retour_data = []

retourID = 100
client_ids = list(df_client.clientID)  # Exemples de localisations
dates = list(df_time.date)  # Assure que `df_temps` contient un DataFrame avec une colonne 'date'

# Génération des données de retour
for i in tqdm(range(50000)):
    productID = random.choice(product_id)
    usineID = random.choice(usine_id)
    clientID = random.choice(client_ids)
    date_retour = random.choice(dates)
    quantite_retournee = random.randint(1, 1000)  # Quantite de produit retournee

    retour_data.append({
        "retourID": retourID,
        "productID": productID,
        "usineID": usineID,
        "dateRetour": date_retour,
        "quantiteRetournée": quantite_retournee,
    })
    retourID += 1

# Creation du DataFrame final
df_retours = pd.DataFrame(retour_data)
df_retours.head()

100%|██████████| 50000/50000 [00:00<00:00, 160082.07it/s]


Unnamed: 0,retourID,productID,usineID,dateRetour,quantiteRetournée
0,100,pr103,us100,2024-08-12,709
1,101,pr102,us100,2023-12-21,129
2,102,pr102,us101,2022-05-12,366
3,103,pr102,us100,2023-05-15,147
4,104,pr101,us101,2023-02-12,329


In [59]:
df_retours.to_csv("../Data/retours.csv", index=False)

In [48]:
df_ventes.isna().sum()

venteID        0
productID      0
clientID       0
salesDate      0
salesVolume    0
salesAmount    0
locationID     0
dtype: int64

In [53]:
df_ventes.head()

Unnamed: 0,venteID,productID,clientID,salesDate,salesVolume,salesAmount,locationID
0,100,pr106,cl452,2023-09-21,17537,70498740,lc100
1,101,pr103,cl131,2022-08-15,8061,28229622,lc127
2,102,pr103,cl388,2023-04-05,17909,62717318,lc121
3,103,pr110,cl525,2024-01-26,11208,43442208,lc112
4,104,pr106,cl502,2022-12-14,3693,14845860,lc104


In [54]:
df_MP.head()

Unnamed: 0,MPID,MPName
0,MP100,ACIDE ASCORBIQUE
1,MP101,BASE CITRUS PUNCH
2,MP102,BASE ISLAND PUNCH
3,MP103,BASE KIWI GUAVA
4,MP104,BASE MANGO PUNCH


In [55]:
df_fabrication.head()

Unnamed: 0,fabricationID,productID,usineID,MPID,dateProduction,quantiteProduite,quantiteConditionnee,ecartFabrication,ecartConditionnement,quantiteMP
0,100,pr113,us100,MP101,2023-04-11,9455,9044,411,31,3465
1,101,pr115,us100,MP101,2022-12-27,15440,15128,312,94,1996
2,102,pr103,us101,MP105,2023-02-01,6838,6532,306,71,11839
3,103,pr108,us100,MP106,2023-02-19,2024,1554,470,38,14308
4,104,pr113,us101,MP106,2024-12-13,12626,12482,144,77,899


In [60]:
df_retours.tail()

Unnamed: 0,retourID,productID,usineID,dateRetour,quantiteRetournée
49995,50095,pr113,us101,2024-11-12,500
49996,50096,pr100,us101,2022-05-10,245
49997,50097,pr110,us101,2024-04-08,259
49998,50098,pr103,us100,2022-05-27,498
49999,50099,pr106,us101,2022-10-28,309
