In [11]:
import pandas as pd
import os
import subprocess
import tqdm

In [9]:
def preprocess_folder(output_filepath):
    """Empty a directory by deleting the files directly inside it.

    Only non-directory entries (regular files, file symlinks) are removed.
    Subdirectories -- for example Jupyter's ``.ipynb_checkpoints`` -- are left
    in place: ``os.remove`` cannot delete a directory and would otherwise
    abort the clean-up part-way through.

    Args:
        output_filepath: Path of the directory to clean.

    Raises:
        FileNotFoundError: If ``output_filepath`` is not an existing directory.
    """
    # Guard clause: refuse to continue when the target folder is missing
    if not os.path.isdir(output_filepath):
        raise FileNotFoundError(f"Папка {output_filepath} не существует.")

    for entry_name in os.listdir(output_filepath):
        entry_path = os.path.join(output_filepath, entry_name)
        # os.remove() raises on directories, so skip them instead of crashing
        if os.path.isdir(entry_path):
            continue
        os.remove(entry_path)
    print('[INFO] Directory was cleaned successfully')


def read_csv_in_batches(file_path, batch_size=100_000):
    """Read a CSV file in fixed-size chunks and return them as a list.

    Note that every chunk is collected before returning, so the whole file
    ends up in memory at once.

    Args:
        file_path: Path to the CSV file to read.
        batch_size: Maximum number of rows per chunk.

    Returns:
        A list of pandas DataFrames, one per chunk, in file order.
    """
    # With chunksize set, pandas hands back an iterator of DataFrames
    reader = pd.read_csv(file_path, iterator=True, chunksize=batch_size, low_memory=False)

    # Exhaust the reader so that all chunks are materialised
    batches = list(reader)

    print(f'[INFO] Batches was created. Total number of butches: {len(batches)}')
    return batches


def save_batches_to_csv(batches, output_folder='output'):
    """Write each batch to its own CSV file inside ``output_folder``.

    Files are named ``batch_1.csv``, ``batch_2.csv``, ... following the order
    of ``batches``. The row index is not written.

    Args:
        batches: Iterable of pandas DataFrames to save.
        output_folder: Destination directory; created if it does not exist.
    """
    # Make sure the destination exists before the first write
    os.makedirs(output_folder, exist_ok=True)

    # Number the files from 1 rather than 0
    for number, frame in enumerate(batches, start=1):
        destination = os.path.join(output_folder, f'batch_{number}.csv')
        frame.to_csv(destination, index=False)
        print(f'[INFO] Batch #{number} was saved to {destination}')

In [8]:
# Example usage: clean the output folder, then load the source CSV in batches.
# Alternative input file, currently disabled:
# file_path = 'taxi.csv'
file_path = 'taxi_with_trips.csv'
output_filepath = 'data/'
# Deletes the files currently in output_filepath; raises FileNotFoundError if the folder is missing.
preprocess_folder(output_filepath)
# All chunks are kept in memory as a list of DataFrames (default: 100_000 rows per chunk).
result_batches = read_csv_in_batches(file_path)

[INFO] Directory was cleaned successfully
[INFO] Batches was created. Total number of butches: 50


In [10]:
# Write the batches into the folder that was cleaned earlier. Reusing
# output_filepath (instead of repeating the 'data/' literal) keeps the
# cleaned folder and the write target from drifting apart.
save_batches_to_csv(result_batches, output_folder=output_filepath)

[INFO] Batch #1 was saved to data/batch_1.csv
[INFO] Batch #2 was saved to data/batch_2.csv
[INFO] Batch #3 was saved to data/batch_3.csv
[INFO] Batch #4 was saved to data/batch_4.csv
[INFO] Batch #5 was saved to data/batch_5.csv
[INFO] Batch #6 was saved to data/batch_6.csv
[INFO] Batch #7 was saved to data/batch_7.csv
[INFO] Batch #8 was saved to data/batch_8.csv
[INFO] Batch #9 was saved to data/batch_9.csv
[INFO] Batch #10 was saved to data/batch_10.csv
[INFO] Batch #11 was saved to data/batch_11.csv
[INFO] Batch #12 was saved to data/batch_12.csv
[INFO] Batch #13 was saved to data/batch_13.csv
[INFO] Batch #14 was saved to data/batch_14.csv
[INFO] Batch #15 was saved to data/batch_15.csv
[INFO] Batch #16 was saved to data/batch_16.csv
[INFO] Batch #17 was saved to data/batch_17.csv
[INFO] Batch #18 was saved to data/batch_18.csv
[INFO] Batch #19 was saved to data/batch_19.csv
[INFO] Batch #20 was saved to data/batch_20.csv
[INFO] Batch #21 was saved to data/batch_21.csv
[INFO] Bat

In [13]:
# def import_to_mongodb(input_folder='data', mongodb_host='localhost', mongodb_port=27017, db_name='london', collection_name='rides'):
#     # Получаем список файлов в указанной директории
#     files = os.listdir(input_folder)

#     # Итерация по файлам
#     for file_name in files:
#         file_path = os.path.join(input_folder, file_name)

#         # Формируем команду mongoimport
#         command = f'mongoimport --host={mongodb_host} --port={mongodb_port} --db={db_name} --collection={collection_name} --type=csv --headerline --ignoreBlanks --file={file_path}'

#         # Запускаем команду с использованием subprocess
#         subprocess.run(command, shell=True)

#         # Выводим информацию в консоль
#         print(f"[INFO] File: {file_name} was processed to MongoDB Server")

#         # Удаляем файл после обработки
#         os.remove(file_path)
#         print(f"[INFO] File: {file_name} was removed")


def import_to_mongodb(input_folder='data', mongodb_host='localhost', mongodb_port=27017, db_name='london', collection_name='rides_2'):
    """Import every file in a folder into MongoDB, deleting each file once imported.

    Each file is passed to the ``mongoimport`` command-line tool as a CSV with
    a header line. A file is removed only after ``mongoimport`` exits
    successfully, so a failed import never destroys its source data.

    Args:
        input_folder: Path to the folder containing the files to import.
        mongodb_host: MongoDB host address.
        mongodb_port: MongoDB port.
        db_name: Name of the target MongoDB database.
        collection_name: Name of the target MongoDB collection.

    Raises:
        subprocess.CalledProcessError: If ``mongoimport`` exits with a
            non-zero status. The failing file and all remaining files are
            left on disk.
        FileNotFoundError: If the ``mongoimport`` executable cannot be found.
    """
    # Only regular files are candidates: subdirectories (e.g. .ipynb_checkpoints)
    # cannot be imported and os.remove() would fail on them. Sorting makes the
    # processing order deterministic.
    files = sorted(
        name for name in os.listdir(input_folder)
        if os.path.isfile(os.path.join(input_folder, name))
    )

    # tqdm renders a progress bar over the files
    for file_name in tqdm.tqdm(files):
        file_path = os.path.join(input_folder, file_name)

        # Build the command as an argument list: no shell is involved, so paths or
        # names containing spaces or shell metacharacters are passed through verbatim.
        command = [
            'mongoimport',
            f'--host={mongodb_host}',
            f'--port={mongodb_port}',
            f'--db={db_name}',
            f'--collection={collection_name}',
            '--type=csv',
            '--headerline',
            '--ignoreBlanks',
            f'--file={file_path}',
        ]

        # check=True raises on a non-zero exit status, so the removal below is
        # only reached when the import succeeded.
        subprocess.run(command, check=True)

        # The file has been imported; remove it
        os.remove(file_path)

In [14]:
import_to_mongodb()

  2%|▏         | 1/50 [00:12<10:16, 12.58s/it]

[INFO] File: batch_1.csv was processed to MongoDB Server
[INFO] File: batch_1.csv was removed


  4%|▍         | 2/50 [00:24<09:56, 12.43s/it]

[INFO] File: batch_10.csv was processed to MongoDB Server
[INFO] File: batch_10.csv was removed


  6%|▌         | 3/50 [00:37<09:40, 12.35s/it]

[INFO] File: batch_11.csv was processed to MongoDB Server
[INFO] File: batch_11.csv was removed


  8%|▊         | 4/50 [00:42<07:19,  9.56s/it]

[INFO] File: batch_12.csv was processed to MongoDB Server
[INFO] File: batch_12.csv was removed


 10%|█         | 5/50 [00:47<06:00,  8.01s/it]

[INFO] File: batch_13.csv was processed to MongoDB Server
[INFO] File: batch_13.csv was removed


 12%|█▏        | 6/50 [01:04<08:03, 10.98s/it]

[INFO] File: batch_14.csv was processed to MongoDB Server
[INFO] File: batch_14.csv was removed


 14%|█▍        | 7/50 [01:46<15:12, 21.23s/it]

[INFO] File: batch_15.csv was processed to MongoDB Server
[INFO] File: batch_15.csv was removed


 16%|█▌        | 8/50 [03:39<35:21, 50.51s/it]

[INFO] File: batch_16.csv was processed to MongoDB Server
[INFO] File: batch_16.csv was removed


 18%|█▊        | 9/50 [03:44<24:41, 36.12s/it]

[INFO] File: batch_17.csv was processed to MongoDB Server
[INFO] File: batch_17.csv was removed


 20%|██        | 10/50 [03:49<17:38, 26.47s/it]

[INFO] File: batch_18.csv was processed to MongoDB Server
[INFO] File: batch_18.csv was removed


 22%|██▏       | 11/50 [03:54<13:00, 20.00s/it]

[INFO] File: batch_19.csv was processed to MongoDB Server
[INFO] File: batch_19.csv was removed


 24%|██▍       | 12/50 [04:02<10:18, 16.28s/it]

[INFO] File: batch_2.csv was processed to MongoDB Server
[INFO] File: batch_2.csv was removed


 26%|██▌       | 13/50 [04:22<10:46, 17.48s/it]

[INFO] File: batch_20.csv was processed to MongoDB Server
[INFO] File: batch_20.csv was removed


 28%|██▊       | 14/50 [04:30<08:47, 14.65s/it]

[INFO] File: batch_21.csv was processed to MongoDB Server
[INFO] File: batch_21.csv was removed


 30%|███       | 15/50 [04:38<07:15, 12.43s/it]

[INFO] File: batch_22.csv was processed to MongoDB Server
[INFO] File: batch_22.csv was removed


 32%|███▏      | 16/50 [04:42<05:44, 10.14s/it]

[INFO] File: batch_23.csv was processed to MongoDB Server
[INFO] File: batch_23.csv was removed


 34%|███▍      | 17/50 [04:47<04:39,  8.48s/it]

[INFO] File: batch_24.csv was processed to MongoDB Server
[INFO] File: batch_24.csv was removed


 36%|███▌      | 18/50 [04:52<03:56,  7.38s/it]

[INFO] File: batch_25.csv was processed to MongoDB Server
[INFO] File: batch_25.csv was removed


 38%|███▊      | 19/50 [04:56<03:21,  6.52s/it]

[INFO] File: batch_26.csv was processed to MongoDB Server
[INFO] File: batch_26.csv was removed


 40%|████      | 20/50 [05:01<03:00,  6.02s/it]

[INFO] File: batch_27.csv was processed to MongoDB Server
[INFO] File: batch_27.csv was removed


 42%|████▏     | 21/50 [05:06<02:41,  5.57s/it]

[INFO] File: batch_28.csv was processed to MongoDB Server
[INFO] File: batch_28.csv was removed


 44%|████▍     | 22/50 [05:11<02:31,  5.41s/it]

[INFO] File: batch_29.csv was processed to MongoDB Server
[INFO] File: batch_29.csv was removed


 46%|████▌     | 23/50 [05:18<02:37,  5.83s/it]

[INFO] File: batch_3.csv was processed to MongoDB Server
[INFO] File: batch_3.csv was removed


 48%|████▊     | 24/50 [05:37<04:16,  9.87s/it]

[INFO] File: batch_30.csv was processed to MongoDB Server
[INFO] File: batch_30.csv was removed


 50%|█████     | 25/50 [05:47<04:11, 10.07s/it]

[INFO] File: batch_31.csv was processed to MongoDB Server
[INFO] File: batch_31.csv was removed


 52%|█████▏    | 26/50 [05:57<03:59,  9.97s/it]

[INFO] File: batch_32.csv was processed to MongoDB Server
[INFO] File: batch_32.csv was removed


 54%|█████▍    | 27/50 [06:02<03:14,  8.45s/it]

[INFO] File: batch_33.csv was processed to MongoDB Server
[INFO] File: batch_33.csv was removed


 56%|█████▌    | 28/50 [06:07<02:43,  7.42s/it]

[INFO] File: batch_34.csv was processed to MongoDB Server
[INFO] File: batch_34.csv was removed


 58%|█████▊    | 29/50 [07:00<07:21, 21.02s/it]

[INFO] File: batch_35.csv was processed to MongoDB Server
[INFO] File: batch_35.csv was removed


 60%|██████    | 30/50 [07:51<10:00, 30.02s/it]

[INFO] File: batch_36.csv was processed to MongoDB Server
[INFO] File: batch_36.csv was removed


 62%|██████▏   | 31/50 [07:57<07:15, 22.92s/it]

[INFO] File: batch_37.csv was processed to MongoDB Server
[INFO] File: batch_37.csv was removed


 64%|██████▍   | 32/50 [08:04<05:24, 18.04s/it]

[INFO] File: batch_38.csv was processed to MongoDB Server
[INFO] File: batch_38.csv was removed


 66%|██████▌   | 33/50 [08:12<04:17, 15.12s/it]

[INFO] File: batch_39.csv was processed to MongoDB Server
[INFO] File: batch_39.csv was removed


 68%|██████▊   | 34/50 [08:34<04:33, 17.12s/it]

[INFO] File: batch_4.csv was processed to MongoDB Server
[INFO] File: batch_4.csv was removed


 70%|███████   | 35/50 [08:42<03:34, 14.32s/it]

[INFO] File: batch_40.csv was processed to MongoDB Server
[INFO] File: batch_40.csv was removed


 72%|███████▏  | 36/50 [08:49<02:48, 12.07s/it]

[INFO] File: batch_41.csv was processed to MongoDB Server
[INFO] File: batch_41.csv was removed


 74%|███████▍  | 37/50 [08:53<02:08,  9.91s/it]

[INFO] File: batch_42.csv was processed to MongoDB Server
[INFO] File: batch_42.csv was removed


 76%|███████▌  | 38/50 [08:58<01:41,  8.42s/it]

[INFO] File: batch_43.csv was processed to MongoDB Server
[INFO] File: batch_43.csv was removed


 78%|███████▊  | 39/50 [09:03<01:21,  7.38s/it]

[INFO] File: batch_44.csv was processed to MongoDB Server
[INFO] File: batch_44.csv was removed


 80%|████████  | 40/50 [09:09<01:08,  6.83s/it]

[INFO] File: batch_45.csv was processed to MongoDB Server
[INFO] File: batch_45.csv was removed


 82%|████████▏ | 41/50 [09:23<01:20,  8.93s/it]

[INFO] File: batch_46.csv was processed to MongoDB Server
[INFO] File: batch_46.csv was removed


 84%|████████▍ | 42/50 [09:47<01:49, 13.68s/it]

[INFO] File: batch_47.csv was processed to MongoDB Server
[INFO] File: batch_47.csv was removed


 86%|████████▌ | 43/50 [09:53<01:18, 11.19s/it]

[INFO] File: batch_48.csv was processed to MongoDB Server
[INFO] File: batch_48.csv was removed


 88%|████████▊ | 44/50 [09:58<00:55,  9.26s/it]

[INFO] File: batch_49.csv was processed to MongoDB Server
[INFO] File: batch_49.csv was removed


 90%|█████████ | 45/50 [10:02<00:39,  7.93s/it]

[INFO] File: batch_5.csv was processed to MongoDB Server
[INFO] File: batch_5.csv was removed


 92%|█████████▏| 46/50 [10:07<00:27,  6.94s/it]

[INFO] File: batch_50.csv was processed to MongoDB Server
[INFO] File: batch_50.csv was removed


 94%|█████████▍| 47/50 [10:13<00:19,  6.54s/it]

[INFO] File: batch_6.csv was processed to MongoDB Server
[INFO] File: batch_6.csv was removed


 96%|█████████▌| 48/50 [10:21<00:13,  6.96s/it]

[INFO] File: batch_7.csv was processed to MongoDB Server
[INFO] File: batch_7.csv was removed


 98%|█████████▊| 49/50 [10:32<00:08,  8.21s/it]

[INFO] File: batch_8.csv was processed to MongoDB Server
[INFO] File: batch_8.csv was removed


100%|██████████| 50/50 [12:07<00:00, 14.54s/it]

[INFO] File: batch_9.csv was processed to MongoDB Server
[INFO] File: batch_9.csv was removed



