In [1]:
from pymongo import MongoClient
from dotenv import load_dotenv
import os

# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()

mongo_connection_string = os.getenv("MONGO_CONNECTION_STRING")
if not mongo_connection_string:
    # Without a URI, MongoClient would quietly target a default local server
    # instead of the intended deployment, so stop here with a clear message.
    raise RuntimeError(
        "MONGO_CONNECTION_STRING is not set; "
        "define it in the environment or in a .env file."
    )

# Handles used by the following cells: client -> database -> collection.
client = MongoClient(mongo_connection_string)
db = client['lingwing']
collection = db['transactions']

In [2]:
from pymongoarrow.schema import Schema
import pymongoarrow
import pyarrow as pa
from bson import ObjectId


# Arrow schema describing which transaction fields are loaded and with which
# types. Fields that are not declared here (paypal, geopay, etc.) are excluded.
# The key order below is kept as-is because it drives the column order.
transaction_schema = Schema({
    # ObjectId extension type (these are ObjectId values, not strings).
    "_id": pymongoarrow.types.ObjectIdType(),
    "country": pa.string(),
    # Boolean flags
    "isPauseTransaction": pa.bool_(),
    "freeTrial": pa.bool_(),
    "temporary": pa.bool_(),
    "boughtWithFreeTrial": pa.bool_(),
    "ip": pa.string(),
    "txn_id": pa.string(),
    "method": pa.int32(),
    "amount": pa.string(),
    "status": pa.int32(),
    "currency": pa.string(),
    "endDate": pa.timestamp("ms"),
    "deviceType": pa.int32(),
    # Nested document; only its `user` reference is loaded.
    "docInfo": pa.struct([
        pa.field("user", pymongoarrow.types.ObjectIdType()),
    ]),
    # References to other documents
    "order": pymongoarrow.types.ObjectIdType(),
    "package": pymongoarrow.types.ObjectIdType(),
    # Millisecond-resolution timestamps
    "createdAt": pa.timestamp("ms"),
    "updatedAt": pa.timestamp("ms"),
})


In [3]:
from pymongoarrow.api import find_arrow_all

query = {}  # Empty filter: every document in the collection is fetched
# Load the matching documents into an Arrow table typed by transaction_schema;
# per the note on the schema, fields not declared there are excluded.
arrow_table = find_arrow_all(collection, query, schema=transaction_schema)

# Convert the Arrow table to a pandas DataFrame for the following cells
df = arrow_table.to_pandas()


In [4]:
# Preview the loaded transactions; note that docInfo.user shows up as raw bytes.
df

Unnamed: 0,_id,country,isPauseTransaction,freeTrial,temporary,boughtWithFreeTrial,ip,txn_id,method,amount,status,currency,endDate,deviceType,docInfo,order,package,createdAt,updatedAt
0,581c90da16ee032c2080dfc3,Georgia,,,,,176.221.205.131,,0.0,79.99,2,USD,2017-11-04 13:44:58.289,,{'user': b'W}\xf5\xa8\x9f\xf6#\xfbt\xe0\x1e\xa5'},581c90ca16ee032c2080dfc2,57ebf12b741b1b8053b01382,2016-11-04 13:44:58.291,2020-02-27 20:40:45.030
1,581c961016ee032c2080e035,,,,,,176.221.205.131,,0.0,9.99,2,USD,2016-12-04 14:07:12.783,,{'user': b'X\x1c{\xb52YT\xbc\x0fA\xbc\xf4'},581c960e16ee032c2080e034,57ebf089741b1b8053b0137c,2016-11-04 14:07:12.789,2016-11-04 14:30:02.654
2,581c961c16ee032c2080e037,,,,,,176.221.205.131,,0.0,49.99,2,USD,2017-05-04 14:07:24.247,,{'user': b'X\x1c{\xb52YT\xbc\x0fA\xbc\xf4'},581c961916ee032c2080e036,57ebf0d2741b1b8053b0137f,2016-11-04 14:07:24.249,2016-11-04 14:07:24.249
3,581ca22585a7c2512ad0363f,,,,,,176.221.205.131,,0.0,9.99,2,USD,2016-12-04 14:58:45.163,,{'user': b'X\x1c{\xb52YT\xbc\x0fA\xbc\xf4'},581ca22285a7c2512ad0363e,57ebf089741b1b8053b0137c,2016-11-04 14:58:45.164,2016-11-04 15:20:02.007
4,581cb8f385a7c2512ad03729,Georgia,,,,,82.211.160.199,,0.0,79.99,2,USD,2017-11-04 16:36:03.540,,{'user': b'W\xc0@\xbf\xb3\xc8\xf1\x04\xee\xf7D:'},581cb8ee85a7c2512ad03728,57ebf12b741b1b8053b01382,2016-11-04 16:36:03.543,2020-02-27 20:41:04.054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259156,652f56811963b80bdffb5f64,,,False,,,,,5.0,25.9,2,GEL,2023-11-18 03:52:33.199,0.0,"{'user': b""`D\x9e\xfc\x13R'\x13\xd7\x8aq\xea""}",641297498ee0750f4513ef33,5900c0aef4938479f8ebbd4a,2023-10-18 03:52:33.206,2023-10-18 03:52:34.759
259157,652f64911963b80bdffb5f66,,,False,,,,,5.0,25.9,2,GEL,2023-11-18 04:52:33.392,0.0,{'user': b'd\xaa\xb7V\xed\xa6\xab\x0fu\x17\xc7#'},64b6158df3921a0f5c8ab3f6,5900c0aef4938479f8ebbd4a,2023-10-18 04:52:33.398,2023-10-18 04:52:35.035
259158,652f796c819efa0f2bf10a8c,Georgia,False,True,,,213.200.31.108,,6.0,1,2,GEL,2023-10-23 06:45:03.671,0.0,{'user': b'\\p\x04\x8e\xac\xb2\xda\x0fQ\x1dC\x...,652f796c819efa0f2bf10a8b,5900be4c51f31874e6e704ea,2023-10-18 06:21:32.596,2023-10-18 06:45:03.671
259159,652f83091963b80bdffb5f68,Georgia,,False,,,,,5.0,19.9,0,GEL,2023-11-18 07:02:33.431,0.0,{'user': b'Z\xef\x0c#\xd3\x89j\x0e\xbe\xd4\xcd...,6356472995b24f0e6382db9c,5900c0aef4938479f8ebbd4a,2023-10-18 07:02:33.438,2023-10-18 07:02:35.067


In [5]:
import pandas as pd

# Flatten the struct-valued 'docInfo' column of `df` into its own DataFrame
# (transaction_schema declares a single nested field, 'user').
docinfo = pd.json_normalize(df['docInfo'])
# NOTE(review): the same step for an 'info' column was left disabled below;
# no 'info' field is declared in transaction_schema.
#info_df = pd.json_normalize(df['info'])

In [6]:
import binascii

def decode_to_hex(binary_data):
    """Return the lowercase hexadecimal text for a bytes-like value.

    Parameters
    ----------
    binary_data : bytes, bytearray, memoryview, str or any other object
        Value to convert, e.g. the raw bytes of an ObjectId.

    Returns
    -------
    str or None
        * bytes-like input: its hex string, two characters per byte;
        * str input: returned unchanged (treated as already decoded), so
          applying the function a second time to the same column does not
          wipe the values;
        * anything else (None, NaN, numbers, ...): None.
    """
    # Already-decoded values pass through so that re-running the cell is safe.
    if isinstance(binary_data, str):
        return binary_data
    if isinstance(binary_data, (bytes, bytearray, memoryview)):
        # bytes(...) normalizes bytearray/memoryview before hex-encoding.
        return bytes(binary_data).hex()
    return None

# Turn each raw 'user' value into its hex string, element by element.
# The column is overwritten in place with the decoded values.
docinfo['user'] = docinfo['user'].map(decode_to_hex)


In [7]:
# Check the decoded user ids (hex strings instead of raw bytes).
docinfo

Unnamed: 0,user
0,577df5a89ff623fb74e01ea5
1,581c7bb5325954bc0f41bcf4
2,581c7bb5325954bc0f41bcf4
3,581c7bb5325954bc0f41bcf4
4,57c040bfb3c8f104eef7443a
...,...
259156,60449efc13522713d78a71ea
259157,64aab756eda6ab0f7517c723
259158,5c70048eacb2da0f511d43a4
259159,5aef0c23d3896a0ebed4cd95


In [8]:
# Replace the nested docInfo structs with the decoded user id.
# The 'user' Series is selected explicitly: assigning the whole `docinfo`
# frame to a single column only works while that frame has exactly one column.
df['docInfo'] = docinfo['user']

In [9]:
# Write the flattened transactions to disk; index=False omits the row index.
df.to_csv('transactions.csv', index=False)