In [2]:
from pymongo import MongoClient
from dotenv import load_dotenv
import os

load_dotenv()

mongo_connection_string = os.getenv("MONGO_CONNECTION_STRING")

client = MongoClient(mongo_connection_string) 
db = client['lingwing']  
collection = db['your_collection_name']  # Replace with your collection name

In [3]:
collection = db['usercourses']

In [4]:
from pymongoarrow.schema import Schema
import pymongoarrow
import pyarrow as pa
from bson import ObjectId

user_course_schema = Schema({
    'startDate': pa.timestamp('ms', default=pa.scalar(pa.timestamp('ms').now())),
    'endDate': pa.timestamp('ms'),
    'ended': pa.bool_(default=False),
    'certificate': pa.bool_(),
    'countryCode': pa.string(),
    'course': pymongoarrow.types.ObjectIdType(),
    'courseSlug': pa.string(),
    'score': pa.int32(default=0),
    'sentTasksCount': pa.int32(default=0),
    'shouldBeSent': pa.int32(default=-1),
    'uniquePassedTasks': pa.int32(default=0),
    'reloaded': pa.bool_(default=False),
    'totalWrongAnswers': pa.int32(),
    'totalRightAnswers': pa.int32(),
    'uniqueTasksRightAnswers': pa.int32(),
    'uniqueTasksWrongAnswers': pa.int32(),
    'totalTimeSpent': pa.int64(),
    'iLearnFromNameCode': pa.string(),
    'heart': pa.struct([
        ('count', pa.int32(default=3)),
        ('temporaryScore', pa.int32(default=0))
    ]),
    'allPassedTasks': pa.int32(default=0),
    'reachedDailyLimit': pa.bool_(default=False),
    'passedReplayPoints': pa.int32(default=1),
    'percent': pa.string(default='0'),
    'oldUserCourseId': pymongoarrow.types.ObjectIdType(),
    'oldEnded': pa.bool_(),
    'oldCertificate': pa.bool_(),
    'replayTasksObjectIds': pa.list_(pymongoarrow.types.ObjectIdType()),
    'lastSentTaskIds': pa.list_(pymongoarrow.types.ObjectIdType()),
    'sentDailyTaskCounter': pa.int32(default=0),
    'dailyReachedLimitDate': pa.timestamp('ms', default=pa.scalar(pa.timestamp('ms').now())),
    'vote': pa.struct([
        ('voted', pa.bool_(default=False)),
        ('amount', pa.int32())
    ]),
    'rating': pa.struct([
        ('total', pymongoarrow.types.MixedType()),
        ('task', pa.struct([
            ('mistake', pymongoarrow.types.MixedType()),
            ('time', pymongoarrow.types.MixedType())
        ]))
    ]),
    'docInfo': pa.struct([
        ('createDate', pa.timestamp('ms', default=pa.scalar(pa.timestamp('ms').now()))),
        ('user', pymongoarrow.types.ObjectIdType()),
        ('unRegisteredUser', pymongoarrow.types.ObjectIdType())
    ]),
    'delete': pa.struct([
        ('isDeleted', pa.string(default='false')),
        ('deleteDate', pa.timestamp('ms')),
        ('user', pymongoarrow.types.ObjectIdType())
    ]),
    'statistics': pa.struct([
        ('daily', pa.list_(pa.struct([
            ('date', pa.timestamp('ms')),
            ('rating', pa.struct([
                ('total', pymongoarrow.types.MixedType(default=0)),
                ('task', pa.struct([
                    ('mistake', pymongoarrow.types.MixedType()),
                    ('time', pymongoarrow.types.MixedType())
                ]))
            ]))
        ]))),
        ('lastWeekDate', pa.timestamp('ms')),
        ('lastWeekRating', pa.int32()),
        ('previousWeekDate', pa.timestamp('ms')),
        ('previousWeekRating', pa.int32()),
        ('weekly', pa.list_(pa.struct([
            ('date', pa.timestamp('ms')),
            ('rating', pa.struct([
                ('total', pymongoarrow.types.MixedType(default=0)),
                ('task', pa.struct([
                    ('mistake', pymongoarrow.types.MixedType()),
                    ('time', pymongoarrow.types.MixedType())
                ]))
            ]))
        ]))),
        ('latest', pa.struct([
            ('date', pa.timestamp('ms')),
            ('rating', pa.struct([
                ('total', pymongoarrow.types.MixedType(default=0)),
                ('task', pa.struct([
                    ('mistake', pymongoarrow.types.MixedType()),
                    ('time', pymongoarrow.types.MixedType())
                ]))
            ]))
        ]))
    ]),
    'learnMode': pa.int32(default=3),
    'userCourseType': pa.int32(default=0),
    'maxTestTaskCount': pa.int32(),
    'freeTest': pa.bool_(),
    'stats': pymongoarrow.types.MixedType(),
    'tasks': pa.list_(pymongoarrow.types.MixedType()),
    'passedTasks': pa.list_(pa.struct([
        ('taskId', pymongoarrow.types.ObjectIdType()),
        ('sequentialNumber', pa.int32()),
        ('mistake', pa.struct([
            ('sum', pa.int32()),
            ('hint', pa.int32()),
            ('audio', pa.int32()),
            ('typo', pa.int32()),
            ('check', pa.int32())
        ])),
        ('score', pa.int32()),
        ('wholeScore', pa.int32()),
        ('timeSpent', pa.int32()),
        ('wholeTimeSpent', pa.int32())
    ]))
})


AttributeError: 'pyarrow.lib.TimestampType' object has no attribute 'now'

In [4]:
from pymongoarrow.api import find_arrow_all

query = {}  # An empty query fetches all documents
arrow_table = find_arrow_all(collection, query, schema=user_schema)

# Convert to pandas DataFrame
df = arrow_table.to_pandas()


In [9]:
df.to_csv('users.csv', index=False)

In [10]:
#copy_df = df

In [16]:
df = copy_df

In [17]:
import pandas as pd

def normalize_column(df, col, prefix):
    """ Normalizes a column with potentially nested data """
    # Create an empty DataFrame for normalized data
    normalized_df = pd.DataFrame()

    for i, row in df.iterrows():
        # Extract the data for the current column
        data = row[col]

        # Check the data type and normalize accordingly
        if isinstance(data, list):
            # Normalize each item in the list and concatenate
            normalized_list = pd.concat([pd.json_normalize(item) for item in data if item is not None], sort=False).reset_index(drop=True)
            normalized_list = normalized_list.add_prefix(f"{prefix}.")
            normalized_df = pd.concat([normalized_df, normalized_list], sort=False)
        elif isinstance(data, dict):
            # Normalize the dictionary
            normalized_dict = pd.json_normalize(data)
            normalized_dict = normalized_dict.add_prefix(f"{prefix}.")
            normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
        else:
            # Handle non-dict and non-list data types
            normalized_df = pd.concat([normalized_df, pd.DataFrame([{f"{prefix}": data}])], sort=False)

    # Reset index for consistency
    normalized_df.reset_index(drop=True, inplace=True)
    return pd.concat([df.reset_index(drop=True), normalized_df], axis=1)

# Apply normalization to each nested column
nested_cols = ['freeTrial', 'local', 'facebook', 'google', 'twitter', 'info', 'profile', 'emailLogs', 'smsLogs', 'delete', 'tutorial', 'creditCard']
for col in nested_cols:
    df = normalize_column(df, col, col)
    df.drop(columns=[col], inplace=True)

# Handle sub-nested fields similarly if needed

# Final DataFrame
df.reset_index(drop=True, inplace=True)


  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalized_df, normalized_dict], sort=False)
  normalized_df = pd.concat([normalize

In [None]:
df