In [None]:
import os
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Define the scope for the Google Drive API
SCOPES = ['https://www.googleapis.com/auth/drive']

def get_drive_service():
    """Build an authorized Google Drive v3 service object.

    Credentials are loaded from ``token.json`` in the current working
    directory when that file exists. If they are missing or not valid, they
    are refreshed when possible; otherwise the installed-app OAuth flow is
    run using ``credentials.json`` and ``SCOPES``. Whenever credentials are
    refreshed or newly obtained they are written back to ``token.json``.

    Returns:
        The service object returned by ``build('drive', 'v3', ...)``.
    """
    # Imported here so the block brings its own dependency into scope.
    from google.auth.exceptions import RefreshError

    creds = None
    # A previous run leaves token.json behind; reuse it when present.
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json')

    if not creds or not creds.valid:
        refreshed = False
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
                refreshed = True
            except RefreshError:
                # The stored refresh token was rejected (e.g. revoked or
                # expired). Fall through to the interactive flow instead of
                # crashing on a stale token.json.
                refreshed = False

        if not refreshed:
            # No usable stored credentials: ask the user to authorize.
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Persist the refreshed or newly obtained credentials for next time.
        with open('token.json', 'w') as token:
            token.write(creds.to_json())

    return build('drive', 'v3', credentials=creds)

def get_files_count_in_folder(drive_service, folder_id):
    """Count the non-trashed items directly inside a Google Drive folder.

    The query matches every item whose parents include ``folder_id`` and
    that is not trashed. It has no mimeType filter, so any subfolders that
    match are counted as well.

    Args:
        drive_service: Drive v3 service object exposing ``files().list()``.
        folder_id: ID of the folder whose children are counted.

    Returns:
        Total number of matching items across all result pages.
    """
    query = f"'{folder_id}' in parents and trashed=false"
    count = 0
    page_token = None
    while True:
        # pageSize=1000 asks for the largest page the API allows, so large
        # folders need far fewer round trips than with the default page size.
        # Only the id is requested because the items are merely counted.
        response = drive_service.files().list(
            q=query,
            spaces='drive',
            pageSize=1000,
            fields='nextPageToken, files(id)',
            pageToken=page_token,
        ).execute()
        count += len(response.get('files', []))
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            # No further pages.
            break
    return count

if __name__ == "__main__":
    # Build the Drive client first; nothing else can run without it.
    drive_service = get_drive_service()

    if drive_service is not None:
        # Folder whose contents are counted.
        folder_id = '1arWU0xWTOUv43y8AKLTFHObYgReJKwjT'  # Replace with the actual folder ID

        # Count the folder's items and report the total.
        files_count = get_files_count_in_folder(drive_service, folder_id)
        print(f"Number of files in folder {folder_id}: {files_count}")
    else:
        # Defensive guard: stop when no service object is available.
        print("Failed to obtain Google Drive service. Exiting.")
        exit(1)


Number of files in folder 1arWU0xWTOUv43y8AKLTFHObYgReJKwjT: 49542


In [None]:
import csv

def count_unique_case_ids(csv_file_path, id_column='doc_id'):
    """Count the distinct values of one column in a CSV file.

    The file's first row is read as the header (``csv.DictReader``).

    Args:
        csv_file_path: Path to the CSV file.
        id_column: Header name of the column whose values are deduplicated.
            Defaults to ``'doc_id'``, the column this function always read
            before the parameter existed.

    Returns:
        Number of distinct values found in ``id_column``; 0 when the file
        has no data rows.

    Raises:
        KeyError: If a data row has no ``id_column`` entry, i.e. the header
            does not contain that column.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(csv_file_path, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        return len({row[id_column] for row in reader})

if __name__ == "__main__":
    # Replace with the path to your CSV file
    csv_file_path = 'Jharkhand.csv'

    # Count the distinct doc_id values in that file and report the total.
    total_case_id_count = count_unique_case_ids(csv_file_path)
    print(f"Total number of unique case_ids: {total_case_id_count}")


Total number of unique case_ids: 284072


In [None]:
import csv
import os
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Define the scope for the Google Drive API
SCOPES = ['https://www.googleapis.com/auth/drive']

def authenticate():
    """Return an authorized Google Drive v3 service object.

    Stored credentials are read from ``token.json`` (current working
    directory) when that file exists and are used as-is when still valid.
    Otherwise they are refreshed if possible, or the installed-app OAuth
    flow is run using ``credentials.json`` and ``SCOPES``; the resulting
    credentials are then written back to ``token.json``.

    Returns:
        The service object returned by ``build('drive', 'v3', ...)``.
    """
    # Imported here so the block brings its own dependency into scope.
    from google.auth.exceptions import RefreshError

    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json')

    # Fast path: the stored credentials are still usable, nothing to save.
    if creds and creds.valid:
        return build('drive', 'v3', credentials=creds)

    if creds and creds.expired and creds.refresh_token:
        try:
            creds.refresh(Request())
        except RefreshError:
            # The refresh token was rejected (e.g. revoked or expired);
            # discard the stale credentials and re-authorize interactively
            # instead of crashing.
            creds = None

    if not creds or not creds.valid:
        # No usable credentials: run the interactive OAuth flow.
        flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        creds = flow.run_local_server(port=0)

    # Save the refreshed or newly obtained credentials for future runs.
    with open('token.json', 'w') as token:
        token.write(creds.to_json())

    return build('drive', 'v3', credentials=creds)

def get_files_in_folder(drive_service, folder_id):
    """Collect the names of the non-trashed items directly inside a folder.

    The query matches every item whose parents include ``folder_id`` and
    that is not trashed. Names are gathered into a set, so items sharing a
    name appear once.

    Args:
        drive_service: Drive v3 service object exposing ``files().list()``.
        folder_id: ID of the folder to list.

    Returns:
        A set of item names gathered across all result pages.
    """
    query = f"'{folder_id}' in parents and trashed=false"
    names = set()
    page_token = None
    while True:
        # pageSize=1000 asks for the largest page the API allows, so large
        # folders need far fewer round trips than with the default page size.
        response = drive_service.files().list(
            q=query,
            spaces='drive',
            pageSize=1000,
            fields='nextPageToken, files(name)',
            pageToken=page_token,
        ).execute()
        names.update(item['name'] for item in response.get('files', []))
        page_token = response.get('nextPageToken', None)
        if page_token is None:
            # No further pages.
            break
    return names

if __name__ == "__main__":
    # Obtain the Google Drive service object
    drive_service = authenticate()

    # Defensive guard: stop when no service object is available.
    if drive_service is None:
        print("Failed to obtain Google Drive service. Exiting.")
        exit(1)

    # Folder ID of the Google Drive folder
    folder_id = '1qpWWufkZ4ciCskmJ3xPHLe72Z8oKWjcO'  # Replace with the actual folder ID

    # Path to the CSV file containing doc_id values
    csv_file_path = 'allahabad.csv'  # Replace with the path to your CSV file

    # Read the unique doc_id values. newline='' lets the csv module handle
    # line endings itself, as the csv documentation recommends.
    with open(csv_file_path, 'r', newline='') as csvfile:
        doc_ids = {row['doc_id'] for row in csv.DictReader(csvfile)}

    # Names of the items currently in the Google Drive folder
    existing_files = get_files_in_folder(drive_service, folder_id)

    # doc_ids with no identically named item in the folder.
    # NOTE(review): this is an exact string comparison, so an item named
    # e.g. "<doc_id>.pdf" would not match its doc_id -- confirm how the
    # Drive items are named.
    missing_doc_ids = doc_ids - existing_files

    # Print missing doc_ids in sorted (string) order so the output is the
    # same from run to run; set iteration order is arbitrary.
    print("Missing doc_ids in Google Drive folder:")
    for doc_id in sorted(missing_doc_ids):
        print(doc_id)


Streaming output truncated to the last 5000 lines.
111946236
176459878
146303106
92903891
148800902
88907267
953898
125006948
26852739
85832094
11224979
192969237
121196127
123453975
194776
9858425
168664207
134177514
126748707
139188514
88412959
78935785
328292
34438994
1436396
17833409
46198252
129679596
188085393
293770
171716171
20250300
162979670
133248759
76064226
74535061
197331670
128567893
124159217
183456406
49231774
26713787
192117716
71229501
175910390
380103
155390304
1169874
196809200
123000994
106420311
28756419
1288824
121900711
122995133
47724757
4676643
38058003
120434312
142885598
178741637
57725490
137762822
52463491
164070674
25403
153633081
181831
136417901
12937496
177216573
196846779
120455144
132794708
105419333
191160974
15862675
178894679
114444
115768277
1951702
43881060
168201383
123033132
187223022
46291072
1484439
25542556
534557
112171471
589567
113070815
102890927
177955360
1882020
31092583
34223801
198950816
138532579
45997092
63696433
19