In [None]:
import csv
import os
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# Define the scope for the Google Drive API
SCOPES = ['https://www.googleapis.com/auth/drive']

def authenticate():
    """Build an authenticated Google Drive v3 service object.

    Cached user credentials are loaded from ``token.json`` when that file
    exists. Expired credentials that carry a refresh token are refreshed;
    if there are no usable credentials (or the refresh is rejected), the
    interactive OAuth flow configured by ``credentials.json`` is run instead.
    Whenever credentials were refreshed or newly obtained they are written
    back to ``token.json``.

    Returns:
        The service object returned by ``build('drive', 'v3', ...)``.
    """
    # Local import: google.auth is already a dependency of this file (see the
    # Request import); keeping this here leaves the file-level imports untouched.
    from google.auth.exceptions import RefreshError

    creds = None
    if os.path.exists('token.json'):
        # Pass SCOPES explicitly, as Google's Drive quickstart does, so the
        # loaded credentials request the scopes this script actually needs.
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)

    if not creds or not creds.valid:
        refreshed = False
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())
                refreshed = True
            except RefreshError:
                # The stored refresh token was rejected (e.g. revoked or
                # expired); fall through to a fresh interactive login rather
                # than aborting the whole script.
                refreshed = False

        if not refreshed:
            # No usable credentials: run the OAuth flow in a local browser.
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Persist the refreshed or newly obtained credentials for future runs.
        with open('token.json', 'w') as token:
            token.write(creds.to_json())

    return build('drive', 'v3', credentials=creds)

def get_files_in_folder(drive_service, folder_id, page_size=1000):
    """Return the set of file names directly inside a Google Drive folder.

    Every result page of ``files().list`` is fetched by following
    ``nextPageToken`` until the response no longer carries one. Trashed
    files are excluded by the query.

    Args:
        drive_service: Drive service object exposing ``files().list(...)``.
        folder_id: ID of the parent folder to list.
        page_size: Value sent as ``pageSize`` with each request. The Drive
            API default is 100; a larger page means far fewer round-trips
            for big folders.

    Returns:
        A ``set`` of the ``name`` values of the listed files (duplicate
        names collapse into one entry).
    """
    # Escape backslashes and single quotes so the ID cannot terminate the
    # quoted literal inside the Drive query string.
    escaped_id = folder_id.replace("\\", "\\\\").replace("'", "\\'")
    query = f"'{escaped_id}' in parents and trashed=false"

    files = set()
    page_token = None
    while True:
        response = drive_service.files().list(
            q=query,
            spaces='drive',
            fields='nextPageToken, files(name)',
            pageSize=page_size,
            pageToken=page_token,
        ).execute()
        files.update(item['name'] for item in response.get('files', []))

        page_token = response.get('nextPageToken')
        if page_token is None:
            # No further pages to request.
            break
    return files

if __name__ == "__main__":
    # Script flow: authenticate, read the expected doc_ids from a CSV, list
    # the file names present in the Drive folder, and write every doc_id
    # that has no identically named file to 'missing_files.txt'.
    drive_service = authenticate()

    # Defensive check kept from the original script.
    if drive_service is None:
        print("Failed to obtain Google Drive service. Exiting.")
        # SystemExit works the same in a plain interpreter and a notebook,
        # unlike the interactive-only exit() helper.
        raise SystemExit(1)

    # Folder ID of the Google Drive folder
    folder_id = '15PrnIvUGB4OdKzSjvGtdpyVLLPlBEZ2M'  # Replace with the actual folder ID

    # Read the CSV file to get the unique doc_id values
    csv_file_path = 'Jammu.csv'  # Replace with the path to your CSV file
    doc_ids = set()
    # newline='' is what the csv module requires so that quoted fields
    # containing line breaks are parsed correctly.
    with open(csv_file_path, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            doc_id = row['doc_id']
            # Skip blank/missing cells: they are not real document IDs and
            # would otherwise be reported as a "missing file".
            if doc_id:
                doc_ids.add(doc_id)

    # Names of the files currently present in the Drive folder
    existing_files = get_files_in_folder(drive_service, folder_id)

    # doc_ids with no file of exactly that name in the folder
    missing_doc_ids = doc_ids - existing_files

    # Sorted so the report is stable between runs (set order is arbitrary).
    with open('missing_files.txt', 'w') as file:
        file.writelines(f"{doc_id}\n" for doc_id in sorted(missing_doc_ids))

    print("Missing files have been stored in 'missing_files.txt'.")


Missing files have been stored in 'missing_files.txt'.


In [None]:
def count_lines_in_text_file(text_file_path):
    """Return the number of lines in the text file at ``text_file_path``.

    The file is opened in text mode and iterated line by line, so a final
    line without a trailing newline still counts as one line and an empty
    file yields 0.
    """
    with open(text_file_path, 'r') as handle:
        # Tally one per line yielded by the file iterator.
        return sum(1 for _ in handle)

if __name__ == "__main__":
    # Report how many lines the given text file contains.
    # Point text_file_path at the file you want to measure.
    text_file_path = 'missing_files_jharkhand.txt'

    line_count = count_lines_in_text_file(text_file_path)

    print(f"Number of lines in the text file: {line_count}")


Number of lines in the text file: 284072
