In [2]:
import pandas as pd

In [3]:
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

## Support functions

In [3]:
def run_on_drive_api(func):
    """Decorator that supplies a Google Drive v3 client as the ``service`` keyword.

    The wrapped function is called as ``func(*args, service=service, **kwargs)``.
    If the caller already passes a ``service`` keyword it is reused; otherwise a
    new client is built with an API key.

    Any ``HttpError`` raised while building the client or running ``func`` is
    printed and swallowed, in which case the wrapper returns ``None``.

    Args:
        func: Callable that accepts a ``service`` keyword argument.

    Returns:
        The wrapping callable, carrying ``func``'s name and docstring.
    """
    import functools
    import os

    @functools.wraps(func)  # keep func.__name__/__doc__ visible in the notebook
    def wrapper(*args, **kwargs):
        try:
            # Reuse a client handed in by the caller instead of failing with
            # "got multiple values for keyword argument 'service'".
            service = kwargs.pop('service', None)
            if service is None:
                # NOTE(review): the fallback key is committed in this notebook;
                # prefer supplying one through the DRIVE_API_KEY environment
                # variable and rotating the committed key.
                api_key = (os.environ.get('DRIVE_API_KEY')
                           or 'AIzaSyBU7WIsMv5LvRD3xt8ZnV5fqfMY40iEHS8')
                service = build('drive', 'v3', developerKey=api_key)
            return func(*args, service=service, **kwargs)
        except HttpError as error:
            # Best-effort behaviour: report the failure and return None.
            print(f'An error occurred:\n\n{error}')
    return wrapper

In [4]:
def is_essay_file(drive_item):
    """Tell whether a Drive item is an image or a PDF.

    Args:
        drive_item: Mapping with a ``'mimeType'`` string entry.

    Returns:
        True when the MIME type contains ``'image'`` or ``'pdf'``, else False.
    """
    mime_type = drive_item['mimeType']
    for marker in ('image', 'pdf'):
        if marker in mime_type:
            return True
    return False

In [5]:
@run_on_drive_api
def recreate_file_tree(initial_folder_id, service=None):
    """Mirror a Drive folder as nested dicts keyed by item name.

    Folders map to the dict produced by recursing into them; every other item
    maps to the file resource dict returned by the API. Items sharing a name
    inside one folder overwrite each other, since names are used as keys.

    Args:
        initial_folder_id: Drive id of the folder to list.
        service: Drive v3 client; injected by ``run_on_drive_api``.

    Returns:
        dict mapping item names to either a nested dict (folder) or the item's
        file resource dict. A subfolder whose listing raised ``HttpError`` is
        stored as ``None``, because the decorator prints the error and returns
        nothing.
    """
    query = f"'{initial_folder_id}' in parents"
    file_items = {}
    page_token = None

    # files.list is paginated: keep requesting pages until no nextPageToken is
    # returned, otherwise folders larger than one page are silently truncated.
    while True:
        results = service.files().list(q=query, pageToken=page_token).execute()

        # 'files' may be absent from a response; treat that as an empty page.
        for item in results.get('files') or []:
            if item['mimeType'] == 'application/vnd.google-apps.folder':
                # Recurse through the decorated function so the subfolder
                # gets its own error handling.
                file_items[item['name']] = recreate_file_tree(item['id'])
            else:
                file_items[item['name']] = item

        page_token = results.get('nextPageToken')
        if not page_token:
            break

    return file_items

In [6]:
# Build the nested name -> item mapping for the root Drive folder.
file_items = recreate_file_tree('1NxxkhOfH4eB0v6E0leGtG5hHo2amYKmG')
# Bare last expression so the notebook renders the resulting tree.
file_items

{'Fuvest': {'50.pdf': {'kind': 'drive#file',
   'id': '1SziOtl8xeATY3cfA2VXf9NRLK0-b3ENg',
   'name': '50.pdf',
   'mimeType': 'application/pdf'},
  '2018': {'36,5_309.jpg': {'kind': 'drive#file',
    'id': '1ThLLNWnUHgGq55ElFVUFVysSem8o83HY',
    'name': '36,5_309.jpg',
    'mimeType': 'image/jpeg'},
   '33 _310.jpg': {'kind': 'drive#file',
    'id': '1J6c8CNdx6Yy1ye0gK8DLo1Oo3yUL3DLK',
    'name': '33 _310.jpg',
    'mimeType': 'image/jpeg'}},
  '2020': {'red_fuvest.py': {'kind': 'drive#file',
    'id': '1Q3ELr5i0oSzvptc_l9CgMB8kYPH2uFFb',
    'name': 'red_fuvest.py',
    'mimeType': 'text/x-python'},
   '38,5-Red64 (Small).png': {'kind': 'drive#file',
    'id': '1pJ5_z-00dpX4mbsPSzUsLX4pKFuWE434',
    'name': '38,5-Red64 (Small).png',
    'mimeType': 'image/png'},
   '42,0-Red74 (Small).png': {'kind': 'drive#file',
    'id': '1p2WISpPRTiRxxb8uzyyf3Qbkd2OgkaiS',
    'name': '42,0-Red74 (Small).png',
    'mimeType': 'image/png'},
   '43,0_32 (Small).png': {'kind': 'drive#file',
    'i

In [7]:
# Top-level keys are the names of the items directly inside the root folder.
print(file_items.keys())

dict_keys(['Fuvest', 'Enem'])


In [12]:
# Split the tree by exam; each subtree is keyed by the names found in that folder.
enem_tree = file_items['Enem']
fuvest_tree = file_items['Fuvest']

# Column layout of the ENEM table: year, overall mark, the five c1..c5 marks,
# then the Drive file id and its view URL.
ENEM_COLS = ['ano', 'nota', 'c1', 'c2', 'c3', 'c4', 'c5', 'drive_id', 'url']

# Column layout of the Fuvest table: year, single mark, Drive file id, view URL.
FUVEST_COLS = ['ano', 'nota', 'drive_id', 'url']

In [13]:
# URL template for a Drive file's view page; `{}` is filled with the file id
# through str.format by the parse_* functions.
DRIVE_VIEW_BASE_URL = 'https://drive.google.com/file/d/{}/view'

In [14]:
def parse_enem_essays(filetree, base_view_url, columns):
    """Build a table of ENEM essays from a year -> files tree.

    Marks are read from the part of each filename before the first ``'_'``,
    split on ``'-'``. Each row is ``[year, *marks, drive_id, url]``, so the
    number of mark slots is ``len(columns) - 3``. Files with fewer marks than
    slots (for example only the overall mark) get the remaining slots filled
    with ``None``; previously a row with 2-5 marks came out short and pandas
    padded it on the right, shifting ``drive_id``/``url`` into mark columns.

    Args:
        filetree: Mapping of folder name to a mapping of filename -> Drive
            item dict. Anything after the first ``'.'`` in the folder name is
            dropped to obtain the year (``'2018.1'`` -> ``'2018'``).
        base_view_url: Format string with one ``{}`` placeholder for the id.
        columns: Column names for the resulting frame.

    Returns:
        pandas.DataFrame with one row per image/PDF file, after
        ``convert_dtypes()``.

    Raises:
        ValueError: If a filename carries more marks than there are slots.
    """
    n_mark_slots = len(columns) - 3  # everything except year, drive_id, url
    data = []
    for raw_year, file_dict in filetree.items():
        # Skip entries that are not folders: a failed listing (None) or a file
        # resource that sits directly at this level (has a string mimeType).
        if not isinstance(file_dict, dict) or isinstance(file_dict.get('mimeType'), str):
            continue

        year = raw_year.split('.')[0]
        for filename, drive_item in file_dict.items():
            if not is_essay_file(drive_item):
                continue

            # Strip so names such as '920 _12.jpg' do not keep stray spaces.
            marks = [mark.strip() for mark in filename.split('_')[0].split('-')]
            if len(marks) > n_mark_slots:
                raise ValueError(
                    f'{filename!r} in {raw_year!r} has {len(marks)} marks; '
                    f'at most {n_mark_slots} fit the given columns'
                )
            final_marks = marks + [None] * (n_mark_slots - len(marks))

            file_view_url = base_view_url.format(drive_item['id'])
            data.append([year, *final_marks, drive_item['id'], file_view_url])

    return pd.DataFrame(data=data, columns=columns).convert_dtypes()

In [15]:
# Parse the ENEM subtree into a DataFrame with the ENEM_COLS layout.
enem_parsed = parse_enem_essays(enem_tree, DRIVE_VIEW_BASE_URL, ENEM_COLS)
# Bare last expression so the notebook renders the frame.
enem_parsed

Unnamed: 0,ano,nota,c1,c2,c3,c4,c5,drive_id,url
0,2020,920,160,200,160,200,200,1CA5BuMVUlOGedcrzcyfSHRhuU2ksJCOk,https://drive.google.com/file/d/1CA5BuMVUlOGed...
1,2020,960,180,200,180,200,200,1ozFxlkdFl0nkXdklVV7I75uLX0G_R_qx,https://drive.google.com/file/d/1ozFxlkdFl0nkX...
2,2020,900,180,180,200,180,160,1nxDup_a1LX1Tm61isRt8eVBNaNCDfMiR,https://drive.google.com/file/d/1nxDup_a1LX1Tm...
3,2020,960,180,200,180,200,200,1Y65pUXqvHbMKadsZCNdNWTO2YXeDrdpx,https://drive.google.com/file/d/1Y65pUXqvHbMKa...
4,2020,860,160,200,160,180,160,1E7PcM9kXvfiJ_EEZBLE5_ovssz-6KEZ8,https://drive.google.com/file/d/1E7PcM9kXvfiJ_...
...,...,...,...,...,...,...,...,...,...
72,2019,920,180,180,180,180,200,1S6AINcBoapgVZPDIWTTwwfrsIzAUjgmR,https://drive.google.com/file/d/1S6AINcBoapgVZ...
73,2019,880,160,180,200,180,160,154-xrk5XA5FjtvgJxAkCG-2dLWZLwp0s,https://drive.google.com/file/d/154-xrk5XA5Fjt...
74,2019,900,160,200,200,160,180,1JCZrzP4ZAAMkmV6usdnrL8ppbS8qePQV,https://drive.google.com/file/d/1JCZrzP4ZAAMkm...
75,2019,840,140,180,160,180,180,1YrWMfyafkwtj7RWuXCaAKaqsFjU3Oofu,https://drive.google.com/file/d/1YrWMfyafkwtj7...


In [16]:
def parse_fuvest_essays(filetree, base_view_url, columns):
    """Build a table of Fuvest essays from a year -> files tree.

    The mark is the part of the filename before the first ``'_'`` and before
    the first ``'-'``, with the decimal comma turned into a dot and surrounding
    whitespace removed (``'33 _310.jpg'`` -> ``'33'``,
    ``'38,5-Red64 (Small).png'`` -> ``'38.5'``).

    Args:
        filetree: Mapping of year folder name to a mapping of
            filename -> Drive item dict.
        base_view_url: Format string with one ``{}`` placeholder for the id.
        columns: Column names for the resulting frame.

    Returns:
        pandas.DataFrame with rows ``[year, mark, drive_id, url]`` for every
        image/PDF file, after ``convert_dtypes()``.
    """
    data = []
    for year, file_dict in filetree.items():
        # One file ('50.pdf') was misplaced at the year level; keep skipping it
        # by name and also skip any other entry that is not a folder: a failed
        # listing (None) or a file resource (has a string mimeType).
        if year == '50.pdf':
            continue
        if not isinstance(file_dict, dict) or isinstance(file_dict.get('mimeType'), str):
            continue

        for filename, drive_item in file_dict.items():
            if not is_essay_file(drive_item):
                continue

            raw_mark = filename.split('_')[0].split('-')[0]
            mark = raw_mark.replace(',', '.').strip()
            file_view_url = base_view_url.format(drive_item['id'])
            data.append([year, mark, drive_item['id'], file_view_url])

    return pd.DataFrame(data=data, columns=columns).convert_dtypes()

In [17]:
# Parse the Fuvest subtree into a DataFrame with the FUVEST_COLS layout.
fuvest_parsed = parse_fuvest_essays(fuvest_tree, DRIVE_VIEW_BASE_URL, FUVEST_COLS)
# Bare last expression so the notebook renders the frame.
fuvest_parsed

Unnamed: 0,ano,nota,drive_id,url
0,2018,36.5,1ThLLNWnUHgGq55ElFVUFVysSem8o83HY,https://drive.google.com/file/d/1ThLLNWnUHgGq5...
1,2018,33,1J6c8CNdx6Yy1ye0gK8DLo1Oo3yUL3DLK,https://drive.google.com/file/d/1J6c8CNdx6Yy1y...
2,2020,38.5,1pJ5_z-00dpX4mbsPSzUsLX4pKFuWE434,https://drive.google.com/file/d/1pJ5_z-00dpX4m...
3,2020,42.0,1p2WISpPRTiRxxb8uzyyf3Qbkd2OgkaiS,https://drive.google.com/file/d/1p2WISpPRTiRxx...
4,2020,43.0,1y0lhK2DInBrNr29fNcmR1jUo3SZKhjZU,https://drive.google.com/file/d/1y0lhK2DInBrNr...
...,...,...,...,...
127,2019,40.0,1QxwnC7lRJNdWG8VHcgGIFiwEgvumglDs,https://drive.google.com/file/d/1QxwnC7lRJNdWG...
128,2019,38.0,1W7ThT5NwhBw0S7ZKrt5Q0r3P0yKAkc2B,https://drive.google.com/file/d/1W7ThT5NwhBw0S...
129,2019,47.0,1B9TUmIT_23DROVS7LDyaxKV7nGgY4g7W,https://drive.google.com/file/d/1B9TUmIT_23DRO...
130,2019,35.0,1_CvW_Kz4x0ROdP0DQcSYh_YGtjfRaA9F,https://drive.google.com/file/d/1_CvW_Kz4x0ROd...


In [18]:
# Write both parsed tables to CSV, at paths relative to the notebook's location.
# NOTE(review): index=None is falsy, so presumably the row index is omitted;
# index=False is the documented spelling - confirm before changing.
enem_parsed.to_csv('../../data/4_final/redacoes/enem/2020.csv', index=None)
fuvest_parsed.to_csv('../../data/4_final/redacoes/fuvest/2020.csv', index=None)

## Experimentation

In [6]:
def drive_api_key():
    """Print the contents of a fixed Drive folder, one subfolder level deep.

    Builds a Drive v3 client from an API key, lists the items whose parent is
    the hard-coded folder id and prints each item's name, id and MIME type.
    For every item that is itself a folder, its direct children are listed and
    printed the same way. Nothing is returned.

    ``HttpError`` is caught and printed instead of propagating.
    """
    import os

    def _print_item(item):
        # Two lines per item: "name (id)" followed by the MIME type.
        print(u'{0} ({1})'.format(item['name'], item['id']))
        print(item['mimeType'])

    try:
        # NOTE(review): the fallback key is committed in this notebook; prefer
        # supplying one through the DRIVE_API_KEY environment variable.
        api_key = (os.environ.get('DRIVE_API_KEY')
                   or 'AIzaSyBU7WIsMv5LvRD3xt8ZnV5fqfMY40iEHS8')
        service = build('drive', 'v3', developerKey=api_key)

        # Call the Drive v3 API
        folder_id = '1NxxkhOfH4eB0v6E0leGtG5hHo2amYKmG'
        query = f"'{folder_id}' in parents"
        results = service.files().list(q=query).execute()
        items = results.get('files')

        if not items:
            print('No files found.')
            return
        print('Files:')
        print(type(items))
        for item in items:
            _print_item(item)
            if item['mimeType'] == 'application/vnd.google-apps.folder':
                new_result = service.files().list(q=f"'{item['id']}' in parents").execute()
                # A response without 'files' is treated as an empty folder
                # instead of failing on iteration over None.
                for new_item in new_result.get('files') or []:
                    _print_item(new_item)

    except HttpError as error:
        # TODO(developer) - Handle errors from drive API.
        print(f'An error occurred:\n\n{error}')

In [7]:
# Run the API-key listing experiment; it prints its findings and returns nothing.
drive_api_key()

Files:
<class 'list'>
Fuvest (14GBnussetkKp_PUjjyXOM1Z9G4-KEyCE)
application/vnd.google-apps.folder
50.pdf (1SziOtl8xeATY3cfA2VXf9NRLK0-b3ENg)
application/pdf
2018 (1INWnsAKSZH-AcRB9gHCyRbxBbozvUIs1)
application/vnd.google-apps.folder
2020 (1Dxc8x7ogmfFT4RxMOcGqhb0DCgDQa1Ei)
application/vnd.google-apps.folder
2019 (1Bf0BQ1OCwiZFYJVAE1ua-8CPVaOrcpx_)
application/vnd.google-apps.folder
Enem (1pPdte9umchy4HdEyFf7IbLHeTNerWXP2)
application/vnd.google-apps.folder
2020 (1HeGrZN09iPYrhmn-ZOcCWhMzuxuA2pPO)
application/vnd.google-apps.folder
2018.1 (1ypayvnq4pjEHObVmZNuVqKwg51MsnBiV)
application/vnd.google-apps.folder
2016 (1IiDtYdqpX5hoB_zMUU47A-NTH1X-IicD)
application/vnd.google-apps.folder
2018 (1drPlWbQV4MsCXGVIRKmFxsnSSULJrW3X)
application/vnd.google-apps.folder
2019.1 (1wRcsWQwgoJNWJNEZ0MAi1GKI7VHtVFqo)
application/vnd.google-apps.folder
2019 (1rV403v8H6lyf6XAQvpZcPZOOXfo1k2mC)
application/vnd.google-apps.folder
2020.1 (1rirpbY1EVdfHeh0bms2uClf6hPJX7z2Z)
application/vnd.google-apps.folder

In [88]:
def drive_service(file_id='1-H2egFbxXfJPM2EifumTIJMwQH9Q5DNU',
                  output_path='example.pdf',
                  service_account_file='service.json'):
    """Download one Drive file to disk using service-account credentials.

    Progress is printed as an integer percentage after every chunk.
    ``HttpError`` is caught and printed instead of propagating.

    Args:
        file_id: Drive id of the file to download.
        output_path: Local path the content is written to (opened in ``'wb'``
            mode, so an existing file is overwritten).
        service_account_file: Path to the service-account JSON key file.

    Returns:
        None.
    """
    # Imported here because the notebook's import cell does not provide it.
    from googleapiclient.http import MediaIoBaseDownload

    # NOTE(review): full Drive scope is requested although this function only
    # downloads; a read-only scope may be enough - confirm before narrowing.
    SCOPES = ['https://www.googleapis.com/auth/drive']
    credentials = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=SCOPES)

    try:
        service = build('drive', 'v3', credentials=credentials)

        request = service.files().get_media(fileId=file_id)
        with open(output_path, 'wb') as output:
            downloader = MediaIoBaseDownload(output, request)
            done = False
            while not done:
                # Each call fetches the next chunk and reports overall progress.
                status, done = downloader.next_chunk()
                print(f"Download {int(status.progress() * 100)}")

    except HttpError as error:
        # TODO(developer) - Handle errors from drive API.
        print(f'An error occurred: {error}')

In [89]:
# Run the service-account download experiment; progress is printed per chunk.
drive_service()

Download 100
