In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sa
from datetime import datetime

from collections import defaultdict, OrderedDict

from simple_salesforce import Salesforce
from os import environ
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
logfile = open("TableETL.log","w")
def lprint(val):
    outstr = "[%s] %s" % (datetime.now(), val)
    print(outstr)
    logfile.write(outstr)
    logfile.write("\n")
    logfile.flush()
    
lprint("Started...")

[2023-07-05 09:55:14.533178] Started...


In [3]:
sfusername = environ.get('sfusername')
sfpassword = environ.get('sfpassword')
sfsecret = environ.get("sfsecret")
sfinstanceurl = environ.get("sfinstanceurl")
connstr = environ.get("connstr")

In [4]:
lprint("Creating engine for %s" % connstr)
engine = sa.create_engine(connstr)

[2023-07-05 09:55:14.550112] Creating engine for mssql+pyodbc://KippNOLA


In [5]:
desiredTables = [
    'School_Program__c',
    'ConstituentRole',
    'ContactProfile',
    'Grade_Level__c',
    'Address',
    'AcademicTerm',
    'AcademicYear',
    'Display_School__c',
    'ContactContactRelation',
    'AccountContactRelation',
    'AccountContactRole',
    'AcademicTermEnrollment',
    'Account',
    'Contact'
]

In [6]:
soqlFilters = defaultdict(lambda: "where IsDeleted = false", {
    'AcademicTermEnrollment':'where Active__c = true and IsDeleted = false'
})

In [7]:
lprint("Creating sessions...")
sf = Salesforce(username=sfusername, password=sfpassword, security_token=sfsecret, instance_url=sfinstanceurl, version='57.0')
lprint("Session created!")

[2023-07-05 09:55:14.606185] Creating sessions...
[2023-07-05 09:55:15.154272] Session created!


In [8]:
metaData = {}

for tbl in desiredTables:
    fieldDescs = {}
    
    lprint("Getting metadata for %s" % tbl)
    
    tblDesc = getattr(sf, tbl).describe()
    
    for field in tblDesc['fields']:
        fieldDescs[field['name']] = {
                                'type':field['type'],
                                'length':field['length']
        }
        
    
    metaData[tbl] = fieldDescs
    


[2023-07-05 09:55:15.162219] Getting metadata for School_Program__c
[2023-07-05 09:55:15.702276] Getting metadata for ConstituentRole
[2023-07-05 09:55:15.905742] Getting metadata for ContactProfile
[2023-07-05 09:55:16.102370] Getting metadata for Grade_Level__c
[2023-07-05 09:55:16.353444] Getting metadata for Address
[2023-07-05 09:55:16.628416] Getting metadata for AcademicTerm
[2023-07-05 09:55:16.823734] Getting metadata for AcademicYear
[2023-07-05 09:55:17.022027] Getting metadata for Display_School__c
[2023-07-05 09:55:17.268002] Getting metadata for AccountContactRole


In [9]:
outputData = {}

for tbl in desiredTables:
    lprint("Querying data for %s" % tbl)
    
    feilds = ", ".join(metaData[tbl].keys())
    
    soql = "select %s from %s %s" % (feilds, tbl, soqlFilters[tbl])
    
    
    lprint("Starting query:  %s" % soql)
    resp = sf.query_all(soql)
    
    outputData[tbl] = resp
    
    lprint("Finished %s" % tbl)
    

[2023-07-05 09:55:17.489007] Querying data for School_Program__c
[2023-07-05 09:55:17.489007] Starting query:  select Id, OwnerId, IsDeleted, Name, CreatedDate, CreatedById, LastModifiedDate, LastModifiedById, SystemModstamp, LastActivityDate, LastViewedDate, LastReferencedDate, School__c, Display_School__c, Legacy_Id__c, Public_Group_Id__c, Address__c, City__c, State__c, Street__c, Zip_Postal_Code__c, MIN_Current_Grades__c, MAX_Current_Grades__c, Auto_Grade_Range__c from School_Program__c where IsDeleted = false
[2023-07-05 09:55:17.711311] Finished School_Program__c
[2023-07-05 09:55:17.711311] Querying data for ConstituentRole
[2023-07-05 09:55:17.711311] Starting query:  select Id, OwnerId, IsDeleted, Name, CreatedDate, CreatedById, LastModifiedDate, LastModifiedById, SystemModstamp, LastViewedDate, LastReferencedDate, PersonId, Description, RoleType, Status, EffectiveStartDate, EffectiveEndDate, ContextRecordId from ConstituentRole where IsDeleted = false
[2023-07-05 09:55:17.8747

In [10]:
for tbl in outputData.keys():
    lprint("%s totalRecords %d" % (tbl, outputData[tbl]['totalSize']))

[2023-07-05 09:55:19.753063] School_Program__c totalRecords 12
[2023-07-05 09:55:19.754100] ConstituentRole totalRecords 0
[2023-07-05 09:55:19.754100] ContactProfile totalRecords 0
[2023-07-05 09:55:19.754100] Grade_Level__c totalRecords 62
[2023-07-05 09:55:19.755058] Address totalRecords 0
[2023-07-05 09:55:19.755058] AcademicTerm totalRecords 18
[2023-07-05 09:55:19.756052] AcademicYear totalRecords 2
[2023-07-05 09:55:19.758098] Display_School__c totalRecords 10
[2023-07-05 09:55:19.759044] AccountContactRole totalRecords 0


In [11]:
lprint("Turning output data into dataframes")
dataFrames = {tbl:pd.DataFrame.from_dict(outputData[tbl]['records']) for tbl in desiredTables}

for key in dataFrames.keys():
    
    
    if 'attributes' in dataFrames[key].columns:
        
        lprint("Dropping attributes from %s" % tbl)
        dataFrames[key].drop(columns=['attributes'], inplace=True)
    
    lprint("%s shape %s" % (key, dataFrames[key].shape))

[2023-07-05 09:55:19.772044] Turning output data into dataframes
[2023-07-05 09:55:19.796197] Dropping attributes from AccountContactRole
[2023-07-05 09:55:19.799322] School_Program__c shape (12, 24)
[2023-07-05 09:55:19.800230] ConstituentRole shape (0, 0)
[2023-07-05 09:55:19.800230] ContactProfile shape (0, 0)
[2023-07-05 09:55:19.800230] Dropping attributes from AccountContactRole
[2023-07-05 09:55:19.801279] Grade_Level__c shape (62, 91)
[2023-07-05 09:55:19.801279] Address shape (0, 0)
[2023-07-05 09:55:19.801279] Dropping attributes from AccountContactRole
[2023-07-05 09:55:19.802256] AcademicTerm shape (18, 20)
[2023-07-05 09:55:19.802256] Dropping attributes from AccountContactRole
[2023-07-05 09:55:19.802256] AcademicYear shape (2, 13)
[2023-07-05 09:55:19.803251] Dropping attributes from AccountContactRole
[2023-07-05 09:55:19.803251] Display_School__c shape (10, 110)
[2023-07-05 09:55:19.803251] AccountContactRole shape (0, 0)


In [12]:
for key in dataFrames.keys():
    
    lprint("Converting datetimes for %s..." % key)
    
    for col in dataFrames[key].columns:
        if metaData[key][col]['type'] in ['date', 'datetime']:
            
            lprint("Converting %s to datetime..." % col)
            
            dataFrames[key][col] = pd.to_datetime(dataFrames[key][col])
            
    
    

[2023-07-05 09:55:19.808242] Converting datetimes for School_Program__c...
[2023-07-05 09:55:19.808242] Converting CreatedDate to datetime...
[2023-07-05 09:55:19.810201] Converting LastModifiedDate to datetime...
[2023-07-05 09:55:19.811232] Converting SystemModstamp to datetime...
[2023-07-05 09:55:19.812195] Converting LastActivityDate to datetime...
[2023-07-05 09:55:19.813193] Converting LastViewedDate to datetime...
[2023-07-05 09:55:19.814193] Converting LastReferencedDate to datetime...
[2023-07-05 09:55:19.815187] Converting datetimes for ConstituentRole...
[2023-07-05 09:55:19.816199] Converting datetimes for ContactProfile...
[2023-07-05 09:55:19.816199] Converting datetimes for Grade_Level__c...
[2023-07-05 09:55:19.816199] Converting CreatedDate to datetime...
[2023-07-05 09:55:19.819176] Converting LastModifiedDate to datetime...
[2023-07-05 09:55:19.820174] Converting SystemModstamp to datetime...
[2023-07-05 09:55:19.821173] Converting LastActivityDate to datetime...
[2

In [13]:
for tbl in dataFrames.keys():
    for col in dataFrames[tbl].columns:
        
        if np.count_nonzero(dataFrames[tbl][col].map(type) == OrderedDict) > 0:
            lprint("Ordered dict found in %s column %s converting..." % (tbl, col)) 
            dataFrames[tbl][col] = dataFrames[tbl][col].astype(str)
            
            maxLen = dataFrames[tbl][col].str.len().max()
            
            lprint("Updating length for %s column %s to %d..." % (tbl, col, maxLen)) 
            
            metaData[tbl][col]['length'] = maxLen

In [14]:
staticFields = {
                 'boolean':sa.Boolean,
                 'date':sa.DATE,
                 'datetime':sa.DATETIME,
                 'double': sa.FLOAT,
                 #'email',
                 #'id',
                 'int':sa.INT,
                 #'multipicklist',
                 #'picklist',
                 #'reference',
                 #'string',
                 'textarea':sa.TEXT
}

def getSQLTypes(tbl):
    
    sqlTypes = {}
    
    lprint("Getting SQLTypes for %s" % tbl)
    
    curMeta = metaData[tbl]
        
    for field in curMeta.keys():
        
        if curMeta[field]['type'] in staticFields.keys():
            sqlTypes[field] = staticFields[curMeta[field]['type']]()
            
        else:
            fieldLen = curMeta[field]['length']
            
            if fieldLen <= 255:
                sqlTypes[field] = sa.NVARCHAR(fieldLen)
            
            #this is a fix they set some of the custom field max values to weird stuff
            elif np.count_nonzero(~pd.isna(dataFrames[tbl][field])) > 0 \
                            and ( fieldLen := int(dataFrames[tbl][field].str.len().max())) <= 255:
                sqlTypes[field] = sa.NVARCHAR(fieldLen)                
                
            else:
                sqlTypes[field] = sa.TEXT()
            
    return sqlTypes
            
#getSQLTypes('AcademicTermEnrollment')


In [15]:
for tbl in desiredTables:
    csvTblName = "SalesForceEduCloud_%s.csv" % tbl
    lprint("Saving %s to %s" % (tbl, csvTblName))

    dataFrames[tbl].to_csv(csvTblName, index=False)

    lprint("Finished uploading %s!" % tbl)

[2023-07-05 09:55:19.947449] Saving School_Program__c to SalesForceEduCloud_School_Program__c.csv
[2023-07-05 09:55:19.960911] Finished uploading School_Program__c!
[2023-07-05 09:55:19.960911] Saving ConstituentRole to SalesForceEduCloud_ConstituentRole.csv
[2023-07-05 09:55:19.961805] Finished uploading ConstituentRole!
[2023-07-05 09:55:19.962802] Saving ContactProfile to SalesForceEduCloud_ContactProfile.csv
[2023-07-05 09:55:19.963799] Finished uploading ContactProfile!
[2023-07-05 09:55:19.963799] Saving Grade_Level__c to SalesForceEduCloud_Grade_Level__c.csv
[2023-07-05 09:55:19.974886] Finished uploading Grade_Level__c!
[2023-07-05 09:55:19.975705] Saving Address to SalesForceEduCloud_Address.csv
[2023-07-05 09:55:19.976750] Finished uploading Address!
[2023-07-05 09:55:19.976750] Saving AcademicTerm to SalesForceEduCloud_AcademicTerm.csv
[2023-07-05 09:55:19.980787] Finished uploading AcademicTerm!
[2023-07-05 09:55:19.982184] Saving AcademicYear to SalesForceEduCloud_Academic

In [16]:
with engine.connect() as conn:

    for tbl in desiredTables:
        
        if dataFrames[tbl].shape[0] == 0:
            lprint("Skipping %s no data!" % tbl)
            continue
        
        sqlTypes = getSQLTypes(tbl)
        
        
        sqlTblName = "SalesForceEduCloud_%s" % tbl
        lprint("Uploading table %s to %s/%s" % (tbl, engine.url, sqlTblName))
        
        dataFrames[tbl].to_sql(sqlTblName, conn, schema='dbo', if_exists='replace', index=False, dtype=sqlTypes)
        
        lprint("Finished uploading %s!" % tbl)

[2023-07-05 09:55:25.142765] Getting SQLTypes for School_Program__c
[2023-07-05 09:55:25.161401] Uploading table School_Program__c to mssql+pyodbc://KippNOLA/SalesForceEduCloud_School_Program__c
[2023-07-05 09:55:28.021291] Finished uploading School_Program__c!
[2023-07-05 09:55:28.022287] Skipping ConstituentRole no data!
[2023-07-05 09:55:28.022287] Skipping ContactProfile no data!
[2023-07-05 09:55:28.023568] Getting SQLTypes for Grade_Level__c
[2023-07-05 09:55:28.034917] Uploading table Grade_Level__c to mssql+pyodbc://KippNOLA/SalesForceEduCloud_Grade_Level__c
[2023-07-05 09:55:42.140513] Finished uploading Grade_Level__c!
[2023-07-05 09:55:42.141736] Skipping Address no data!
[2023-07-05 09:55:42.142546] Getting SQLTypes for AcademicTerm
[2023-07-05 09:55:42.143544] Uploading table AcademicTerm to mssql+pyodbc://KippNOLA/SalesForceEduCloud_AcademicTerm
[2023-07-05 09:55:44.779302] Finished uploading AcademicTerm!
[2023-07-05 09:55:44.780324] Getting SQLTypes for AcademicYear
[20

In [17]:
lprint("=============DONE!===================")



In [18]:
logfile.close()

In [19]:
print("Log file closed")

Log file closed
