In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sa
from datetime import datetime

from collections import defaultdict, OrderedDict

from simple_salesforce import Salesforce
from os import environ
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
logfile = open("TableETL.log","w")
def lprint(val):
    outstr = "[%s] %s" % (datetime.now(), val)
    print(outstr)
    logfile.write(outstr)
    logfile.write("\n")
    logfile.flush()
    
lprint("Started...")

[2023-07-03 17:15:33.375954] Started...


In [3]:
sfusername = environ.get('sfusername')
sfpassword = environ.get('sfpassword')
sfsecret = environ.get("sfsecret")
sfinstanceurl = environ.get("sfinstanceurl")
connstr = environ.get("connstr")

In [4]:
lprint("Creating engine for %s" % connstr)
engine = sa.create_engine(connstr)

[2023-07-03 17:15:33.404181] Creating engine for mssql+pyodbc://KippNOLA


In [5]:
desiredTables = [
    'ContactContactRelation',
    'AccountContactRelation',
    'AcademicTermEnrollment',
    'Account',
    'Contact'
]

In [6]:
soqlFilters = defaultdict(lambda: "where IsDeleted = false", {
    'AcademicTermEnrollment':'where Active__c = true and IsDeleted = false'
})

In [7]:
lprint("Creating sessions...")
sf = Salesforce(username=sfusername, password=sfpassword, security_token=sfsecret, instance_url=sfinstanceurl, version='57.0')
lprint("Session created!")

[2023-07-03 17:15:33.480092] Creating sessions...
[2023-07-03 17:15:34.033528] Session created!


In [8]:
metaData = {}

for tbl in desiredTables:
    fieldDescs = {}
    
    lprint("Getting metadata for %s" % tbl)
    
    tblDesc = getattr(sf, tbl).describe()
    
    for field in tblDesc['fields']:
        fieldDescs[field['name']] = {
                                'type':field['type'],
                                'length':field['length']
        }
        
    
    metaData[tbl] = fieldDescs
    


[2023-07-03 17:15:34.040537] Getting metadata for ContactContactRelation
[2023-07-03 17:15:34.559842] Getting metadata for AccountContactRelation


In [9]:
outputData = {}

for tbl in desiredTables:
    lprint("Querying data for %s" % tbl)
    
    feilds = ", ".join(metaData[tbl].keys())
    
    soql = "select %s from %s %s" % (feilds, tbl, soqlFilters[tbl])
    
    
    lprint("Starting query:  %s" % soql)
    resp = sf.query_all(soql)
    
    outputData[tbl] = resp
    
    lprint("Finished %s" % tbl)
    

[2023-07-03 17:15:34.717371] Querying data for ContactContactRelation
[2023-07-03 17:15:34.717371] Starting query:  select Id, OwnerId, IsDeleted, Name, CreatedDate, CreatedById, LastModifiedDate, LastModifiedById, SystemModstamp, LastViewedDate, LastReferencedDate, ContactId, RelatedContactId, RelatedInverseRecordId, StartDate, EndDate, IsActive, PartyRoleRelationId, HierarchyType, Include_Role_in_Sharing__c, Related_Contact_School_Id__c, Related_Person_Name__c, Related_Person_Role__c, Legacy_Id_A__c, Legacy_Id_B__c, Person__c, Related_Person__c, Updated_By_Reciprocal__c, Related_Contact_School__c, Related_Person_Grade_Level__c, Is_Twin__c from ContactContactRelation where IsDeleted = false
[2023-07-03 17:15:34.848835] Finished ContactContactRelation
[2023-07-03 17:15:34.848835] Querying data for AccountContactRelation
[2023-07-03 17:15:34.848835] Starting query:  select Id, AccountId, ContactId, Roles, IsDirect, IsPrimaryMember, IsActive, StartDate, EndDate, IsDeleted, IsPrimaryGroup

In [10]:
for tbl in outputData.keys():
    lprint("%s totalRecords %d" % (tbl, outputData[tbl]['totalSize']))

[2023-07-03 17:15:39.807397] ContactContactRelation totalRecords 0
[2023-07-03 17:15:39.807397] AccountContactRelation totalRecords 0


In [11]:
lprint("Turning output data into dataframes")
dataFrames = {tbl:pd.DataFrame.from_dict(outputData[tbl]['records']) for tbl in desiredTables}

for key in dataFrames.keys():
    
    
    if 'attributes' in dataFrames[key].columns:
        
        lprint("Dropping attributes from %s" % tbl)
        dataFrames[key].drop(columns=['attributes'], inplace=True)
    
    lprint("%s shape %s" % (key, dataFrames[key].shape))

[2023-07-03 17:15:39.817317] Turning output data into dataframes
[2023-07-03 17:15:39.820935] ContactContactRelation shape (0, 0)
[2023-07-03 17:15:39.821158] AccountContactRelation shape (0, 0)


In [12]:
for key in dataFrames.keys():
    
    lprint("Converting datetimes for %s..." % key)
    
    for col in dataFrames[key].columns:
        if metaData[key][col]['type'] in ['date', 'datetime']:
            
            lprint("Converting %s to datetime..." % col)
            
            dataFrames[key][col] = pd.to_datetime(dataFrames[key][col])
            
    
    

[2023-07-03 17:15:39.832726] Converting datetimes for ContactContactRelation...
[2023-07-03 17:15:39.832726] Converting datetimes for AccountContactRelation...


In [13]:
for tbl in dataFrames.keys():
    for col in dataFrames[tbl].columns:
        
        if np.count_nonzero(dataFrames[tbl][col].map(type) == OrderedDict) > 0:
            lprint("Ordered dict found in %s column %s converting..." % (tbl, col)) 
            dataFrames[tbl][col] = dataFrames[tbl][col].astype(str)
            
            maxLen = dataFrames[tbl][col].str.len().max()
            
            lprint("Updating length for %s column %s to %d..." % (tbl, col, maxLen)) 
            
            metaData[tbl][col]['length'] = maxLen

In [14]:
staticFields = {
                 'boolean':sa.Boolean,
                 'date':sa.DATE,
                 'datetime':sa.DATETIME,
                 'double': sa.FLOAT,
                 #'email',
                 #'id',
                 'int':sa.INT,
                 #'multipicklist',
                 #'picklist',
                 #'reference',
                 #'string',
                 'textarea':sa.TEXT
}

def getSQLTypes(tbl):
    
    sqlTypes = {}
    
    lprint("Getting SQLTypes for %s" % tbl)
    
    curMeta = metaData[tbl]
        
    for field in curMeta.keys():
        
        if curMeta[field]['type'] in staticFields.keys():
            sqlTypes[field] = staticFields[curMeta[field]['type']]()
            
        else:
            fieldLen = curMeta[field]['length']
            
            if fieldLen <= 255:
                sqlTypes[field] = sa.NVARCHAR(fieldLen)
            
            #this is a fix they set some of the custom field max values to weird stuff
            elif np.count_nonzero(~pd.isna(dataFrames[tbl][field])) > 0 \
                            and ( fieldLen := int(dataFrames[tbl][field].str.len().max())) <= 255:
                sqlTypes[field] = sa.NVARCHAR(fieldLen)                
                
            else:
                sqlTypes[field] = sa.TEXT()
            
    return sqlTypes
            
#getSQLTypes('AcademicTermEnrollment')


In [15]:
for tbl in desiredTables:
    csvTblName = "SalesForceEduCloud_%s.csv" % tbl
    lprint("Saving %s to %s" % (tbl, csvTblName))

    dataFrames[tbl].to_csv(csvTblName, index=False)

    lprint("Finished uploading %s!" % tbl)

[2023-07-03 17:15:39.861780] Saving ContactContactRelation to SalesForceEduCloud_ContactContactRelation.csv
[2023-07-03 17:15:39.868907] Finished uploading ContactContactRelation!
[2023-07-03 17:15:39.869904] Saving AccountContactRelation to SalesForceEduCloud_AccountContactRelation.csv
[2023-07-03 17:15:39.870963] Finished uploading AccountContactRelation!


In [20]:
with engine.connect() as conn:

    for tbl in desiredTables:
        
        if dataFrames[tbl].shape[0] == 0:
            lprint("Skipping %s no data!" % tbl)
            continue
        
        sqlTypes = getSQLTypes(tbl)
        
        
        sqlTblName = "SalesForceEduCloud_%s" % tbl
        lprint("Uploading table %s to %s/%s" % (tbl, engine.url, sqlTblName))
        
        dataFrames[tbl].to_sql(sqlTblName, conn, schema='dbo', if_exists='replace', index=False, dtype=sqlTypes)
        
        lprint("Finished uploading %s!" % tbl)

[2023-07-03 17:16:23.994081] Skipping ContactContactRelation no data!


ValueError: I/O operation on closed file.

In [17]:
lprint("=============DONE!===================")



In [18]:
logfile.close()

In [19]:
print("Log file closed")

Log file closed
