In [1]:
import pandas as pd
import numpy as np
import sqlalchemy as sa
from datetime import datetime

from collections import defaultdict, OrderedDict

from simple_salesforce import Salesforce
from os import environ
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
sfusername = environ.get('sfusername')
sfpassword = environ.get('sfpassword')
sfsecret = environ.get("sfsecret")
sfinstanceurl = environ.get("sfinstanceurl")
connstr = environ.get("connstr")

In [None]:
lprint("Creating engine for %s" % connstr)
engine = sa.create_engine(connstr)

In [3]:
desiredTables = [
    #'AcademicTermEnrollment',
    'Account',
    'Contact'
]

In [4]:
soqlFilters = defaultdict(lambda: "where IsDeleted = false", {
    'AcademicTermEnrollment':'where Active__c = true and IsDeleted = false'
})

In [None]:
logfile = open("TableETL.log","w")
def lprint(val):
    outstr = "[%s] %s" % (datetime.now(), val)
    print(outstr)
    logfile.write(outstr)
    logfile.write("\n")
    logfile.flush()
    
lprint("Started...")

In [7]:
lprint("Creating sessions...")
sf = Salesforce(username=sfusername, password=sfpassword, security_token=sfsecret, instance_url=sfinstanceurl, version='57.0')
lprint("Session created!")

[2023-06-30 11:53:31.223830] Creating sessions...
[2023-06-30 11:53:31.862540] Session created!


In [8]:
metaData = {}

for tbl in desiredTables:
    fieldDescs = {}
    
    lprint("Getting metadata for %s" % tbl)
    
    tblDesc = getattr(sf, tbl).describe()
    
    for field in tblDesc['fields']:
        fieldDescs[field['name']] = {
                                'type':field['type'],
                                'length':field['length']
        }
        
    
    metaData[tbl] = fieldDescs
    


[2023-06-30 11:53:31.881475] Getting metadata for Account
[2023-06-30 11:53:32.645309] Getting metadata for Contact


In [9]:
outputData = {}

for tbl in desiredTables:
    lprint("Querying data for %s" % tbl)
    
    feilds = ", ".join(metaData[tbl].keys())
    
    soql = "select %s from %s %s" % (feilds, tbl, soqlFilters[tbl])
    
    
    lprint("Starting query:  %s" % soql)
    resp = sf.query_all(soql)
    
    outputData[tbl] = resp
    
    lprint("Finished %s" % tbl)
    

[2023-06-30 11:53:32.998278] Querying data for Account
[2023-06-30 11:53:32.998278] Starting query:  select Id, IsDeleted, MasterRecordId, Name, LastName, FirstName, Salutation, Type, RecordTypeId, ParentId, BillingStreet, BillingCity, BillingState, BillingPostalCode, BillingCountry, BillingLatitude, BillingLongitude, BillingGeocodeAccuracy, BillingAddress, ShippingStreet, ShippingCity, ShippingState, ShippingPostalCode, ShippingCountry, ShippingLatitude, ShippingLongitude, ShippingGeocodeAccuracy, ShippingAddress, Phone, Fax, Website, PhotoUrl, Industry, AnnualRevenue, NumberOfEmployees, Description, OwnerId, CreatedDate, CreatedById, LastModifiedDate, LastModifiedById, SystemModstamp, LastActivityDate, LastViewedDate, LastReferencedDate, IsCustomerPortal, PersonContactId, IsPersonAccount, PersonMailingStreet, PersonMailingCity, PersonMailingState, PersonMailingPostalCode, PersonMailingCountry, PersonMailingLatitude, PersonMailingLongitude, PersonMailingGeocodeAccuracy, PersonMailingA

In [10]:
for tbl in outputData.keys():
    lprint("%s totalRecords %d" % (tbl, outputData[tbl]['totalSize']))

[2023-06-30 11:53:53.498476] Account totalRecords 7242
[2023-06-30 11:53:53.498476] Contact totalRecords 7234


In [11]:
lprint("Turning output data into dataframes")
dataFrames = {tbl:pd.DataFrame.from_dict(outputData[tbl]['records']) for tbl in desiredTables}

for key in dataFrames.keys():
    
    
    if 'attributes' in dataFrames[key].columns:
        
        lprint("Dropping attributes from %s" % tbl)
        dataFrames[key].drop(columns=['attributes'], inplace=True)
    
    lprint("%s shape %s" % (key, dataFrames[key].shape))

[2023-06-30 11:53:53.507717] Turning output data into dataframes
[2023-06-30 11:53:54.008245] Dropping attributes from Contact
[2023-06-30 11:53:54.024208] Account shape (7242, 142)
[2023-06-30 11:53:54.025371] Dropping attributes from Contact
[2023-06-30 11:53:54.037172] Contact shape (7234, 91)


In [12]:
for key in dataFrames.keys():
    
    lprint("Converting datetimes for %s..." % key)
    
    for col in dataFrames[key].columns:
        if metaData[key][col]['type'] in ['date', 'datetime']:
            
            lprint("Converting %s to datetime..." % col)
            
            dataFrames[key][col] = pd.to_datetime(dataFrames[key][col])
            
    
    

[2023-06-30 11:53:54.044148] Converting datetimes for Account...
[2023-06-30 11:53:54.044148] Converting CreatedDate to datetime...
[2023-06-30 11:53:54.072731] Converting LastModifiedDate to datetime...
[2023-06-30 11:53:54.093175] Converting SystemModstamp to datetime...
[2023-06-30 11:53:54.113481] Converting LastActivityDate to datetime...
[2023-06-30 11:53:54.128502] Converting LastViewedDate to datetime...
[2023-06-30 11:53:54.142112] Converting LastReferencedDate to datetime...
[2023-06-30 11:53:54.156108] Converting PersonBirthdate to datetime...
[2023-06-30 11:53:54.173228] Converting PersonLastCURequestDate to datetime...
[2023-06-30 11:53:54.187105] Converting PersonLastCUUpdateDate to datetime...
[2023-06-30 11:53:54.200640] Converting PersonEmailBouncedDate to datetime...
[2023-06-30 11:53:54.214532] Converting Date_Promotion_Decision_was_Entered__pc to datetime...
[2023-06-30 11:53:54.229496] Converting Re_Entry_Date__pc to datetime...
[2023-06-30 11:53:54.244163] Convert

In [13]:
for tbl in dataFrames.keys():
    for col in dataFrames[tbl].columns:
        
        if np.count_nonzero(dataFrames[tbl][col].map(type) == OrderedDict) > 0:
            lprint("Ordered dict found in %s column %s converting..." % (tbl, col)) 
            dataFrames[tbl][col] = dataFrames[tbl][col].astype(str)
            
            maxLen = dataFrames[tbl][col].str.len().max()
            
            lprint("Updating length for %s column %s to %d..." % (tbl, col, maxLen)) 
            
            metaData[tbl][col]['length'] = maxLen

[2023-06-30 11:53:54.476445] Ordered dict found in Account column BillingAddress converting...
[2023-06-30 11:53:54.493069] Updating length for Account column BillingAddress to 224...
[2023-06-30 11:53:54.505035] Ordered dict found in Account column ShippingAddress converting...
[2023-06-30 11:53:54.520969] Updating length for Account column ShippingAddress to 205...
[2023-06-30 11:53:54.574850] Ordered dict found in Account column PersonMailingAddress converting...
[2023-06-30 11:53:54.614705] Updating length for Account column PersonMailingAddress to 232...
[2023-06-30 11:53:54.728292] Ordered dict found in Contact column MailingAddress converting...
[2023-06-30 11:53:54.762440] Updating length for Contact column MailingAddress to 232...


In [14]:
staticFields = {
                 'boolean':sa.Boolean,
                 'date':sa.DATE,
                 'datetime':sa.DATETIME,
                 'double': sa.FLOAT,
                 #'email',
                 #'id',
                 'int':sa.INT,
                 #'multipicklist',
                 #'picklist',
                 #'reference',
                 #'string',
                 'textarea':sa.TEXT
}

def getSQLTypes(tbl):
    
    sqlTypes = {}
    
    lprint("Getting SQLTypes for %s" % tbl)
    
    curMeta = metaData[tbl]
        
    for field in curMeta.keys():
        
        if curMeta[field]['type'] in staticFields.keys():
            sqlTypes[field] = staticFields[curMeta[field]['type']]()
            
        else:
            fieldLen = curMeta[field]['length']
            
            if fieldLen <= 255:
                sqlTypes[field] = sa.NVARCHAR(fieldLen)
                
            else:
                sqlTypes[field] = sa.TEXT()
            
    return sqlTypes
            
#etSQLTypes('AcademicTermEnrollment')


In [15]:
for tbl in desiredTables:
    csvTblName = "SalesForceEduCloud_%s.csv" % tbl
    lprint("Saving %s to %s" % (tbl, csvTblName))

    dataFrames[tbl].to_csv(csvTblName, index=False)

    lprint("Finished uploading %s!" % tbl)

[2023-06-30 11:53:54.965966] Saving Account to SalesForceEduCloud_Account.csv
[2023-06-30 11:53:55.446372] Finished uploading Account!
[2023-06-30 11:53:55.446372] Saving Contact to SalesForceEduCloud_Contact.csv
[2023-06-30 11:53:55.798198] Finished uploading Contact!


In [None]:
with engine.connect() as conn:

    for tbl in desiredTables:
        
        sqlTypes = getSQLTypes(tbl)
        
        
        sqlTblName = "SalesForceEduCloud_%s" % tbl
        lprint("Uploading table %s to %s/%s" % (tbl, engine.url, sqlTblName))
        
        dataFrames[tbl].to_sql(sqlTblName, conn, schema='dbo', if_exists='replace', index=False, dtype=sqlTypes)
        
        lprint("Finished uploading %s!" % tbl)

[2023-06-30 11:54:03.018188] Getting SQLTypes for Account
[2023-06-30 11:54:03.019527] Uploading Account to mssql+pyodbc://KippNOLA/SalesForceEduCloud_Account


In [None]:
lprint("=============DONE!===================")

In [None]:
logfile.close()

In [None]:
print("Log file closed")