Run one time only after uncommenting the pip3 command.

In [None]:
# Install the pysftp package
#!pip3 install pysftp

Import modules

In [None]:
# Import the module for SFTP
import pysftp
from pathlib import Path
import os
import zipfile
import datetime
import time

Load credentials from the user directory

In [None]:
# Host name of the Gallup SFTP server.
sftp_host  = 'host1.gallup.com'

# Home directory as a string; Path.home() is supposed to work for both Win and Mac.
home = str(Path.home())
# The credentials file sits directly inside the home directory.
full_credentials_path = '/'.join([home, 'gallup_credentials.txt'])

# The first line of the file holds the username, the second line the password.
credential_lines = Path(full_credentials_path).read_text().split('\n')
username = credential_lines[0]
password = credential_lines[1]


Establish connection to SFTP server

In [None]:
# Accept any host key. See https://stackoverflow.com/questions/53864260/no-hostkey-for-host-found-when-connecting-to-sftp-server-with-pysftp-usi
# SECURITY NOTE: setting hostkeys to None turns off host key checking, so the identity of the
# server is not verified. Acceptable for experimentation only.
# NOTE: After running FileZilla, the host key is now stored in the known_hosts file, so there is probably a way to run this without
# the next two lines.
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None

# Connect to SFTP server
sftp = pysftp.Connection(host=sftp_host, username=username, password=password, cnopts=cnopts)
print("Connection succesfully established ... ")

# Everything that uses the connection runs inside try/finally so the connection is
# closed even when a download or extraction step raises.
try:
    # Switch to the remote output directory
    sftp.cwd('/out/')

    # Obtain the attributes (name, size, modification time) of the entries in the remote directory
    directory_structure = sftp.listdir_attr()

    # Set the current local working directory to the Downloads folder
    cwd = home + '/Downloads/gallup/'
    os.chdir(cwd)

    # Folder that receives the extracted contents (same for every file, so set once)
    gallup_box_directory = home + '/Library/CloudStorage/Box-Box/VU Gallup Microdata/current_download/'
    directory_to_extract_to = gallup_box_directory

    # NOTE: the [0:1] slice limits processing to the first directory entry only.
    for attr in directory_structure[0:1]:
        filename = attr.filename
        size = attr.st_size
        last_modified = attr.st_mtime
        # Convert the last modified time to a yyyy-mm-dd format
        last_modified = datetime.datetime.fromtimestamp(last_modified).strftime('%Y-%m-%d')
        # Convert the size from bytes to MB
        size = size / 1048576
        print('filename: ' + filename, ', size: ' + str(size), 'MB, last modified: ' + str(last_modified))

        #filename = '2018_11_19 Race Relations Survey.zip'
        extract_folder = filename[:-4] # remove the .zip extension

        print('Downloading ')
        # Save the start time
        start_time = time.time()
        # Download the file. File will be saved in the local computer current working directory
        sftp.get(filename)
        # Calculate the elapsed time
        elapsed_time = time.time() - start_time
        print('Download time: ' + str(elapsed_time) + ' seconds')

        # Unzip the downloaded file and put the contents in a subfolder of gallup_box_directory
        path_to_zip_file = cwd + filename

        print('Unzipping ')
        # Create the extract folder if it does not exist yet
        os.makedirs(directory_to_extract_to + extract_folder, exist_ok=True)

        with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
            zip_ref.extractall(directory_to_extract_to + extract_folder + '/')

        # Delete the zip file
        os.remove(path_to_zip_file)

        print()
finally:
    # Close connection
    sftp.close()

print('done')






In [None]:
# Close connection
# NOTE(review): the download cell before this one already calls sftp.close(); presumably this
# separate cell is kept so the connection can be closed by hand when that cell is
# interrupted before it finishes — confirm.
sftp.close()

`sftp.get()` hangs on large files (larger than about 5 MB). See https://stackoverflow.com/questions/12486623/paramiko-fails-to-download-large-files-1gb

```
The SFTP protocol doesn't have a way to stream file data; instead what it has is a way to request a block of data from a particular offset in an open file. The naive method of downloading a file would be to request the first block, write it to disk, then request the second block, and so forth. This is reliable, but very slow.

Instead, Paramiko has a performance trick it uses: when you call .get() it immediately sends a request for every block in the file, and it remembers what offset they're supposed to be written to. Then as each response arrives, it makes sure it gets written to the correct offset on-disk. For more information, see the SFTPFile.prefetch() and SFTPFile.readv() methods in the Paramiko documentation. I suspect the book-keeping information it stores when downloading your 1GB file might be causing... something to run out of resources, generating your "insufficient resources" message.

Rather than using .get(), if you just call .open() to get an SFTPFile instance, then call .read() on that object, or just hand it to the Python standard library function shutil.copyfileobj() to download the contents. That should avoid the Paramiko prefetch cache, and allow you to download the file even if it's not quite as fast.

def lazy_loading_ftp_file(sftp_host_conn, filename):
    """
        Download a remote file by streaming it, as a fallback for a plain sftp.get call.

        The remote file is opened and copied to a local file of the same name with
        shutil.copyfileobj instead of being fetched with get().

        :param sftp_host_conn: callable taking no arguments that returns a context manager;
            the object yielded by the context manager must provide open(filename, mode)
        :param filename: name of the remote file; also used as the name of the local file,
            which is written to the current directory
        :return: dict with the keys "status" ("sucess" or "failed") and "msg"; exceptions
            are not propagated, they are reported through the returned dict
    """
    import shutil
    from contextlib import closing
    try:
        with sftp_host_conn() as host:
            # closing() guarantees the remote handle is released once the copy is done
            # (or fails); it only requires the handle to have a close() method.
            with closing(host.open(filename, 'r')) as sftp_file_instance:
                with open(filename, 'wb') as out_file:
                    shutil.copyfileobj(sftp_file_instance, out_file)
            return {"status": "sucess", "msg": "sucessfully downloaded file: {}".format(filename)}
    except Exception as ex:
        return {"status": "failed", "msg": "Exception in Lazy reading too: {}".format(ex)}
```

In [None]:
#!pip3 install paramiko

In [None]:
import paramiko
import shutil

# Create the SSH client that the SFTP session will be opened on.
ssh = paramiko.SSHClient()
# SECURITY NOTE: AutoAddPolicy accepts the host key of a server that is not yet known
# instead of rejecting the connection.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
try:
    ssh.connect(sftp_host, username=username, password=password, timeout=5.0)
except Exception as err:
    # Report the actual reason and re-raise so the following cells are not run
    # against a client that never connected.
    print("Connection Failed")
    print("Failed to connect to the remote server. Reason: %s" % (str(err)))
    raise


In [None]:
# Open an SFTP session on the already connected SSH client.
try:
    sftp = ssh.open_sftp()
except Exception as err:
    # Report the actual reason and re-raise; without a session the rest of the cell cannot work.
    print('sftp error')
    print("Failed to start SFTP session from connection to %s. Check that SFTP service is running and available. Reason: %s" % (sftp_host, str(err)))
    raise

# Switch to the remote output directory
src_dir_path = '/out/'
sftp.chdir(src_dir_path)

# Obtain the attributes of the entries in the remote directory
directory_structure = sftp.listdir_attr()

# Set the current local working directory to the Downloads folder
cwd = home + '/Downloads/gallup/'
os.chdir(cwd)

# Print filenames in the directory
file_list = sftp.listdir()
print(file_list)



In [None]:
# Fetch "2018_11_19 Race Relations Survey.zip" with the plain get() call, saving it
# under the same name locally.
# This is the call that doesn't work once the file size is over about 5 MB.
race_relations_zip = '2018_11_19 Race Relations Survey.zip'
sftp.get(race_relations_zip, race_relations_zip)

In [None]:
# This approach works for files of any size: open the remote file and stream its
# contents into a local file of the same name.
filename = '2018_11_19 Race Relations Survey.zip'
# Both handles are opened in one with statement so the remote file is closed as well
# as the local one when the copy finishes or fails.
with sftp.open(filename, 'r') as sftp_file_instance, open(filename, 'wb') as out_file:
    shutil.copyfileobj(sftp_file_instance, out_file)


In [None]:
# Close the SFTP session first, then the SSH connection it was opened on.
sftp.close()
ssh.close()

# Final script

Build out the script based on the earlier one, but using `shutil.copyfileobj()` on an opened SFTP file, which actually works with big files.

In [None]:
# gallup_downloader.py - Downloads Gallup microdata files from the Gallup SFTP server.
# (c) 2023 Vanderbilt University. This program is released under a GNU General Public License v3.0 http://www.gnu.org/licenses/gpl-3.0
# Author: Steve Baskauf
# Date: 2020-11-22

# The credentials are provided by Gallup. They need to be stored in a plain text file stored
# in the user's home directory using the name specified in the CREDENTIALS_FILENAME constant.
# The first line of the file should be the username and the second line should be the password.
# There should be no leading or trailing spaces on the lines. It doesn't matter whether there
# is a trailing newline at the end of the file.

# For code development notes, see dev_gallup_downloader.ipynb in the same directory as this file.

# Imports
from pathlib import Path
import os
import pandas as pd
import paramiko
import datetime
import time
import shutil
import zipfile

# Constants
SFTP_HOST  = 'host1.gallup.com'
REMOTE_DIRECTORY_WITH_FILES_TO_DOWNLOAD = '/out/'
CREDENTIALS_FILENAME = 'gallup_credentials.txt'
# The two local directories below are appended to the home directory path.
# Both must be created before running.
LOCAL_DOWNLOAD_DIRECTORY_RELATIVE_TO_HOME = '/Downloads/gallup/'
GALLUP_BOX_DIRECTORY = '/Library/CloudStorage/Box-Box/VU Gallup Microdata/current_download/'

# Locate the credentials file in the user's home directory.
home = str(Path.home()) # gets path to home directory; works for both Win and Mac
full_credentials_path = '/'.join([home, CREDENTIALS_FILENAME])

# The first line of the credentials file is the username, the second line the password.
credential_lines = Path(full_credentials_path).read_text().split('\n')
username = credential_lines[0]
password = credential_lines[1]

# Make the local download folder the current working directory.
cwd = home + LOCAL_DOWNLOAD_DIRECTORY_RELATIVE_TO_HOME
os.chdir(cwd)

# The CSV file holding the record of the last download lives in the Box directory.
last_download_csv_path = ''.join([home, GALLUP_BOX_DIRECTORY, '/gallup_last_download.csv'])
# On the first run the file does not exist yet: create it with only the header row.
if not os.path.exists(last_download_csv_path):
    Path(last_download_csv_path).write_text('filename,size_in_mb,last_modified_iso\n')

# Load the record as a Pandas dataframe indexed by filename.
last_download_df = pd.read_csv(last_download_csv_path).set_index('filename')

# Establish a connection with the SFTP server.
# See example at https://stackoverflow.com/questions/12486623/paramiko-fails-to-download-large-files-1gb
ssh = paramiko.SSHClient()
# SECURITY NOTE: AutoAddPolicy accepts the host key of a server that is not yet known
# instead of rejecting the connection.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
try:
    ssh.connect(SFTP_HOST, username=username, password=password, timeout=5.0)
except Exception as err:
    print( 'Connection with server failed')
    print('Reason: ' + str(err))
    # End the program. SystemExit is raised directly because the sys module is not imported.
    raise SystemExit(1)
print('Connection succesfully established with server ... ')

# Initiate an SFTP session.
try:
    sftp = ssh.open_sftp()
except Exception as err:
    print('Failed to start SFTP session')
    print('Reason: ' + str(err))
    # Release the SSH connection before ending the program.
    ssh.close()
    raise SystemExit(1)
print('SFTP session succesfully established ... ')
print()

# Download, unzip, and log the files from the remote directory.
# Everything that uses the connection runs inside try/finally so the SFTP session and
# the SSH connection are closed even when a step raises.
try:
    # Switch to the remote directory holding the files to download.
    sftp.chdir(REMOTE_DIRECTORY_WITH_FILES_TO_DOWNLOAD)

    # Obtain the attributes (name, size, modification time) of the entries in the remote directory.
    directory_structure = sftp.listdir_attr()

    # NOTE(review): only the first two directory entries are processed; this looks like a
    # limiter left over from development. Set to None to process every file.
    max_files_to_process = 2

    for attr in directory_structure[:max_files_to_process]:
        filename = attr.filename
        size_in_bytes = attr.st_size
        last_modified_raw_time = attr.st_mtime
        # Convert the last modified time to a yyyy-mm-dd format.
        last_modified_iso = datetime.datetime.fromtimestamp(last_modified_raw_time).strftime('%Y-%m-%d')
        # Convert the size to MB, rounded to the nearest 0.1 MB
        size_in_mb = round(size_in_bytes / 1048576, 1)

        # Skip the file when it is recorded with the same size and last modified date as in the last download.
        if filename in last_download_df.index:
            if size_in_mb == last_download_df.loc[filename, 'size_in_mb'] and last_modified_iso == last_download_df.loc[filename, 'last_modified_iso']:
                print('Skipping ' + filename + ' because it has already been downloaded.')
                continue

        print('filename: ' + filename, ', size: ' + str(size_in_mb), 'MB, last modified: ' + str(last_modified_iso))

        # Download the zip file.
        print('Downloading ...')
        start_time = time.time()
        # Stream the remote file into a local file of the same name in the current working
        # directory. Both handles are closed when the copy finishes or fails.
        with sftp.open(filename, 'r') as sftp_file_instance, open(filename, 'wb') as out_file:
            shutil.copyfileobj(sftp_file_instance, out_file)

        elapsed_time = round(time.time() - start_time, 1)
        print('Download time: ' + str(elapsed_time) + ' seconds')

        # Set up local folder to hold the extracted files, named after the file without its extension.
        extract_folder = os.path.splitext(filename)[0]
        extract_path = home + GALLUP_BOX_DIRECTORY + extract_folder + '/'
        # Create the extract folder if it does not exist yet.
        os.makedirs(extract_path, exist_ok=True)

        # Unzip the downloaded file and put the contents in the extract folder.
        path_to_zip_file = cwd + filename
        print('Unzipping ...')
        with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        print('Unzipping complete.')

        # Delete the zip file
        os.remove(path_to_zip_file)

        print()

        # Record this download and save the log right away, so an interruption while
        # processing a later file does not lose the record of this one.
        last_download_df.loc[filename, 'size_in_mb'] = size_in_mb
        last_download_df.loc[filename, 'last_modified_iso'] = last_modified_iso
        last_download_df.to_csv(last_download_csv_path) # By default, the index is saved as a column.

    print('File downloads complete.')
finally:
    # Close the SFTP session, then the SSH connection it was opened on.
    sftp.close()
    ssh.close()
    print('Connection closed ...')

print('done')
