In [30]:
import pandas as pd
from ftplib import FTP
from datetime import datetime

In [121]:
def get_data_list():
    """
    Input: None
    Output: List of the entry names found in the AVL_DATA/AVL_RAW/
            directory of the SFMTA FTP server (avl-data.sfmta.com)

    The names are returned exactly as the server's NLST command reports
    them, one list element per line of the listing.
    """
    # The context manager closes the control connection when the block
    # exits, even if login/cwd/listing raises.
    with FTP('avl-data.sfmta.com') as ftp:
        ftp.login()  # no credentials supplied -> anonymous login
        ftp.cwd('AVL_DATA/AVL_RAW/')
        files = []
        # retrlines invokes the callback once per line of the NLST response
        ftp.retrlines('NLST', files.append)

    return files

In [122]:
# Fetch the FTP directory listing and keep it in `a` for the cells below.
a = get_data_list()
# Bare last expression: display the listing as the cell output.
a

['lookUpBlockIDToBlockNumNam.csv',
 'lookUpSignUpPeriods.csv',
 'Pre_08152012',
 'read_me.txt',
 'sfmtaAVL04042012_04112012.zip',
 'sfmtaAVLRawData01012013.csv',
 'sfmtaAVLRawData01012014.csv',
 'sfmtaAVLRawData01012015.csv',
 'sfmtaAVLRawData01022013.csv',
 'sfmtaAVLRawData01022014.csv',
 'sfmtaAVLRawData01022015.csv',
 'sfmtaAVLRawData01032013.csv',
 'sfmtaAVLRawData01032014.csv',
 'sfmtaAVLRawData01032015.csv',
 'sfmtaAVLRawData01032016.csv',
 'sfmtaAVLRawData01042013.csv',
 'sfmtaAVLRawData01042014.csv',
 'sfmtaAVLRawData01042015.csv',
 'sfmtaAVLRawData01042016.csv',
 'sfmtaAVLRawData01052013.csv',
 'sfmtaAVLRawData01052014.csv',
 'sfmtaAVLRawData01052015.csv',
 'sfmtaAVLRawData01052016.csv',
 'sfmtaAVLRawData01062013.csv',
 'sfmtaAVLRawData01062014.csv',
 'sfmtaAVLRawData01062015.csv',
 'sfmtaAVLRawData01062016.csv',
 'sfmtaAVLRawData01072013.csv',
 'sfmtaAVLRawData01072014.csv',
 'sfmtaAVLRawData01072015.csv',
 'sfmtaAVLRawData01072016.csv',
 'sfmtaAVLRawData01082013.csv',
 'sfmt

In [124]:
def clean_sort_file_list(file_list):
    
    """
    Input: A list of FTP file names
    Output: A list of csv file dates (datetime objects), sorted with newest first

    Only names starting with 'sfmtaAVLRaw' are considered. The date is read
    from the characters between the 15-character 'sfmtaAVLRawData' prefix and
    the 4-character extension, and must be in MMDDYYYY form. Names whose date
    part is empty or begins with '_' are skipped.

    Raises: ValueError if a remaining date part does not parse as MMDDYYYY.
    """

    clean_dates = []

    for item in file_list:

        # Ignore everything that is not a raw-data file
        if not item.startswith('sfmtaAVLRaw'):
            continue

        # Drop the 'sfmtaAVLRawData' prefix (15 chars) and the extension (4 chars)
        raw_date = item[15:-4]

        # Nothing left to parse, or an underscore-style name: skip it
        if not raw_date or raw_date[0] == '_':
            continue

        clean_dates.append(datetime.strptime(raw_date, '%m%d%Y'))

    # Sort once after collecting everything instead of after every append
    clean_dates.sort(reverse=True)

    return clean_dates

In [125]:
# Turn the raw listing `a` into datetime objects sorted newest first.
b = clean_sort_file_list(a)
# Bare last expression: display the sorted dates as the cell output.
b

[datetime.datetime(2016, 12, 5, 0, 0),
 datetime.datetime(2016, 12, 4, 0, 0),
 datetime.datetime(2016, 12, 3, 0, 0),
 datetime.datetime(2016, 11, 23, 0, 0),
 datetime.datetime(2016, 11, 22, 0, 0),
 datetime.datetime(2016, 11, 21, 0, 0),
 datetime.datetime(2016, 11, 20, 0, 0),
 datetime.datetime(2016, 11, 19, 0, 0),
 datetime.datetime(2016, 11, 18, 0, 0),
 datetime.datetime(2016, 11, 17, 0, 0),
 datetime.datetime(2016, 11, 16, 0, 0),
 datetime.datetime(2016, 11, 15, 0, 0),
 datetime.datetime(2016, 11, 14, 0, 0),
 datetime.datetime(2016, 11, 13, 0, 0),
 datetime.datetime(2016, 11, 12, 0, 0),
 datetime.datetime(2016, 11, 11, 0, 0),
 datetime.datetime(2016, 11, 10, 0, 0),
 datetime.datetime(2016, 11, 9, 0, 0),
 datetime.datetime(2016, 11, 8, 0, 0),
 datetime.datetime(2016, 11, 7, 0, 0),
 datetime.datetime(2016, 11, 6, 0, 0),
 datetime.datetime(2016, 11, 5, 0, 0),
 datetime.datetime(2016, 11, 4, 0, 0),
 datetime.datetime(2016, 11, 3, 0, 0),
 datetime.datetime(2016, 11, 2, 0, 0),
 datetime.d

In [110]:
def get_gtfs_file(datetime, gtfs_lookup_df):
    """
    Input:
        -datetime: the date (a datetime object) to look up
        -gtfs_lookup_df: DataFrame with string columns 'from_date' and
         'to_date' in YYYY-MM-DD form, and a 'directory' column
    Output:
        The 'directory' value of the first row for which
        from_date <= datetime < to_date, or None if no row matches
    """
    # The parameter name shadows the file-level `datetime` class, so bring
    # the class in under a private alias instead of calling strptime on the
    # argument (which fails for datetime subclasses such as pandas.Timestamp).
    from datetime import datetime as _datetime

    for _, row in gtfs_lookup_df.iterrows():

        from_date = _datetime.strptime(row['from_date'], '%Y-%m-%d')
        to_date = _datetime.strptime(row['to_date'], '%Y-%m-%d')

        # Start of the range is inclusive, end is exclusive
        if from_date <= datetime < to_date:
            return row['directory']

    return None
    

In [114]:
def build_df(clean_list, gtfs_lookup_df):
    """
    Input:
        -clean_list: list of datetime objects, one per data date
        -gtfs_lookup_df: DataFrame of GTFS directories and their
         effective date ranges (passed through to get_gtfs_file)
    Output:
        DataFrame with one row per date and the columns
            -ftp_filename: 'sfmtaAVLRawData' + MMDDYYYY + '.csv'
            -iso_string: the date as YYYY-MM-DD
            -time_stamp: the value of the date's .timestamp()
            -gtfs_directory: result of get_gtfs_file for the date
    """
    column_names = ['ftp_filename', 'iso_string', 'time_stamp', 'gtfs_directory']

    # One record per date; fields are listed in the same order as column_names
    records = [
        [
            'sfmtaAVLRawData' + day.strftime('%m%d%Y') + '.csv',
            day.strftime('%Y-%m-%d'),
            day.timestamp(),
            get_gtfs_file(day, gtfs_lookup_df),
        ]
        for day in clean_list
    ]

    return pd.DataFrame(records, columns=column_names)

In [130]:
# Load the GTFS lookup table that build_df hands to get_gtfs_file.
gtfs_plz = pd.read_csv('data/gtfs_lookup.csv')

# First ten entries of `b` (the newest dates, since `b` is sorted newest first).
c = b[:10]

# Assemble the per-date frame for those ten dates.
d = build_df(c, gtfs_plz)

# Bare last expression: display the frame as the cell output.
d

Unnamed: 0,ftp_filename,iso_string,time_stamp,gtfs_directory
0,sfmtaAVLRawData12052016.csv,2016-12-05,1480925000.0,sfmta_2017-02-10
1,sfmtaAVLRawData12042016.csv,2016-12-04,1480838000.0,sfmta_2017-02-10
2,sfmtaAVLRawData12032016.csv,2016-12-03,1480752000.0,sfmta_2017-02-10
3,sfmtaAVLRawData11232016.csv,2016-11-23,1479888000.0,sfmta_2017-02-10
4,sfmtaAVLRawData11222016.csv,2016-11-22,1479802000.0,sfmta_2017-02-10
5,sfmtaAVLRawData11212016.csv,2016-11-21,1479715000.0,sfmta_2017-02-10
6,sfmtaAVLRawData11202016.csv,2016-11-20,1479629000.0,sfmta_2017-02-10
7,sfmtaAVLRawData11192016.csv,2016-11-19,1479542000.0,sfmta_2017-02-10
8,sfmtaAVLRawData11182016.csv,2016-11-18,1479456000.0,sfmta_2017-02-10
9,sfmtaAVLRawData11172016.csv,2016-11-17,1479370000.0,sfmta_2017-02-10


In [131]:
def x_recent_days(num_days):
    """
    Input: Number of Days of Data Desired
    Output: DataFrame of file name, time and gtfs data for the
            num_days newest dates found on the FTP server

    MAX DAYS AVAILABLE: 1529 (figure recorded by the original author)
    """
    # Lookup table is read first, before any network access
    lookup_table = pd.read_csv('data/gtfs_lookup.csv')

    # Listing -> newest-first dates -> keep only the leading num_days entries
    newest_first = clean_sort_file_list(get_data_list())
    wanted_dates = newest_first[:num_days]

    return build_df(wanted_dates, lookup_table)
  

In [118]:
# Build the frame for the 20 newest dates available on the FTP server.
# NOTE(review): this cell's execution count (118) is lower than that of the
# x_recent_days definition above (131) — re-run top to bottom to confirm the output.
working_df = x_recent_days(20)

In [119]:
# Bare expression: display the frame as the cell output.
working_df

Unnamed: 0,ftp_filename,iso_string,time_stamp,gtfs_directory
0,sfmtaAVLRawData12052016.csv,2016-12-05,1480925000.0,sfmta_2017-02-10
1,sfmtaAVLRawData12042016.csv,2016-12-04,1480838000.0,sfmta_2017-02-10
2,sfmtaAVLRawData12032016.csv,2016-12-03,1480752000.0,sfmta_2017-02-10
3,sfmtaAVLRawData11232016.csv,2016-11-23,1479888000.0,sfmta_2017-02-10
4,sfmtaAVLRawData11222016.csv,2016-11-22,1479802000.0,sfmta_2017-02-10
5,sfmtaAVLRawData11212016.csv,2016-11-21,1479715000.0,sfmta_2017-02-10
6,sfmtaAVLRawData11202016.csv,2016-11-20,1479629000.0,sfmta_2017-02-10
7,sfmtaAVLRawData11192016.csv,2016-11-19,1479542000.0,sfmta_2017-02-10
8,sfmtaAVLRawData11182016.csv,2016-11-18,1479456000.0,sfmta_2017-02-10
9,sfmtaAVLRawData11172016.csv,2016-11-17,1479370000.0,sfmta_2017-02-10
