# Part 1

# Downloading the Dataset
## To run this project you will need to install the libraries listed below:
* `pip install selenium`
* `pip install pyautogui`
* `pip install panel`
* `pip install geopy`
* `pip install pathlib`
* `pip install plotly`
* `pip install matplotlib`
* `pip install requests`
* `pip install tqdm`
## Then download the webdriver for the Chrome browser and add it to the PATH. The link for the webdriver is given below:
## https://sites.google.com/a/chromium.org/chromedriver/home
## To add the webdriver's directory to the PATH:

### If you have Windows:-
1. Open cmd and navigate to the downloaded folder's location.
2. Type this command at that location: `set PATH=%PATH%;C:\your\path\here\`

### If you have Linux:-
1. `nano ~/.bashrc`
2. `export PATH="$HOME/path/to/folder:$PATH"`
3. `source ~/.bashrc`

### If you have Mac:-
1. Open up Terminal.
2. Run the following command:
3. `sudo nano /etc/paths`
4. Enter your password, when prompted.
5. Go to the bottom of the file, and enter the path you wish to add.
6. Hit control-x to quit.
7. Enter “Y” to save the modified buffer.
8. That’s it!  

### It's recommended to close all terminal sessions, log out and then log back in for the change to the PATH variable to take effect.
### It may take more than 30 minutes to run the entire project.

In [1]:
import requests
from time import sleep
from pathlib import Path
import os
import shutil
import zipfile
from tqdm import tqdm
download_dir = 'Project_Dataset'  # download directory
if not os.path.isdir(download_dir):
    os.mkdir(download_dir)
db_dir = 'Databases' # Database directory
if not os.path.isdir(db_dir):
    os.mkdir(db_dir)


In [None]:
# Selenium is imported defensively so that a missing package produces an
# installation hint instead of a traceback.
try:
    import selenium
    from selenium import webdriver
    from selenium.webdriver.common.by import By
except Exception:
    print('Do "pip install selenium" and run again')

# Open the trip-data index page in headless Chrome, collect every link whose
# URL contains one of the wanted year-month tags, and save each archive into
# download_dir as JC-<tag>.csv.zip or NYC-<tag>.csv.zip.
try:
    print(f'Downloaded file will be saved here -> {download_dir}')
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless') # Running chrome headless
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get("https://s3.amazonaws.com/tripdata/index.html") # website
        sleep(5) # let the website load
        # NOTE(review): range(1, 12) yields months 01-11 only, so December is
        # never requested - confirm whether that is intentional.
        files_to_download = ['2019' + str(i).zfill(2) for i in range(1,12)]
        files_to_download += ['2020' + str(i).zfill(2) for i in range(1,12)]
        #files_to_download += ['201512', '201612', '201712', '201812']
        # find_elements(By.XPATH, ...) is used because the older
        # find_elements_by_xpath helper was removed in Selenium 4.3.
        elems = driver.find_elements(By.XPATH, "//a[@href]")
        for elem in elems:
            link = elem.get_attribute("href")
            # Skip links that do not mention any wanted 2019/2020 month.
            if not link or not any(tag in link for tag in files_to_download):
                continue
            filename = link.split('/')[-1]
            if 'JC' in filename.split('.')[0]:
                filename = f"JC-{filename.split('-')[1]}.csv.zip"
            else:
                filename = f"NYC-{filename.split('-')[0]}.csv.zip"
            target = os.path.join(download_dir, filename)
            # Check the disk before touching the network so that an existing
            # archive is not fetched again only to be thrown away.
            if os.path.isfile(target):
                print('File already exists.\nSkipping it.')
                continue
            print(f'Currently downloading this {link}')
            print('Downloading... Hold on!')
            r = requests.get(link, allow_redirects=True)
            # Never store an HTTP error page under a .zip name.
            if not r.ok:
                print(f'Download failed with HTTP status {r.status_code}.\nSkipping it.')
                continue
            with open(target, 'wb') as fh:
                fh.write(r.content)
            print('Downloaded!\nMoving on to the next one :)')

        print('All files Downloaded :)')
    finally:
        # Always shut the browser down, even when a step above failed.
        driver.quit()
except Exception as e:
    print(e)
    print('Download Webdriver from the link given below and add it to the PATH')
    print('https://sites.google.com/a/chromium.org/chromedriver/home')
    print('After adding it to PATH close the entire Jupyter notebook and stop everything then run again')
    

# Sorting the downloaded files.

In [None]:
print('Sorting Begins!')
# One sub-directory per (year, 'JC' / non-'JC') combination.
dirs = ['2019_DATA_JC', '2019_DATA_NYC', '2020_DATA_JC', '2020_DATA_NYC']
for folder_name in dirs:
    folder_path = download_dir + '/' + folder_name
    if os.path.isdir(folder_path):
        continue
    os.mkdir(folder_path)

# (file name contains 'JC', year tag) -> destination sub-directory.
targets = {
    (True, '2019'): '2019_DATA_JC',
    (True, '2020'): '2020_DATA_JC',
    (False, '2019'): '2019_DATA_NYC',
    (False, '2020'): '2020_DATA_NYC',
}
for entry in os.listdir(download_dir):
    source = download_dir + '/' + entry
    # Only regular files are moved; the sub-directories themselves stay put.
    if not os.path.isfile(source):
        continue
    if '2019' in entry:
        year = '2019'
    elif '2020' in entry:
        year = '2020'
    else:
        # Files that mention neither year are left where they are.
        continue
    shutil.move(source, download_dir + '/' + targets['JC' in entry, year])
print('Sorting Finished!')

# Unzipping the downloaded files.

In [None]:
print('Unzipping Now!')
path = download_dir

# Only real sub-directories are processed: a stray file directly inside the
# download directory (for example .DS_Store) would make os.listdir() fail.
subdirs = [
    f'{path}/{name}'
    for name in os.listdir(path)
    if os.path.isdir(f'{path}/{name}')
]

# Pass 1: extract every .zip archive next to itself, then delete the archive.
# Restricting this to *.zip keeps a re-run from trying to open the already
# extracted CSV files as archives.
for folder in tqdm(subdirs):
    for file in tqdm(os.listdir(folder)):
        filepath = f'{folder}/{file}'
        if file.startswith('.') or not file.endswith('.zip'):
            continue
        if not os.path.isfile(filepath):
            continue
        # The context manager closes the archive before it is removed.
        with zipfile.ZipFile(filepath, 'r') as archive:
            archive.extractall(folder)
        os.remove(filepath)

# Pass 2: rename every visible regular file to JC-<tag>.csv or NYC-<tag>.csv,
# where <tag> is the first dash-separated token after an optional JC/NYC
# prefix. Dropping that prefix first makes the rename idempotent, so running
# the cell again leaves already renamed files untouched.
for folder in subdirs:
    for file in os.listdir(folder):
        filepath = f'{folder}/{file}'
        if file.startswith('.') or not os.path.isfile(filepath):
            continue
        stem = file.split('.')[0]
        tokens = stem.split('-')
        prefix = 'JC' if 'JC' in stem else 'NYC'
        if len(tokens) > 1 and tokens[0] in ('JC', 'NYC'):
            tokens = tokens[1:]
        filename = f'{folder}/{prefix}-{tokens[0]}.csv'
        if filename != filepath:
            os.rename(filepath, filename)
print('All files unzipped!\nThank You!')