# Acquire
The first 8 cells of code were given to me by my instructor since I had no knowledge of web scraping yet. The rest of the code is my own work as I find a way to get my data in one local source.

In [1]:
import os

import pandas as pd
from bs4 import BeautifulSoup
from requests import get

In [2]:
# The web page that will be scraped (a directory listing of data files).
# Keep the trailing slash: file names are appended directly (url + link) when downloading.
url = "https://atmos.nmsu.edu/PDS/data/PDS4/InSight/twins_bundle/data_raw/sol_0000_0122/"

In [3]:
# Requesting page information.
# The timeout stops the cell from hanging forever on an unresponsive server.
response = get(url, timeout=30)

# Fail here with a clear HTTPError instead of silently parsing an error page
# into an empty list of links further down.
response.raise_for_status()

In [4]:
# Turning page information into chunks of soup.
# Naming the parser explicitly avoids bs4's "no parser was explicitly specified"
# warning and keeps the result the same on every machine, whichever optional
# parsers happen to be installed.
soup = BeautifulSoup(response.text, "html.parser")

In [5]:
# The csv files I'm after are under anchor tags,
# so I'll look for those specifically.
# href=True skips any anchor without an href attribute, which would otherwise
# raise a KeyError when the hrefs are pulled out in the next step.
links = soup.find_all("a", href=True)
links

[<a href="?C=N;O=D">Name</a>,
 <a href="?C=M;O=A">Last modified</a>,
 <a href="?C=S;O=A">Size</a>,
 <a href="?C=D;O=A">Description</a>,
 <a href="/PDS/data/PDS4/InSight/twins_bundle/data_raw/">Parent Directory</a>,
 <a href="twins_config_0004_01.csv">twins_config_0004_01.csv</a>,
 <a href="twins_config_0004_01.xml">twins_config_0004_01.xml</a>,
 <a href="twins_config_0005_01.csv">twins_config_0005_01.csv</a>,
 <a href="twins_config_0005_01.xml">twins_config_0005_01.xml</a>,
 <a href="twins_config_0010_01.csv">twins_config_0010_01.csv</a>,
 <a href="twins_config_0010_01.xml">twins_config_0010_01.xml</a>,
 <a href="twins_config_0014_01.csv">twins_config_0014_01.csv</a>,
 <a href="twins_config_0014_01.xml">twins_config_0014_01.xml</a>,
 <a href="twins_config_0015_01.csv">twins_config_0015_01.csv</a>,
 <a href="twins_config_0015_01.xml">twins_config_0015_01.xml</a>,
 <a href="twins_config_0016_01.csv">twins_config_0016_01.csv</a>,
 <a href="twins_config_0016_01.xml">twins_config_0016_01.xm

In [6]:
# Creating a list of all the listed file names written in the "href".
# The list is built from `soup` rather than from `links` itself so this cell can
# be re-run safely: once `links` holds plain strings, link["href"] raises TypeError.
links = [anchor["href"] for anchor in soup.find_all("a", href=True)]

In [7]:
links

['?C=N;O=D',
 '?C=M;O=A',
 '?C=S;O=A',
 '?C=D;O=A',
 '/PDS/data/PDS4/InSight/twins_bundle/data_raw/',
 'twins_config_0004_01.csv',
 'twins_config_0004_01.xml',
 'twins_config_0005_01.csv',
 'twins_config_0005_01.xml',
 'twins_config_0010_01.csv',
 'twins_config_0010_01.xml',
 'twins_config_0014_01.csv',
 'twins_config_0014_01.xml',
 'twins_config_0015_01.csv',
 'twins_config_0015_01.xml',
 'twins_config_0016_01.csv',
 'twins_config_0016_01.xml',
 'twins_config_0017_01.csv',
 'twins_config_0017_01.xml',
 'twins_config_0018_01.csv',
 'twins_config_0018_01.xml',
 'twins_config_0019_01.csv',
 'twins_config_0019_01.xml',
 'twins_config_0020_01.csv',
 'twins_config_0020_01.xml',
 'twins_config_0021_01.csv',
 'twins_config_0021_01.xml',
 'twins_config_0022_01.csv',
 'twins_config_0022_01.xml',
 'twins_config_0023_01.csv',
 'twins_config_0023_01.xml',
 'twins_config_0024_01.csv',
 'twins_config_0024_01.xml',
 'twins_config_0025_01.csv',
 'twins_config_0025_01.xml',
 'twins_config_0026_01.csv',

### TODO
1. Iterate through the list of strings; if the string contains ".csv", then append it to a new list.

Our goal is to make a list that looks like the following:

```python
[
    "twins_config_0004_01.csv",
    "twins_config_0005_01.csv",
    "twins_config_0010_01.csv",
    ...
]
```

2. Once we have a list of filenames like above, we can make a for loop to send requests w/ `pd.read_csv` and then write each csv file to disk. (this will produce a local copy of each csv)
    - consider the following:
        ```python
        for link in links:
            # Read from the internet
            df = pd.read_csv(url + link)
            
            # Write locally to disk
            df.to_csv(link) 
        ```

3. Finally, make sure to concatenate all the csvs together. If you write the one big csv to disk, be sure to add it to your gitignore so you won't have a single file too large for GitHub.

In [8]:
# Keep only the .csv file names; every other href in the listing
# (sort links, parent directory, .xml files) is left out
new_list = [file_name for file_name in links if file_name.endswith('.csv')]

# From here on `links` refers to the csv-only list
links = new_list

In [9]:
new_list

['twins_config_0004_01.csv',
 'twins_config_0005_01.csv',
 'twins_config_0010_01.csv',
 'twins_config_0014_01.csv',
 'twins_config_0015_01.csv',
 'twins_config_0016_01.csv',
 'twins_config_0017_01.csv',
 'twins_config_0018_01.csv',
 'twins_config_0019_01.csv',
 'twins_config_0020_01.csv',
 'twins_config_0021_01.csv',
 'twins_config_0022_01.csv',
 'twins_config_0023_01.csv',
 'twins_config_0024_01.csv',
 'twins_config_0025_01.csv',
 'twins_config_0026_01.csv',
 'twins_config_0027_01.csv',
 'twins_config_0028_01.csv',
 'twins_config_0029_01.csv',
 'twins_config_0030_01.csv',
 'twins_config_0032_01.csv',
 'twins_config_0033_01.csv',
 'twins_config_0034_01.csv',
 'twins_config_0035_01.csv',
 'twins_config_0036_01.csv',
 'twins_config_0037_01.csv',
 'twins_config_0038_01.csv',
 'twins_config_0039_01.csv',
 'twins_config_0040_01.csv',
 'twins_config_0041_01.csv',
 'twins_config_0042_01.csv',
 'twins_config_0043_01.csv',
 'twins_config_0044_01.csv',
 'twins_config_0045_01.csv',
 'twins_config

In [10]:
len(links)

288

In [11]:
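# # Create a local copy of all the .csv files from the website.
# # NOTE: left commented out after the first run. The cells below read these local
# # copies by file name, so un-comment and run this once if the files are not on disk yet.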

# for link in links:
    
#     # Read from the internet
#     df = pd.read_csv(url + link)

#     # Write locally to disk
#     df.to_csv(link) 

In [12]:
# What does one file look like? I gotta know before building a dataframe
# Note: links[0] is a bare file name (no url prefix), so this reads the local
# copy from the current working directory, not the website.
example = pd.read_csv(links[0])
example

Unnamed: 0.1,Unnamed: 0,AOBT,SCLK,LMST,LTST,UTC,BMY_WIND_CONF_TABLE,BMY_GENERAL_CFG,BMY_AD_CONVERTER_CFG,BMY_MISCELLANEOUS_CFG,...,BPY_WD_IX_8,BPY_WD_IX_9,BPY_WD_IX_10,BPY_WD_IX_11,BPY_WD_IX_12,BPY_BAND_CFG,BPY_TST_CFG,BPY_STATUS,BPY_AD_OFFSET_REG2,BPY_AD_OFFSET_REG1
0,0,596876692.0,596860900.0,00004M06:42:20.782,00004 06:01:28,2018-334T14:42:35.755Z,ZERO,0x00,0x00,0x00,...,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0.0,0.0
1,1,596876952.0,596861200.0,00004M06:46:33.826,00004 06:05:41,2018-334T14:46:55.755Z,UNK,0x40,0x00,0x01,...,,,,,,,,,,
2,2,596876953.0,596861200.0,00004M06:46:34.799,00004 06:05:42,2018-334T14:46:56.755Z,FM1 MARS DEFAULT V0 CONF -30,0x40,0x00,0x01,...,,,,,,,,,,
3,3,596877041.0,596861300.0,00004M06:48:00.445,00004 06:07:07,2018-334T14:48:24.755Z,,,,,...,0x5e,0x62,0x5d,0x59,0x6d,0x0f,0x00,0x03,0.0,101.0
4,4,596877042.0,596861300.0,00004M06:48:01.418,00004 06:07:08,2018-334T14:48:25.755Z,,,,,...,0x61,0x65,0x60,0x5b,0x6f,0x0f,0x00,0x03,0.0,98.0
5,5,596877043.0,596861300.0,00004M06:48:02.391,00004 06:07:09,2018-334T14:48:26.755Z,,,,,...,0x61,0x65,0x61,0x5b,0x6f,0x0f,0x00,0x03,0.0,102.0
6,6,596877078.0,596861300.0,00004M06:48:36.455,00004 06:07:43,2018-334T14:49:01.755Z,ZERO,0x00,0x00,0x00,...,0x00,0x00,0x00,0x00,0x00,0x08,0x00,0x03,0.0,0.0
7,7,596877079.0,596861300.0,00004M06:48:37.428,00004 06:07:44,2018-334T14:49:02.755Z,FM1 MARS DEFAULT V0 CONF -60,0x00,0x00,0x00,...,0x59,0x60,0x5a,0x56,0x69,0x0f,0x00,0x03,0.0,0.0
8,8,596877080.0,596861300.0,00004M06:48:38.401,00004 06:07:45,2018-334T14:49:03.755Z,,,,,...,0x58,0x61,0x5a,0x56,0x69,0x0f,0x00,0x03,0.0,0.0
9,9,596877095.0,596861300.0,00004M06:48:53.000,00004 06:08:00,2018-334T14:49:18.755Z,ZERO,0x00,0x00,0x00,...,,,,,,,,,,


In [13]:
# The file names fall into 3 different data sets. They are kept apart for now
# to keep the Minimum Viable Product (MVP) simple.
twins_config_list, twins_raw_list, twins_rawevent_list = [], [], []

# Checked top to bottom, first match wins. "twins_rawevent" has to come before
# "twins_raw" because the shorter prefix would match rawevent files as well.
prefix_buckets = (
    ("twins_config", twins_config_list),
    ("twins_rawevent", twins_rawevent_list),
    ("twins_raw", twins_raw_list),
)

for link in links:
    for prefix, bucket in prefix_buckets:
        if link.startswith(prefix):
            bucket.append(link)
            break

In [19]:
len(twins_config_list)

105

In [20]:
len(twins_raw_list)

105

In [21]:
len(twins_rawevent_list)

78

In [22]:
# Instead of combining all dataframes, I will instead focus on just raw for now
# Combining all Twins_raw csv files into a SUPER file!

# Read every local twins_raw copy first, then concatenate once.
# DataFrame.append was removed in pandas 2.0, and calling it in a loop copied
# the whole growing frame on every iteration; pd.concat builds it in one pass.
twins_raw_frames = [pd.read_csv(link) for link in twins_raw_list]

# An empty file list still yields an empty DataFrame, as the loop version did
df = pd.concat(twins_raw_frames) if twins_raw_frames else pd.DataFrame()


In [17]:
# # Combine all csv files into a SUPER file!

# # Creating the first Dataframe 
# df = pd.DataFrame()

# for link in links: 

#     # Append file to dataframe
#     df = df.append(pd.DataFrame(pd.read_csv(link)))


In [24]:
df.head()

Unnamed: 0.1,Unnamed: 0,AOBT,SCLK,LMST,LTST,UTC,BMY_2L_TEMP_1,BMY_2L_TEMP_2,BMY_2L_TEMP_3,BMY_2L_TEMP_4,...,BPY_WD_OUT_6,BPY_WD_OUT_7,BPY_WD_OUT_8,BPY_WD_OUT_9,BPY_WD_OUT_10,BPY_WD_OUT_11,BPY_WD_OUT_12,BPY_WIND_FREQUENCY,BPY_AIR_TEMP_FREQUENCY,BPY_ASIC_TEMP
0,0,596876952.0,596861200.0,00004M06:46:33.826,00004 06:05:41,2018-334T14:46:55.755Z,-4353.0,-4645.0,-4778.0,-5006.0,...,,,,,,,,,,
1,1,596876953.0,596861200.0,00004M06:46:34.799,00004 06:05:42,2018-334T14:46:56.755Z,-4415.0,-4723.0,-4870.0,-5099.0,...,,,,,,,,,,
2,2,596876954.0,596861200.0,00004M06:46:35.772,00004 06:05:43,2018-334T14:46:57.755Z,-4424.0,-4727.0,-4861.0,-5107.0,...,,,,,,,,,,
3,3,596876955.0,596861200.0,00004M06:46:36.746,00004 06:05:44,2018-334T14:46:58.755Z,-4409.0,-4722.0,-4859.0,-5109.0,...,,,,,,,,,,
4,4,596876956.0,596861200.0,00004M06:46:37.719,00004 06:05:45,2018-334T14:46:59.755Z,-4424.0,-4728.0,-4873.0,-5107.0,...,,,,,,,,,,


In [23]:
df.shape

(2350436, 64)

In [25]:
# Dropping the additional index column.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column has already been removed from df.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [26]:
df.head()

Unnamed: 0,AOBT,SCLK,LMST,LTST,UTC,BMY_2L_TEMP_1,BMY_2L_TEMP_2,BMY_2L_TEMP_3,BMY_2L_TEMP_4,BMY_2L_TEMP_4_AVERAGE,...,BPY_WD_OUT_6,BPY_WD_OUT_7,BPY_WD_OUT_8,BPY_WD_OUT_9,BPY_WD_OUT_10,BPY_WD_OUT_11,BPY_WD_OUT_12,BPY_WIND_FREQUENCY,BPY_AIR_TEMP_FREQUENCY,BPY_ASIC_TEMP
0,596876952.0,596861200.0,00004M06:46:33.826,00004 06:05:41,2018-334T14:46:55.755Z,-4353.0,-4645.0,-4778.0,-5006.0,,...,,,,,,,,,,
1,596876953.0,596861200.0,00004M06:46:34.799,00004 06:05:42,2018-334T14:46:56.755Z,-4415.0,-4723.0,-4870.0,-5099.0,,...,,,,,,,,,,
2,596876954.0,596861200.0,00004M06:46:35.772,00004 06:05:43,2018-334T14:46:57.755Z,-4424.0,-4727.0,-4861.0,-5107.0,,...,,,,,,,,,,
3,596876955.0,596861200.0,00004M06:46:36.746,00004 06:05:44,2018-334T14:46:58.755Z,-4409.0,-4722.0,-4859.0,-5109.0,,...,,,,,,,,,,
4,596876956.0,596861200.0,00004M06:46:37.719,00004 06:05:45,2018-334T14:46:59.755Z,-4424.0,-4728.0,-4873.0,-5107.0,,...,,,,,,,,,,


In [None]:
# Now to write the combined csv dataframe to a single file
# NOTE(review): as the notebook is currently written, df is built from
# twins_raw_list only (the all-files cell is commented out), so running this
# cell would store twins_raw data under the INSIGHT_DATA.csv name - confirm
# before running.
df.to_csv('INSIGHT_DATA.csv')

In [28]:
# Now to write the combined twins_raw dataframe to a single file.
# to_csv also writes the index as a first, unnamed column by default; the index
# was not reset when the files were combined, so it restarts for each source file.
df.to_csv('INSIGHT_TWINS_RAW.csv')

In [27]:
# Finally, I'll create a function for my wrangle file to do all this for me again if need be


def web_scrape_all():
    """
    Download every csv listed at the scraped url and combine them into one file.

    This function takes time to process. It requests the directory listing from
    a specific url, collects the names of all the csv files, saves a local copy
    of each csv in the current working directory, and then combines the local
    copies into one file named "INSIGHT_DATA.csv".

    Returns
    -------
    pandas.DataFrame
        The combined data, without the 'Unnamed: 0' column that is introduced
        when the local copies are written with their index and read back.

    Raises
    ------
    requests.HTTPError
        If the directory listing cannot be retrieved.
    ValueError
        If the listing contains no csv files.
    """
    # Informing the User
    print("Requesting Data...")

    # The web page that will be scraped:
    url = "https://atmos.nmsu.edu/PDS/data/PDS4/InSight/twins_bundle/data_raw/sol_0000_0122/"

    # Requesting page information; stop early on an HTTP error instead of
    # parsing an error page into an empty list of links
    response = get(url, timeout=30)
    response.raise_for_status()

    # Turning page information into chunks of soup. The parser is named
    # explicitly so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # The csv file names are the hrefs of the anchor tags; anchors without an
    # href are skipped, and anything that is not a .csv is discarded
    links = [
        anchor["href"]
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].endswith('.csv')
    ]
    if not links:
        raise ValueError("No .csv files are listed at " + url)

    # Informing the User
    print("Creating local copies...")

    # Create a local copy of all the .csv files from the website
    for link in links:
        # Read from the internet, then write locally to disk. The index is
        # written too, which is what creates the 'Unnamed: 0' column dropped below.
        pd.read_csv(url + link).to_csv(link)

    # Informing the User
    print("Combining csv files...")

    # Combine all csv files into a SUPER file!
    # Read all local copies, then concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the growing frame on every iteration.
    df = pd.concat([pd.read_csv(link) for link in links])

    # Dropping the additional index column
    df = df.drop('Unnamed: 0', axis=1)

    # Informing the User
    print("Creating super file...")

    # Now to write the combined csv dataframe to a single file
    df.to_csv('INSIGHT_DATA.csv')

    # Informing the User
    print("'INSIGHT_DATA.csv' created.")

    return df

In [None]:
def twins_raw_data():
    """
    Combine all "twins_raw" csv files listed at the scraped url into one file.

    This function takes time to process, and only handles data for TWINS_RAW.
    It requests the directory listing from a specific url, collects the names
    of the csv files that start with "twins_raw" (but not "twins_rawevent"),
    downloads any of them that has no local copy in the current working
    directory yet, and then combines the local copies into one file named
    "INSIGHT_TWINS_RAW.csv".

    Returns
    -------
    pandas.DataFrame
        The combined twins_raw data, without the 'Unnamed: 0' column that is
        introduced when the local copies are written with their index.

    Raises
    ------
    requests.HTTPError
        If the directory listing cannot be retrieved.
    ValueError
        If the listing contains no twins_raw csv files.
    """
    # Informing the User
    print("Requesting Data...")

    # The web page that will be scraped:
    url = "https://atmos.nmsu.edu/PDS/data/PDS4/InSight/twins_bundle/data_raw/sol_0000_0122/"

    # Requesting page information; stop early on an HTTP error instead of
    # parsing an error page into an empty list of links
    response = get(url, timeout=30)
    response.raise_for_status()

    # Informing the User
    print("Organizing list of file names...")

    # Turning page information into chunks of soup. The parser is named
    # explicitly so the result does not depend on which parsers are installed.
    soup = BeautifulSoup(response.text, "html.parser")

    # The csv file names are the hrefs of the anchor tags; anchors without an
    # href are skipped, and anything that is not a .csv is discarded
    links = [
        anchor["href"]
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].endswith('.csv')
    ]

    # Separating "twins_raw" data from list. "twins_rawevent" files share the
    # "twins_raw" prefix, so they have to be excluded explicitly.
    twins_raw_list = [
        link
        for link in links
        if link.startswith("twins_raw") and not link.startswith("twins_rawevent")
    ]
    if not twins_raw_list:
        raise ValueError("No twins_raw .csv files are listed at " + url)

    # Informing the User
    print("Combining twins_raw csv files...")

    # Combining all Twins_raw csv files into a SUPER file!
    frames = []
    for link in twins_raw_list:
        # Download the file only when there is no local copy yet. It is written
        # the same way web_scrape_all writes its copies (index included), so the
        # 'Unnamed: 0' drop below applies to old and new copies alike.
        if not os.path.exists(link):
            pd.read_csv(url + link).to_csv(link)
        frames.append(pd.read_csv(link))

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the growing frame on every iteration.
    df = pd.concat(frames)

    # Dropping the additional index column
    df = df.drop('Unnamed: 0', axis=1)

    # Informing the User
    print("Creating single file...")

    # Now to write the combined csv dataframe to a single file
    df.to_csv('INSIGHT_TWINS_RAW.csv')

    # Informing the User
    print("'INSIGHT_TWINS_RAW.csv' created.")

    return df