# Import libraries

In [5]:
import pandas as pd
import requests
from sklearn.cluster import KMeans

# Import Invasive Species Data and prepare for analysis

In [2]:
# Load the raw presence records from the CSV export into a DataFrame
presence_csv = 'data/PRESENCE_LINE_2023_10_26_120650800.csv'
all_data = pd.read_csv(presence_csv)

# Summarise the columns, non-null counts and dtypes of the loaded frame
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2683 entries, 0 to 2682
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   centroid_x                       2683 non-null   float64
 1   centroid_y                       2683 non-null   float64
 2   scientific_name                  2683 non-null   object 
 3   common_name                      2683 non-null   object 
 4   observation_date                 2683 non-null   object 
 5   number_found                     1183 non-null   float64
 6   observer                         2683 non-null   object 
 7   organization_name                1422 non-null   object 
 8   confirmed_ind                    2683 non-null   bool   
 9   species_id_method_name_array     2628 non-null   object 
 10  species_verified_by_name         1579 non-null   object 
 11  follow_up_name                   47 non-null     object 
 12  evaluation_type_name

In [42]:
# Row masks: keep only New York records of Common reed grass, phragmites
is_new_york = all_data['jurisdiction'] == 'New York'
is_phragmites = all_data['common_name'] == 'Common reed grass, phragmites'

# Columns retained for the analysis (after the centroid columns are renamed)
analysis_columns = ['longitude', 'latitude', 'observation_date',
                    'number_found', 'imap_id', 'county', 'uuid']

# Filter, rename the centroid columns, trim to the needed columns, parse the
# observation date and order the records chronologically in a single chain
filtered_data = (
    all_data[is_new_york & is_phragmites]
    .rename(columns={'centroid_x': 'longitude', 'centroid_y': 'latitude'})
    .loc[:, analysis_columns]
    .assign(
        observation_date=lambda frame: pd.to_datetime(
            frame['observation_date'], format='%d-%b-%Y')
    )
    .sort_values('observation_date', ascending=True)
)

# Keep the 10 earliest observations in a small frame of their own
temp_data = filtered_data.head(10)

# Summarise the prepared frame
filtered_data.info()

# Preview the first 5 rows of the prepared frame
filtered_data.head()

<class 'pandas.core.frame.DataFrame'>
Index: 611 entries, 1481 to 1995
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   longitude         611 non-null    float64       
 1   latitude          611 non-null    float64       
 2   observation_date  611 non-null    datetime64[ns]
 3   number_found      552 non-null    float64       
 4   imap_id           611 non-null    int64         
 5   county            611 non-null    object        
 6   uuid              611 non-null    object        
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 38.2+ KB


Unnamed: 0,longitude,latitude,observation_date,number_found,imap_id,county,uuid
1481,-73.42304,43.999399,2007-10-01,500.0,1044919,Essex,5bf83951-e829-4f82-ae6a-485b8c604805
1138,-73.758744,42.517456,2009-06-01,1000.0,1046342,Rensselaer,933d30dc-d903-4e93-b02c-aeab5ee36e79
2583,-73.676875,42.816924,2011-07-01,100000.0,1046289,Saratoga,b3055c5c-32bb-441a-80d1-6ad06777cc50
1761,-78.983102,43.231229,2011-07-01,10000.0,1045635,Niagara,dc3ad2ef-3eed-420c-91d1-5c611856f203
1023,-78.743216,42.90526,2011-08-01,10000.0,1051045,Erie,95ba6da9-9d9c-4f2f-90c6-63b1afd5988b


In [63]:
# Select the latitude and longitude columns used as clustering features
coordinates = filtered_data[['latitude', 'longitude']]

# Define the number of clusters
num_clusters = 8

# Create the KMeans estimator (KMeans is provided by sklearn.cluster).
# K-means starts from randomly chosen centroids, so without a fixed
# random_state the labels -- and every colour on the cluster map -- change on
# each run. n_init is set explicitly so the number of restarts does not depend
# on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# Fit the KMeans model to the coordinates
kmeans.fit(coordinates)

# Add a new column to the filtered_data DataFrame with the cluster labels.
# labels_ is a plain array in the same row order as `coordinates`, so the
# assignment is positional.
filtered_data['cluster'] = kmeans.labels_


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [66]:
import folium

# Create a map centered around New York
ny_map = folium.Map(location=[40.7128, -74.0060], zoom_start=10)

# Fill colour for each cluster label. CircleMarker passes fill_color straight
# to the browser, so every value has to be a valid CSS colour name.
# 'lightred' is only understood by folium.Icon and is not a CSS colour, so
# 'lightcoral' is used for cluster 6 instead.
color_dict = {0: 'blue', 1: 'green', 2: 'red', 3: 'purple', 4: 'orange', 5: 'darkred', 6: 'lightcoral', 7: 'beige', 8: 'darkblue', 9: 'darkgreen', 10: 'cadetblue'}

# Add one circle marker per observation, coloured by its cluster.
# Zipping the three columns avoids building a Series per row with iterrows().
for lat, lon, cluster_number in zip(filtered_data['latitude'],
                                    filtered_data['longitude'],
                                    filtered_data['cluster']):
    # Define the fill color based on the cluster number
    fill_color = color_dict[cluster_number]

    # stroke=False is the documented way to draw a circle without a border
    folium.CircleMarker(location=[lat, lon], fill_color=fill_color, fill_opacity=1, stroke=False).add_to(ny_map)

# Display the map
ny_map


In [50]:
# Group the prepared observations by county
grouped_data = filtered_data.groupby(by='county')

# Count the non-null values in every column for each county
# (columns with missing values can therefore show a smaller number)
grouped_data.count()



Unnamed: 0_level_0,longitude,latitude,observation_date,number_found,imap_id,uuid
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Albany,47,47,47,40,47,47
Broome,3,3,3,2,3,3
Cattaraugus,3,3,3,3,3,3
Cayuga,7,7,7,7,7,7
Chautauqua,25,25,25,23,25,25
Chemung,1,1,1,1,1,1
Clinton,39,39,39,39,39,39
"Clinton,Le Haut-Richelieu",1,1,1,1,1,1
Columbia,11,11,11,11,11,11
Dutchess,4,4,4,3,4,4


In [48]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Create a map centered around New York
ny_map = folium.Map(location=[40.7128, -74.0060], zoom_start=10)

# Define the start and end colors for the gradient
start_color = 'blue'
end_color = 'red'

# Create a linear gradient between the start and end colors
gradient = mcolors.LinearSegmentedColormap.from_list('gradient', [start_color, end_color])

# Take the year range from the data instead of hard-coding it, so the
# gradient always spans the earliest to the latest observation.
observation_years = filtered_data['observation_date'].dt.year
first_year = int(observation_years.min())
last_year = int(observation_years.max())
# Guard against a zero span when every observation falls in the same year
year_span = max(last_year - first_year, 1)

# Add one circle marker per observation, coloured by observation year
for lat, lon, year in zip(filtered_data['latitude'],
                          filtered_data['longitude'],
                          observation_years):
    # The colormap returns an RGBA tuple of floats, which is not a colour the
    # browser understands; convert it to a '#rrggbb' string first.
    color = mcolors.to_hex(gradient((int(year) - first_year) / year_span))

    # Add the circle marker with the gradient fill color
    folium.CircleMarker(location=[lat, lon], fill_color=color).add_to(ny_map)

# Display the map
ny_map


In [43]:
# Display temp_data: the first 10 rows of filtered_data after sorting by observation_date
temp_data

Unnamed: 0,longitude,latitude,observation_date,number_found,imap_id,county,uuid
1481,-73.42304,43.999399,2007-10-01,500.0,1044919,Essex,5bf83951-e829-4f82-ae6a-485b8c604805
1138,-73.758744,42.517456,2009-06-01,1000.0,1046342,Rensselaer,933d30dc-d903-4e93-b02c-aeab5ee36e79
2583,-73.676875,42.816924,2011-07-01,100000.0,1046289,Saratoga,b3055c5c-32bb-441a-80d1-6ad06777cc50
1761,-78.983102,43.231229,2011-07-01,10000.0,1045635,Niagara,dc3ad2ef-3eed-420c-91d1-5c611856f203
1023,-78.743216,42.90526,2011-08-01,10000.0,1051045,Erie,95ba6da9-9d9c-4f2f-90c6-63b1afd5988b
133,-73.800482,42.880285,2011-08-01,500.0,1046328,Saratoga,571f25d2-1272-4c34-ab0a-099788d6cfef
1984,-73.675845,42.819436,2011-09-01,10000.0,1046288,Saratoga,ef503267-3dd3-40c5-804d-50f36d4c7f5e
2664,-73.965638,41.92871,2012-04-01,300.0,1050528,Ulster,66709c9e-6269-4a9b-af1f-bb5ef818a827
122,-76.436823,43.325053,2012-05-01,300.0,1125500,Oswego,5fffdfb2-0012-442b-a112-8572a9009f1c
78,-73.881553,42.189445,2012-05-01,500.0,1046279,Greene,a2619cc2-404f-42e9-b13b-85a89c724431


In [41]:
# Set the location and date range for which you want to get data
latitude = 39.7392
longitude = -104.9844
start_date = "20231004"
end_date = "20231005"

# Query parameters for the POWER Daily API. Passing them as a dict lets
# requests do the URL encoding (the commas in `parameters` become %2C).
power_params = {
    "start": start_date,
    "end": end_date,
    "latitude": latitude,
    "longitude": longitude,
    "community": "ag",
    "parameters": "WS10M,WD10M,T2MDEW,T2MWET,T2M,V10M,RH2M,PS,PRECTOT,QV2M,U10M",
    "header": "true",
    "time-standard": "lst",
}

# Make the request to the Daily API. The timeout stops the cell from hanging
# forever if the service does not answer.
response = requests.get(
    "https://power.larc.nasa.gov/api/temporal/daily/point",
    params=power_params,
    timeout=30,
)

# Fully encoded URL that was actually requested
url = response.url

# Check the status code of the response
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    print(data)
else:
    print("Error:", response.status_code)

{'type': 'Feature', 'geometry': {'type': 'Point', 'coordinates': [-104.9844, 39.7392, 2094.96]}, 'properties': {'parameter': {'PS': {'20231004': 79.31, '20231005': 79.72}, 'RH2M': {'20231004': 57.56, '20231005': 59.0}, 'WS10M': {'20231004': 2.35, '20231005': 2.13}, 'T2MWET': {'20231004': 5.72, '20231005': 6.47}, 'T2M': {'20231004': 10.17, '20231005': 10.76}, 'WD10M': {'20231004': 206.75, '20231005': 159.0}, 'U10M': {'20231004': 0.75, '20231005': -0.01}, 'QV2M': {'20231004': 5.31, '20231005': 5.62}, 'V10M': {'20231004': 0.02, '20231005': 0.09}, 'T2MDEW': {'20231004': 1.26, '20231005': 2.18}, 'PRECTOTCORR': {'20231004': 0.11, '20231005': 0.0}}}, 'header': {'title': 'NASA/POWER CERES/MERRA2 Native Resolution Daily Data', 'api': {'version': 'v2.5.1', 'name': 'POWER Daily API'}, 'sources': ['merra2', 'power'], 'fill_value': -999.0, 'start': '20231004', 'end': '20231005'}, 'messages': [], 'parameters': {'PS': {'units': 'kPa', 'longname': 'Surface Pressure'}, 'RH2M': {'units': '%', 'longname'