In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Load the pixel-value table from the CSV file.
# The location defaults to the original local Dropbox path; set the
# JORNADA_PIXELS_CSV environment variable to point at the file on another
# machine instead of editing this cell.
file_path = os.environ.get(
    'JORNADA_PIXELS_CSV',
    'C:/Users/Tom Ingalls/Dropbox (ASU)/Dryland/Jornada_pixel_values_with_location - Copy.csv',
)
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
data.head()

Unnamed: 0,system:index,AFG,BGR,LTR,PFG,SHR,TRE,latitude,longitude
0,0,2,44,18,12,18,0,32.63566,-106.846923
1,1,2,44,18,13,18,0,32.63566,-106.846653
2,2,2,45,18,13,18,0,32.63566,-106.846384
3,3,1,46,18,12,18,0,32.63566,-106.846114
4,4,1,46,19,12,17,0,32.63566,-106.845844


In [3]:
def identify_subsets(df, column):
    """Split ``df`` into three row subsets based on the values of ``column``.

    Returns a tuple ``(high, low, mode)`` where
    - ``high`` holds rows whose value is at or above the 0.95 quantile,
    - ``low`` holds rows whose value is at or below the 0.05 quantile,
    - ``mode`` holds rows whose value equals the column's mode (the first
      one returned by ``Series.mode`` when there are ties).

    The subsets may overlap; each keeps the original index labels of ``df``.
    """
    values = df[column]

    upper_cutoff = values.quantile(0.95)
    lower_cutoff = values.quantile(0.05)
    most_common = values.mode().iloc[0]

    return (
        df[values >= upper_cutoff],
        df[values <= lower_cutoff],
        df[values == most_common],
    )


In [11]:
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two points given in degrees.

    Uses the haversine formula with an Earth radius of 6,371,000 m. The
    arguments go straight through NumPy ufuncs, so scalars and arrays
    (broadcast against each other) are both accepted.
    """
    earth_radius_m = 6371000

    lat1_rad, lat2_rad = np.radians(lat1), np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2.0
    half_dlon = np.radians(lon2 - lon1) / 2.0

    # Haversine of the central angle between the two points
    hav = np.sin(half_dlat) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon) ** 2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return earth_radius_m * central_angle

def sample_pixels(df, column_order, sample_size=15, min_distance=200, seed=42):
    """Pick spatially separated pixels from the extremes and mode of each column.

    For every column in ``column_order`` the high, low and mode subsets from
    ``identify_subsets`` are visited in that order. From each subset up to
    ``sample_size`` rows are accepted. A candidate row is accepted only when
    it lies at least ``min_distance`` meters (as measured by ``haversine``)
    from every row accepted so far, across all subsets and columns; the very
    first candidate is always accepted. A row label is never accepted twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. Must contain 'latitude' and 'longitude' columns (in
        degrees) and every column named in ``column_order``. Not modified.
    column_order : iterable of str
        Columns to process, in priority order; earlier columns get first
        pick of the pixels.
    sample_size : int
        Maximum number of rows accepted per subset.
    min_distance : float
        Minimum separation, in meters, between any two accepted rows.
    seed : int
        Passed as ``random_state`` to every single-row draw.

    Returns
    -------
    pandas.DataFrame
        The accepted rows in acceptance order, with the columns and dtypes of
        ``df`` and a fresh 0..n-1 index. If nothing is accepted, an empty
        frame with the columns of ``df``.
    """
    # Retained for backward compatibility: callers may rely on the global
    # NumPy RNG being reseeded here. The draws below do not consume it,
    # because they pass ``random_state`` explicitly.
    np.random.seed(seed)

    taken_labels = set()   # index labels of df rows already accepted
    accepted_rows = []     # one-row DataFrames, in acceptance order
    accepted_lats = []     # coordinates of accepted rows, parallel lists
    accepted_lons = []

    def _is_far_enough(lat, lon):
        """True when (lat, lon) is >= min_distance from every accepted row."""
        if not accepted_lats:
            return True  # nothing to compare against: first sample is always taken
        distances = haversine(lat, lon, np.asarray(accepted_lats), np.asarray(accepted_lons))
        return bool(np.all(distances >= min_distance))

    def _draw_from(subset):
        """Accept up to sample_size rows from subset, skipping rows already taken."""
        candidates = subset[~subset.index.isin(taken_labels)]
        accepted_here = 0
        while accepted_here < sample_size and not candidates.empty:
            # NOTE(review): every draw is seeded with the same value, so the
            # draws are reproducible but not independent of each other. Kept
            # as-is so that previously generated sample sets stay reproducible.
            pick = candidates.sample(1, random_state=seed)
            lat = pick['latitude'].values[0]
            lon = pick['longitude'].values[0]
            if _is_far_enough(lat, lon):
                accepted_rows.append(pick)
                accepted_lats.append(lat)
                accepted_lons.append(lon)
                taken_labels.add(pick.index[0])
                accepted_here += 1
            # Accepted or rejected, this candidate is never drawn again
            candidates = candidates.drop(pick.index)

    for column in column_order:
        high_subset, low_subset, mode_subset = identify_subsets(df, column)
        for subset in (high_subset, low_subset, mode_subset):
            _draw_from(subset)

    if not accepted_rows:
        return pd.DataFrame(columns=df.columns)

    # Assemble once at the end instead of growing a frame row by row from an
    # empty DataFrame: this is linear rather than quadratic, keeps the column
    # dtypes of df, and never stores the same accepted row twice.
    sampled_df = pd.concat(accepted_rows, ignore_index=True)

    # Kept for backward compatibility: rows with identical content collapse to one
    return sampled_df.drop_duplicates(ignore_index=True)

# Columns are processed in this order
column_order = ['AFG', 'PFG', 'TRE', 'SHR', 'BGR', 'LTR']

# Draw the sample and renumber the rows from zero for readability
sampled_pixels_df = sample_pixels(data, column_order, seed=42).reset_index(drop=True)

# Display the sampled pixels
sampled_pixels_df


Unnamed: 0,system:index,AFG,BGR,LTR,PFG,SHR,TRE,latitude,longitude
0,59472,4,45,24,12,14,1,32.566938,-106.806768
1,13086,4,44,16,11,20,1,32.618142,-106.845305
2,38650,4,38,25,19,9,0,32.594157,-106.829675
3,55946,4,42,18,12,13,1,32.574215,-106.829944
4,40604,4,43,17,8,24,0,32.592540,-106.791407
...,...,...,...,...,...,...,...,...,...
265,19240,1,47,17,9,22,0,32.611944,-106.856894
266,34473,2,47,17,12,17,0,32.597930,-106.820512
267,17196,3,48,17,5,22,0,32.614100,-106.814583
268,16195,3,48,17,10,16,0,32.614908,-106.865248


In [10]:
# Write the full sampled table as a tab-separated text file
sampled_pixels_df.to_csv('sampled_pixels_data.txt', sep='\t', index=False)

# Write the coordinates alone, one "[lat, lon]," pair per line
with open('sampled_lat_lon.txt', 'w') as file:
    coordinate_pairs = zip(sampled_pixels_df['latitude'], sampled_pixels_df['longitude'])
    file.writelines(f'[{lat}, {lon}],\n' for lat, lon in coordinate_pairs)