In [8]:

import ee
import pandas as pd
import matplotlib.pyplot as plt


In [9]:
ee.Initialize(project='ee-bhatia-research')

In [10]:
#collection of land coverage

# Land surface temp.
lst = ee.ImageCollection("MODIS/061/MOD11A1") 



In [11]:
# Initial date of interest (inclusive).
i_date = '2018-01-01'

# Final date of interest (exclusive).
f_date = '2024-01-01'

# Selection of appropriate bands and dates for LST.
lst = lst.select('LST_Day_1km', 'QC_Day').filterDate(i_date, f_date)

In [24]:
filename = 'world_cereals_scaled_final_all.csv'
chunks = []
counter = 0

for chunk in pd.read_csv(filename, chunksize=1000):
    # Processing each chunk
    df = chunk.copy()  # Make a copy to avoid modifying the original chunk
    df = df[df['Crop'] == 'Maize']
    
    # Append the modified chunk to the list of chunks
    chunks.append(df)
    
    counter += 1
    print(counter)
    
    # Break out of the loop after processing the first chunk
    break

df = pd.concat(chunks, ignore_index=True)

1


In [25]:
print(df.describe)

<bound method NDFrame.describe of      Unnamed: 0        lon        lat   Crop Country (Admin 0)  \
0             0 -62.663466 -19.297688  Maize           Bolivia   
1             1 -62.955466 -19.531271  Maize           Bolivia   
2             2 -62.807133 -19.596771  Maize           Bolivia   
3             3 -62.991550 -19.735437  Maize           Bolivia   
4             4 -62.974216 -19.668354  Maize           Bolivia   
..          ...        ...        ...    ...               ...   
995         995 -63.332133 -22.448597  Maize         Argentina   
996         996 -63.297800 -22.399431  Maize         Argentina   
997         997 -63.287133 -22.455764  Maize         Argentina   
998         998 -63.261133 -22.404847  Maize         Argentina   
999         999 -63.293633 -22.396264  Maize         Argentina   

    State (Admin 1)        Region (World Bank)      Subregion      Continent  \
0        Santa Cruz  Latin America & Caribbean  South America  South America   
1        Sant

In [26]:
def create_gee_point(row):
    return ee.Geometry.Point([row['lon'], row['lat']])

df['GEE_Point'] = df.apply(create_gee_point, axis=1)

In [27]:
def extract_lst(point, start_date, end_date):
    lst = ee.ImageCollection("MODIS/061/MOD11A1") \
            .select('LST_Day_1km') \
            .filterDate(start_date, end_date) \
            .filterBounds(point)
    
    def get_lst(image):
        lst_value = image.reduceRegion(reducer=ee.Reducer.mean(), geometry=point, scale=1000)
        return ee.Feature(None, {'LST_Day_1km': lst_value.get('LST_Day_1km'), 'date': image.date().format('YYYY-MM-dd')})
    
    lst_series = lst.map(get_lst).getInfo()
    lst_data = {}
    for feature in lst_series['features']:
        properties = feature['properties']
        date = properties['date']
        lst_data[date] = properties.get('LST_Day_1km', None)  # Use .get to avoid KeyError
    
    return lst_data


In [28]:
lst_data = {}
for index, row in df.iterrows():
    point = row['GEE_Point']
    point_id = f"Point_{index}"
    lst_data[point_id] = extract_lst(point, i_date, f_date)

In [29]:
for point_id, data in lst_data.items():
    for date, temp in data.items():
        if temp is not None:
            data[date] = (temp * 0.02) - 273.15
        else:
            data[date] = None

In [None]:
all_dates = pd.date_range(start=i_date, end=f_date, freq='D').strftime('%Y-%m-%d')

# Initialize an empty DataFrame with dates as the index
temperature_df = pd.DataFrame(index=all_dates)

# Populate the DataFrame with the extracted LST data
for point_id, data in lst_data.items():
    temperature_df[point_id] = pd.Series(data)
temperature_df.dropna(inplace=True)

In [33]:
from sklearn.decomposition import PCA
print(temperature_df.head())

pca = PCA(n_components=10)  # Adjust the number of components as needed
principal_components = pca.fit_transform(temperature_df)

# Create a new DataFrame with the principal components
pca_columns = [f'PC{i+1}' for i in range(principal_components.shape[1])]
pca_df = pd.DataFrame(principal_components, index=temperature_df.index, columns=pca_columns)

# Check the DataFrame with principal components
print(pca_df.head())

            Point_0  Point_1  Point_2  Point_3  Point_4  Point_5  Point_6  \
2018-02-22    36.37    29.15    30.11    28.43    28.99    29.59    32.19   
2018-05-13    31.31    28.81    28.21    29.57    27.29    27.55    30.87   
2018-05-20    28.39    24.83    23.63    26.59    24.03    24.55    26.69   
2018-08-10    29.67    28.87    27.03    30.13    29.85    29.87    32.89   
2018-08-12    34.53    32.21    31.53    33.21    33.57    32.99    35.11   

            Point_7  Point_8  Point_9  ...  Point_990  Point_991  Point_992  \
2018-02-22    30.27    31.75    31.97  ...      32.11      28.23      32.11   
2018-05-13    28.55    28.97    29.77  ...      27.35      25.01      27.35   
2018-05-20    25.45    25.31    26.55  ...      22.91      21.53      22.91   
2018-08-10    29.93    32.15    30.99  ...      26.85      23.07      26.85   
2018-08-12    33.11    34.81    34.05  ...      31.61      27.33      31.61   

            Point_993  Point_994  Point_995  Point_996  Point_