In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

2023-09-10 22:54:58.238696: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [57]:
df = pd.read_csv("./Data/Combined_Lebanon_Snow_Data.csv")

In [53]:
df.head()

Unnamed: 0,longitude,latitude,time,sde
188,35.25,33.0,1993-01-01,0.0
212,35.25,33.1,1993-01-01,0.0
134,35.25,33.2,1993-01-01,0.0
78,35.25,33.3,1993-01-01,0.0
116,35.25,33.4,1993-01-01,0.0


## Data Manipulation

In [41]:
# Convert time to a numerical feature (e.g., number of days since the first date)
df['time'] = pd.to_datetime(df['time'])
df['time'] = (df['time'] - df['time'].min()).dt.days

In [42]:
df.head()

Unnamed: 0,longitude,latitude,time,sde
0,35.65,34.5,0,
1,36.35,34.3,0,0.067378
2,35.95,33.6,0,0.007813
3,36.35,33.7,0,0.004883
4,36.35,33.1,0,0.0


In [43]:
# Sort the DataFrame
df.sort_values(by=['time', 'longitude', 'latitude'], inplace=True)

In [44]:
from sklearn.metrics import pairwise_distances

# Function to find closest points based on spatial-temporal features
def find_closest_points(idx, df, k=5):
    missing_row = df.loc[[idx], :]
    
    # Drop the row with the missing value
    available_data = df.drop(index=idx)
    
    # Calculate pairwise distance
    dists = pairwise_distances(missing_row[['time', 'longitude', 'latitude']], available_data[['time', 'longitude', 'latitude']])
    
    # Get the indices of the closest k points
    closest_points = np.argsort(dists)[:, :k]
    
    return available_data.index[closest_points[0]]

In [45]:
# Identify missing rows
missing_indices = df[df['sde'].isna()].index.tolist()

In [46]:
y, n = 0, 0

# Loop through missing indices and impute them
for idx in missing_indices:
    closest_points = find_closest_points(idx, df)
    
    # Filter out rows with missing 'sde' values
    closest_points = [i for i in closest_points if not np.isnan(df.loc[i, 'sde'])]
    
    # Prepare data for training
    X_train = df.loc[closest_points, ['time', 'longitude', 'latitude']]
    y_train = df.loc[closest_points, 'sde']
    
    # If we have enough data to train, proceed
    if len(y_train) > 0 and not y_train.isna().any():
        y+=1
        # Prepare data for prediction
        X_test = df.loc[[idx], ['time', 'longitude', 'latitude']]
        
        # Train Random Forest Regressor
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        
        # Impute missing value
        predicted_sde = model.predict(X_test)
        df.loc[idx, 'sde'] = predicted_sde[0]
    else:
        n+=1
        # If not enough data to train, you can choose to fill with a default value or other simpler methods
        df.loc[idx, 'sde'] = df['sde'].mean()  # Filling with mean as a fallback


KeyboardInterrupt: 

In [None]:
# Convert time back to datetime if needed
df['time'] = pd.to_timedelta(df['time'], unit='D') + pd.to_datetime('1993-01-01')

In [None]:
df.head()

In [52]:
from sklearn.neighbors import KNeighborsRegressor
# Convert time to a numerical feature (e.g., number of days since the first date)
df['time'] = pd.to_datetime(df['time'])
df['time'] = (df['time'] - df['time'].min()).dt.days

# Sort the DataFrame
df.sort_values(by=['time', 'longitude', 'latitude'], inplace=True)

def batch_impute_missing(df, missing_indices, k=5):
    # Prepare data for training
    available_data = df.drop(index=missing_indices)
    X_available = available_data[['time', 'longitude', 'latitude']]
    y_available = available_data['sde']
    
    # Prepare data for prediction
    X_missing = df.loc[missing_indices, ['time', 'longitude', 'latitude']]
    
    # Train distance-weighted k-NN Regressor
    model = KNeighborsRegressor(n_neighbors=k, weights='distance')
    model.fit(X_available, y_available)
    
    # Impute missing values
    predicted_sde = model.predict(X_missing)
    df.loc[missing_indices, 'sde'] = predicted_sde
    
    return df

# Identify missing rows
missing_indices = df[df['sde'].isna()].index.tolist()

# Batch impute missing values
df = batch_impute_missing(df, missing_indices)

# Convert time back to datetime if needed
df['time'] = pd.to_timedelta(df['time'], unit='D') + pd.to_datetime('1993-01-01')

print(df)

         longitude  latitude       time           sde
188          35.25      33.0 1993-01-01  0.000000e+00
212          35.25      33.1 1993-01-01  0.000000e+00
134          35.25      33.2 1993-01-01  0.000000e+00
78           35.25      33.3 1993-01-01  0.000000e+00
116          35.25      33.4 1993-01-01  0.000000e+00
...            ...       ...        ...           ...
1778296      36.45      34.2 2022-12-30  7.324446e-03
1778299      36.45      34.3 2022-12-30  2.930055e-03
1778289      36.45      34.4 2022-12-30  4.619360e-07
1778272      36.45      34.5 2022-12-30  0.000000e+00
1778359      36.45      34.6 2022-12-30  0.000000e+00

[1778387 rows x 4 columns]


In [59]:
from sklearn.ensemble import GradientBoostingRegressor

# Convert time to a numerical feature (e.g., number of days since the first date)
df['time'] = pd.to_datetime(df['time'])
df['time'] = (df['time'] - df['time'].min()).dt.days

# Sort the DataFrame
df.sort_values(by=['time', 'longitude', 'latitude'], inplace=True)

def batch_impute_missing(df, missing_indices):
    # Prepare data for training
    available_data = df.drop(index=missing_indices)
    X_available = available_data[['time', 'longitude', 'latitude']]
    y_available = available_data['sde']
    
    # Prepare data for prediction
    X_missing = df.loc[missing_indices, ['time', 'longitude', 'latitude']]
    
    # Train Gradient Boosting Regressor
    model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    model.fit(X_available, y_available)
    
    # Impute missing values
    predicted_sde = model.predict(X_missing)
    df.loc[missing_indices, 'sde'] = predicted_sde
    
    return df

# Identify missing rows
missing_indices = df[df['sde'].isna()].index.tolist()

# Batch impute missing values
df = batch_impute_missing(df, missing_indices)

# Convert time back to datetime if needed
df['time'] = pd.to_timedelta(df['time'], unit='D') + pd.to_datetime('1993-01-01')

print(df)


         longitude  latitude       time  sde
188          35.25      33.0 1993-01-01  0.0
299          35.25      33.0 1993-01-01  0.0
637          35.25      33.0 1993-01-01  0.0
861          35.25      33.0 1993-01-01  0.0
1009         35.25      33.0 1993-01-01  0.0
...            ...       ...        ...  ...
1777493      36.45      34.6 1993-01-01  0.0
1777608      36.45      34.6 1993-01-01  0.0
1777754      36.45      34.6 1993-01-01  0.0
1778112      36.45      34.6 1993-01-01  0.0
1778359      36.45      34.6 1993-01-01  0.0

[1778387 rows x 4 columns]


In [12]:
# Sort your dataframe by time, longitude, and latitude
df = df.sort_values(['time', 'longitude', 'latitude'])

In [5]:
# Identify rows with missing 'sde'
missing_rows = df['sde'].isna()

# Prepare data for training the imputation model
X_train = df.loc[~missing_rows, ['time', 'longitude', 'latitude']]
y_train = df.loc[~missing_rows, 'sde']
X_missing = df.loc[missing_rows, ['time', 'longitude', 'latitude']]

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Train the imputer model
imputer_model = RandomForestRegressor(n_estimators=100, random_state=42)
imputer_model.fit(X_train, y_train)

In [7]:
# Impute missing values
predicted_sde = imputer_model.predict(X_missing)
df.loc[missing_rows, 'sde'] = predicted_sde

In [9]:
df.head()

Unnamed: 0,longitude,latitude,time,sde
0,35.65,34.5,0,0.009742
1,36.35,34.3,0,0.067378
2,35.95,33.6,0,0.007813
3,36.35,33.7,0,0.004883
4,36.35,33.1,0,0.0
