In [None]:
pip install "numpy<2"


In [None]:
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
from scipy.interpolate import interp1d
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Read the hackathon training table first, then the test table, from the
# Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv',
        '/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv',
    )
)

# Preprocessing function
def preprocess_ndvi(data, window_length=5, polyorder=2):
    """Build a gap-filled, smoothed NDVI matrix from a wide table.

    Columns whose name ends with ``'_N'`` are treated as NDVI observations;
    the text before the first underscore is parsed as the acquisition date
    (e.g. ``'20200101_N'``) and the columns are ordered chronologically.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with one row per sample. It is not modified.
    window_length : int, default 5
        Savitzky-Golay window length. If the series has fewer columns than
        this, the window is shrunk to the largest odd length that fits.
    polyorder : int, default 2
        Savitzky-Golay polynomial order.

    Returns
    -------
    ndvi_matrix : numpy.ndarray
        Float array of shape ``(n_rows, n_dates)``.
    sorted_dates : list
        The parsed dates in ascending order, one per matrix column.

    Raises
    ------
    ValueError
        If ``data`` has no column ending in ``'_N'``.

    Notes
    -----
    Rows without a single valid observation are left untouched by the
    gap-filling step, so they still contain NaN afterwards.
    """
    ndvi_cols = [col for col in data.columns if col.endswith('_N')]
    if not ndvi_cols:
        raise ValueError("no NDVI columns (names ending in '_N') found")
    dates = [pd.to_datetime(col.split('_')[0]) for col in ndvi_cols]

    # Sort columns by date
    sorted_cols = [col for _, col in sorted(zip(dates, ndvi_cols))]
    sorted_dates = sorted(dates)

    # Explicit float copy: NaN handling needs a float dtype, and the in-place
    # gap filling below must never write through to the caller's DataFrame.
    ndvi_matrix = data[sorted_cols].to_numpy(dtype=float, copy=True)
    n_cols = ndvi_matrix.shape[1]
    positions = np.arange(n_cols)

    # Fill only the missing entries so observed values stay exactly as given.
    for row in ndvi_matrix:  # each row is a view into ndvi_matrix
        valid_mask = ~np.isnan(row)
        n_valid = np.count_nonzero(valid_mask)
        if n_valid == 0 or n_valid == n_cols:
            continue  # nothing to interpolate from / nothing to fill
        missing_mask = ~valid_mask
        if n_valid == 1:
            # A single observation cannot define a line: repeat it.
            row[missing_mask] = row[valid_mask][0]
        else:
            # Linear interpolation between observations; leading/trailing
            # gaps are linearly extrapolated from the nearest two points.
            f = interp1d(positions[valid_mask], row[valid_mask],
                         kind='linear', fill_value='extrapolate')
            row[missing_mask] = f(positions[missing_mask])

    # Apply Savitzky-Golay filter along the time axis. The filter needs
    # window_length <= n_cols, so shrink the window for short series and skip
    # smoothing when no window larger than polyorder fits.
    if n_cols < window_length:
        window_length = n_cols if n_cols % 2 == 1 else n_cols - 1
        if window_length <= polyorder:
            return ndvi_matrix, sorted_dates
    ndvi_matrix = savgol_filter(ndvi_matrix, window_length=window_length,
                                polyorder=polyorder, axis=1)

    return ndvi_matrix, sorted_dates

# Feature engineering
def extract_features(ndvi_matrix, dates):
    """Summarise each NDVI time series as a row of scalar features.

    Parameters
    ----------
    ndvi_matrix : array-like of shape (n_rows, n_dates)
        One time series per row.
    dates : sequence of objects with a ``month`` attribute
        One date per matrix column, used to assign columns to seasons.

    Returns
    -------
    pandas.DataFrame
        Columns, in this fixed order: ``mean``, ``median``, ``std``, ``min``,
        ``max``, ``range``, ``slope``, then ``mean_<season>`` /
        ``std_<season>`` for every season present in ``dates``, in the order
        winter, spring, summer, fall.

    Raises
    ------
    ValueError
        If ``ndvi_matrix`` is not 2-D or its column count differs from
        ``len(dates)``.
    """
    ndvi_matrix = np.asarray(ndvi_matrix, dtype=float)
    if ndvi_matrix.ndim != 2 or ndvi_matrix.shape[1] != len(dates):
        raise ValueError(
            "ndvi_matrix must be 2-D with one column per entry in dates"
        )

    features = {
        'mean': np.mean(ndvi_matrix, axis=1),
        'median': np.median(ndvi_matrix, axis=1),
        'std': np.std(ndvi_matrix, axis=1),
        'min': np.min(ndvi_matrix, axis=1),
        'max': np.max(ndvi_matrix, axis=1),
        'range': np.ptp(ndvi_matrix, axis=1),
    }

    # Slope (trend): closed-form least-squares slope against the column
    # index, computed for all rows at once. Because the centred index sums to
    # zero, slope = sum(x_c * y) / sum(x_c ** 2). A row containing NaN yields
    # a NaN slope.
    n_rows, n_cols = ndvi_matrix.shape
    x = np.arange(n_cols, dtype=float)
    x_centered = x - x.mean()
    denom = np.sum(x_centered ** 2)
    if denom > 0:
        features['slope'] = ndvi_matrix @ x_centered / denom
    else:
        # A single column has no defined trend.
        features['slope'] = np.full(n_rows, np.nan)

    # Seasonal statistics (meteorological seasons by calendar month).
    season_of_month = {
        12: 'winter', 1: 'winter', 2: 'winter',
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'fall', 10: 'fall', 11: 'fall',
    }
    seasons = [season_of_month[date.month] for date in dates]

    # Iterate in a fixed order: deriving it from set(seasons) would make the
    # column order depend on string hashing and differ between runs.
    for season in ('winter', 'spring', 'summer', 'fall'):
        season_mask = np.array([s == season for s in seasons], dtype=bool)
        if not season_mask.any():
            continue
        season_values = ndvi_matrix[:, season_mask]
        features[f'mean_{season}'] = np.mean(season_values, axis=1)
        features[f'std_{season}'] = np.std(season_values, axis=1)

    return pd.DataFrame(features)

# Process training data
X_train_ndvi, dates = preprocess_ndvi(train_data)
X_train_features = extract_features(X_train_ndvi, dates)
y_train = train_data['class']  # Make sure 'class' column exists

# Process test data. The seasonal features must be built from the test set's
# own column dates: reusing the training dates silently mislabels columns
# whenever the two files do not share exactly the same date columns.
X_test_ndvi, test_dates = preprocess_ndvi(test_data)
X_test_features = extract_features(X_test_ndvi, test_dates)

# The model is fitted on the training columns, so the test frame must offer
# the same features in the same order; fail loudly if any are missing.
missing_features = X_train_features.columns.difference(X_test_features.columns)
if len(missing_features) > 0:
    raise ValueError(
        f"test data is missing features present in training: {list(missing_features)}"
    )
X_test_features = X_test_features[X_train_features.columns]

# Model pipeline: standardise the features, then fit a class-balanced
# logistic regression.
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, class_weight='balanced')
)

# Parameter grid for logistic regression (the step name is the lowercased
# class name generated by make_pipeline).
param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100]
}

# Grid search cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_features, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Predictions on test data
test_preds = best_model.predict(X_test_features)

# Prepare submission
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Make sure 'ID' column exists in test set
    'class': test_preds
})
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")
print(submission)


In [None]:
import os

# Show the current working directory; relative paths such as
# 'submission.csv' resolve against it.
working_dir = os.getcwd()
print(working_dir)
