In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

class NDVILandCoverClassifier:
    def __init__(self):
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='mean')
        self.label_encoder = LabelEncoder()
        self.model = LogisticRegression(
            max_iter=1000,
            multi_class='ovr',
            random_state=42,
            C=1.0
        )
        self.feature_columns = []
        
    def extract_temporal_features(self, df):
        """Extract meaningful features from NDVI time series"""
        # Get NDVI columns (assuming they end with '_N')
        ndvi_cols = [col for col in df.columns if col.endswith('_N')]
        
        # Store original NDVI data
        ndvi_data = df[ndvi_cols].copy()
        
        # Create feature dataframe
        features = pd.DataFrame(index=df.index)
        
        # Basic statistical features
        features['ndvi_mean'] = ndvi_data.mean(axis=1)
        features['ndvi_std'] = ndvi_data.std(axis=1)
        features['ndvi_min'] = ndvi_data.min(axis=1)
        features['ndvi_max'] = ndvi_data.max(axis=1)
        features['ndvi_range'] = features['ndvi_max'] - features['ndvi_min']
        features['ndvi_median'] = ndvi_data.median(axis=1)
        
        # Percentiles
        features['ndvi_25th'] = ndvi_data.quantile(0.25, axis=1)
        features['ndvi_75th'] = ndvi_data.quantile(0.75, axis=1)
        features['ndvi_iqr'] = features['ndvi_75th'] - features['ndvi_25th']
        
        # Temporal trend features
        features['ndvi_trend'] = ndvi_data.apply(self._calculate_trend, axis=1)
        features['ndvi_seasonality'] = ndvi_data.apply(self._calculate_seasonality, axis=1)
        
        # Growing season features (assume peak growing season is middle of year)
        mid_year_cols = ndvi_cols[len(ndvi_cols)//3:2*len(ndvi_cols)//3]
        if mid_year_cols:
            features['growing_season_mean'] = df[mid_year_cols].mean(axis=1)
            features['growing_season_max'] = df[mid_year_cols].max(axis=1)
        
        # Vegetation vigor (high NDVI values indicate healthy vegetation)
        features['high_ndvi_count'] = (ndvi_data > 0.5).sum(axis=1)
        features['low_ndvi_count'] = (ndvi_data < 0.2).sum(axis=1)
        
        # Missing data indicators
        features['missing_count'] = ndvi_data.isnull().sum(axis=1)
        features['missing_ratio'] = features['missing_count'] / len(ndvi_cols)
        
        # Add original NDVI values (important for logistic regression)
        for col in ndvi_cols:
            features[f'original_{col}'] = df[col]
        
        return features
    
    def _calculate_trend(self, series):
        """Calculate linear trend of NDVI values"""
        valid_data = series.dropna()
        if len(valid_data) < 2:
            return 0
        
        x = np.arange(len(valid_data))
        try:
            slope = np.polyfit(x, valid_data.values, 1)[0]
            return slope
        except:
            return 0
    
    def _calculate_seasonality(self, series):
        """Calculate measure of seasonality"""
        valid_data = series.dropna()
        if len(valid_data) < 3:
            return 0
        
        # Simple seasonality measure: coefficient of variation
        return valid_data.std() / (valid_data.mean() + 1e-8)
    
    def preprocess_data(self, df, is_training=True):
        """Preprocess the data with feature engineering"""
        print("Extracting temporal features...")
        features = self.extract_temporal_features(df)
        
        print("Handling missing values...")
        # Impute missing values
        if is_training:
            features_imputed = pd.DataFrame(
                self.imputer.fit_transform(features),
                columns=features.columns,
                index=features.index
            )
        else:
            features_imputed = pd.DataFrame(
                self.imputer.transform(features),
                columns=features.columns,
                index=features.index
            )
        
        # Store feature columns
        if is_training:
            self.feature_columns = features_imputed.columns.tolist()
        
        print("Scaling features...")
        # Scale features
        if is_training:
            features_scaled = pd.DataFrame(
                self.scaler.fit_transform(features_imputed),
                columns=features_imputed.columns,
                index=features_imputed.index
            )
        else:
            features_scaled = pd.DataFrame(
                self.scaler.transform(features_imputed),
                columns=features_imputed.columns,
                index=features_imputed.index
            )
        
        return features_scaled
    
    def train(self, train_df):
        """Train the model"""
        print("Starting training process...")
        
        # Separate features and target
        X = train_df.drop(['class', 'ID'], axis=1, errors='ignore')
        y = train_df['class']
        
        # Encode labels
        y_encoded = self.label_encoder.fit_transform(y)
        
        # Preprocess features
        X_processed = self.preprocess_data(X, is_training=True)
        
        # Train model
        print("Training logistic regression model...")
        self.model.fit(X_processed, y_encoded)
        
        # Cross-validation score
        cv_scores = cross_val_score(self.model, X_processed, y_encoded, cv=5, scoring='accuracy')
        print(f"Cross-validation accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
        
        return self
    
    def predict(self, test_df):
        """Make predictions"""
        print("Making predictions...")
        
        # Preprocess features
        X_test = test_df.drop(['ID'], axis=1, errors='ignore')
        X_processed = self.preprocess_data(X_test, is_training=False)
        
        # Make predictions
        y_pred_encoded = self.model.predict(X_processed)
        y_pred = self.label_encoder.inverse_transform(y_pred_encoded)
        
        return y_pred
    
    def create_submission(self, test_df, predictions, filename='submission.csv'):
        """Create submission file"""
        submission = pd.DataFrame({
            'ID': test_df['ID'],
            'class': predictions
        })
        
        # Ensure proper formatting
        submission['class'] = submission['class'].str.lower()
        
        submission.to_csv(filename, index=False)
        print(f"Submission saved to {filename}")
        print(f"Submission shape: {submission.shape}")
        print("Class distribution in predictions:")
        print(submission['class'].value_counts())
        
        return submission

def main():
    """Main execution function"""
    print("NDVI Land Cover Classification")
    print("=" * 50)
    
    # Initialize classifier
    classifier = NDVILandCoverClassifier()
    
    try:
        # Load data
        print("Loading training data...")
        train_df = pd.read_csv('/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv')
        print(f"Training data shape: {train_df.shape}")
        
        print("Loading test data...")
        test_df = pd.read_csv('/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv')
        print(f"Test data shape: {test_df.shape}")
        
        # Display basic info
        print("\nClass distribution in training data:")
        print(train_df['class'].value_counts())
        
        print("\nSample NDVI columns:")
        ndvi_cols = [col for col in train_df.columns if col.endswith('_N')]
        print(f"Found {len(ndvi_cols)} NDVI time points")
        print("First few NDVI columns:", ndvi_cols[:5])
        
        # Check for missing values
        print(f"\nMissing values in training data: {train_df.isnull().sum().sum()}")
        print(f"Missing values in test data: {test_df.isnull().sum().sum()}")
        
        # Train model
        classifier.train(train_df)
        
        # Make predictions
        predictions = classifier.predict(test_df)
        
        # Create submission
        submission = classifier.create_submission(test_df, predictions, 'submission.csv')
        
        print("\n" + "=" * 50)
        print("SUCCESS: Model trained and submission file created!")
        print("File: submission.csv")
        print("Format: ID,class")
        print("Ready for Kaggle submission!")
        
    except FileNotFoundError as e:
        print(f"Error: Could not find data files. Please ensure 'train.csv' and 'test.csv' are in the current directory.")
        print("Expected files:")
        print("- train.csv (with columns: ID, class, and NDVI time series)")
        print("- test.csv (with columns: ID and NDVI time series)")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        print("Please check your data format and try again.")

# Alternative function for custom data loading
def train_and_predict_custom(train_path, test_path, submission_path='submission.csv'):
    """
    Custom function for different file paths
    """
    classifier = NDVILandCoverClassifier()
    
    # Load data
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    # Train and predict
    classifier.train(train_df)
    predictions = classifier.predict(test_df)
    
    # Create submission
    submission = classifier.create_submission(test_df, predictions, submission_path)
    
    return submission

if __name__ == "__main__":
    main()

NDVI Land Cover Classification
Loading training data...
Training data shape: (8000, 30)
Loading test data...
Test data shape: (2845, 29)

Class distribution in training data:
class
forest        6159
farm           841
impervious     669
grass          196
water          105
orchard         30
Name: count, dtype: int64

Sample NDVI columns:
Found 27 NDVI time points
First few NDVI columns: ['20150720_N', '20150602_N', '20150517_N', '20150501_N', '20150415_N']

Missing values in training data: 25040
Missing values in test data: 0
Starting training process...
Extracting temporal features...
Handling missing values...
Scaling features...
Training logistic regression model...
Cross-validation accuracy: 0.8946 (+/- 0.0411)
Making predictions...
Extracting temporal features...
Handling missing values...
Scaling features...
Submission saved to submission.csv
Submission shape: (2845, 2)
Class distribution in predictions:
class
forest        1656
farm           512
impervious     391
grass     