In [67]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv
/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv


In [107]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report

# Load data
df = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv")

# Drop ID columns
df = df.drop(columns=['Unnamed: 0', 'ID'])

# Identify NDVI columns (all columns ending with '_N')
ndvi_columns = [col for col in df.columns if col.endswith('_N')]
ndvi_columns_sorted = sorted(ndvi_columns)  # Sort by date

# Interpolate missing NDVI values along the row (time series)
df[ndvi_columns_sorted] = df[ndvi_columns_sorted].interpolate(axis=1, method='linear', limit_direction='both')

# Remove outliers based on IQR for selected NDVI columns
outlier_cols = ['20140813_N', '20140407_N', '20140202_N', '20140509_N']

def get_iqr_bounds(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    return Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

def drop_outliers(df, columns):
    mask = pd.Series(True, index=df.index)
    for col in columns:
        lower, upper = get_iqr_bounds(df[col])
        mask = mask & (df[col] >= lower) & (df[col] <= upper)
    return df.loc[mask]

df_clean = drop_outliers(df, outlier_cols).copy()

print(f"Rows before dropping outliers: {df.shape[0]}")
print(f"Rows after dropping outliers: {df_clean.shape[0]}")

# Feature engineering: add NDVI statistics
df_clean['ndvi_mean'] = df_clean[ndvi_columns_sorted].mean(axis=1)
df_clean['ndvi_std'] = df_clean[ndvi_columns_sorted].std(axis=1)
df_clean['ndvi_min'] = df_clean[ndvi_columns_sorted].min(axis=1)
df_clean['ndvi_max'] = df_clean[ndvi_columns_sorted].max(axis=1)
df_clean['ndvi_median'] = df_clean[ndvi_columns_sorted].median(axis=1)
df_clean['ndvi_range'] = df_clean['ndvi_max'] - df_clean['ndvi_min']

# Encode target 'class'
label_encoder = LabelEncoder()
df_clean['class'] = label_encoder.fit_transform(df_clean['class'])

# Split features and target
X = df_clean.drop(columns=['class'])
y = df_clean['class']

# Scale features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Train-test split (stratify to keep class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize Logistic Regression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs',max_iter=2000)

# Cross-validation with stratified folds
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_scaled, y, cv=skf, scoring='accuracy')

print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

# Train on train set
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Classification report
print(classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
    zero_division=0
))

Rows before dropping outliers: 8000
Rows after dropping outliers: 6696
Cross-validation accuracy scores: [0.91492537 0.91492537 0.91492537 0.92089552 0.92089552 0.90597015
 0.90881913 0.90134529 0.91778774 0.92526158]
Mean CV accuracy: 0.9145751065301295
              precision    recall  f1-score   support

        farm       0.76      0.58      0.66       136
      forest       0.94      0.98      0.96      1115
       grass       0.79      0.66      0.72        35
  impervious       0.60      0.62      0.61        45
     orchard       0.00      0.00      0.00         5
       water       0.00      0.00      0.00         4

    accuracy                           0.91      1340
   macro avg       0.52      0.47      0.49      1340
weighted avg       0.90      0.91      0.91      1340



In [117]:
import pandas as pd
import numpy as np

# Load test data
test_data = pd.read_csv("/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv")

# Save ID column for submission
ID = test_data['ID']

# Drop ID and Unnamed columns if present
test_data = test_data.drop(columns=[col for col in ['ID', 'Unnamed: 0'] if col in test_data.columns])

# Identify and sort NDVI columns
ndvi_columns = [col for col in test_data.columns if col.endswith('_N')]
ndvi_columns_sorted = sorted(ndvi_columns)

# Interpolate missing NDVI values (row-wise)
test_data[ndvi_columns_sorted] = test_data[ndvi_columns_sorted].interpolate(axis=1, method='linear', limit_direction='both')

# Feature engineering: same as training
test_data['ndvi_mean'] = test_data[ndvi_columns_sorted].mean(axis=1)
test_data['ndvi_std'] = test_data[ndvi_columns_sorted].std(axis=1)
test_data['ndvi_min'] = test_data[ndvi_columns_sorted].min(axis=1)
test_data['ndvi_max'] = test_data[ndvi_columns_sorted].max(axis=1)
test_data['ndvi_median'] = test_data[ndvi_columns_sorted].median(axis=1)
test_data['ndvi_range'] = test_data['ndvi_max'] - test_data['ndvi_min']

# Use all features (NDVI + engineered)
X_test = test_data

# Scale using fitted scaler from training
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Predict
y_pred = model.predict(X_test_scaled)

# Decode labels to original class names
y_decoded = label_encoder.inverse_transform(y_pred)

# Create submission DataFrame
submission = pd.DataFrame({
    'ID': ID,
    'class': y_decoded
})

# Save to CSV in required format
submission.to_csv("submission.csv", index=False)

# Preview
print(submission)


        ID    class
0        1   forest
1        2   forest
2        3  orchard
3        4   forest
4        5   forest
...    ...      ...
2840  2841    water
2841  2842    water
2842  2843    water
2843  2844    water
2844  2845    water

[2845 rows x 2 columns]


In [112]:
result.to_csv("submission.csv", index=False)