In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv
/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv


In [2]:
# Read the competition CSVs; the 'Unnamed: 0' column is dropped from the
# training frame right after loading.
train = pd.read_csv('/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv')
train = train.drop(columns=['Unnamed: 0'])
test = pd.read_csv('/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv')

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,ID,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,1,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,2,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,-1670.59,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,4,water,58.0174,-1599.16,,-1052.63,,-1564.63,,729.79,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,5,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,683.254,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,8,water,1136.44,,,1647.83,1935.8,,2158.98,,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [4]:
# Collect every training column whose name contains '_N' (the NDVI readings)
# and take independent copies of those columns from both frames.
ndvi_cols = [name for name in train.columns if '_N' in name]
train_ndvi, test_ndvi = (frame[ndvi_cols].copy() for frame in (train, test))

In [5]:
# Advanced Imputation
def fill_missing_ndvi(frame):
    """Return a copy of ``frame`` with gaps filled along each row.

    Missing values are first linearly interpolated across the columns of a
    row; a backward fill followed by a forward fill (both along the row) then
    covers whatever the interpolation left empty. The input frame is not
    modified.
    """
    return (
        frame
        .interpolate(method='linear', axis=1)
        .bfill(axis=1)
        .ffill(axis=1)
    )


# Same imputation for both splits; rebinding (instead of inplace=True) keeps
# the step a single expression and safe to re-run.
train_ndvi = fill_missing_ndvi(train_ndvi)
test_ndvi = fill_missing_ndvi(test_ndvi)

In [6]:
# Smooth with rolling mean (denoising)
# DataFrame.rolling(axis=1) is deprecated in pandas, so the window is rolled
# down the rows of the transposed frame and the result is transposed back.
# min_periods=1 means the first columns are averaged over however many
# values are available instead of becoming NaN.
window = 5
train_smoothed = train_ndvi.T.rolling(window=window, min_periods=1).mean().T
test_smoothed = test_ndvi.T.rolling(window=window, min_periods=1).mean().T

  train_smoothed = train_ndvi.rolling(window=window, axis=1, min_periods=1).mean()
  test_smoothed = test_ndvi.rolling(window=window, axis=1, min_periods=1).mean()


In [7]:
# Enhanced Feature Engineering
def create_features(df):
    """Summarise every row of a numeric time-series frame as a feature vector.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample and one numeric column per time step. Columns are
        used in their given order and treated as equally spaced (positions
        0, 1, 2, ...). The frame is expected to be free of missing values
        (the linear fit is not NaN-aware).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with the columns, in this order: mean, std, min,
        max, median, range, q25, q75, slope, intercept, trend_strength,
        rolling_avg_3, rolling_avg_5, rolling_std_5, mean_abs_change,
        max_abs_change, n_peaks.
    """
    features = pd.DataFrame(index=df.index)
    values = df.to_numpy(dtype=float)

    # Basic stats
    features['mean'] = df.mean(axis=1)
    features['std'] = df.std(axis=1)
    features['min'] = df.min(axis=1)
    features['max'] = df.max(axis=1)
    features['median'] = df.median(axis=1)
    features['range'] = features['max'] - features['min']
    features['q25'] = df.quantile(0.25, axis=1)
    features['q75'] = df.quantile(0.75, axis=1)

    # Advanced time-series features
    # np.polyfit accepts a 2-D y with one data set per column, so a single
    # call fits a straight line to every row at once and yields both
    # coefficients (previously two separate fits were run per row).
    x = np.arange(values.shape[1])
    slope, intercept = np.polyfit(x, values.T, 1)
    features['slope'] = slope
    features['intercept'] = intercept
    # The small constant keeps the ratio finite for constant rows (std == 0).
    features['trend_strength'] = features['slope'] / (features['std'] + 1e-6)

    # Rolling features: statistics of the window ending at the last column.
    # Rolling along axis=1 is deprecated in pandas, so roll down the rows of
    # the transposed frame and take its final row instead.
    by_step = df.T
    features['rolling_avg_3'] = by_step.rolling(window=3).mean().iloc[-1]
    features['rolling_avg_5'] = by_step.rolling(window=5).mean().iloc[-1]
    features['rolling_std_5'] = by_step.rolling(window=5).std().iloc[-1]

    # Change features
    step_change = df.diff(axis=1).abs()
    features['mean_abs_change'] = step_change.mean(axis=1)
    features['max_abs_change'] = step_change.max(axis=1)

    # Peak/valley detection: count the positions where the sign of the
    # step-to-step difference decreases.
    sign_change = np.diff(np.sign(np.diff(values, axis=1)), axis=1)
    features['n_peaks'] = (sign_change < 0).sum(axis=1)

    return features

# Training labels, plus one engineered feature row per sample for each split.
y_train = train['class']
X_train, X_test = (
    create_features(smoothed) for smoothed in (train_smoothed, test_smoothed)
)

  features['rolling_avg_3'] = df.rolling(window=3, axis=1).mean().iloc[:, -1]
  features['rolling_avg_5'] = df.rolling(window=5, axis=1).mean().iloc[:, -1]
  features['rolling_std_5'] = df.rolling(window=5, axis=1).std().iloc[:, -1]
  features['rolling_avg_3'] = df.rolling(window=3, axis=1).mean().iloc[:, -1]
  features['rolling_avg_5'] = df.rolling(window=5, axis=1).mean().iloc[:, -1]
  features['rolling_std_5'] = df.rolling(window=5, axis=1).std().iloc[:, -1]


In [8]:
# Learn on the training features which columns pass the 0.01 variance
# threshold, then keep the same columns in both splits.
selector = VarianceThreshold(threshold=0.01).fit(X_train)
X_train, X_test = selector.transform(X_train), selector.transform(X_test)

In [9]:
# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Logistic regression with 5-fold cross-validated selection of C.
# `multi_class` is intentionally not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver on a target with
# more than two classes the multinomial formulation is what gets fitted anyway.
# NOTE(review): the recorded run selected C=10, the upper edge of this grid;
# consider extending `Cs` upwards to confirm the optimum is not beyond it.
model = LogisticRegressionCV(
    Cs=[0.1, 1, 10],
    cv=5,
    max_iter=1000,
    penalty='l2',
    solver='lbfgs',
    random_state=42,
    n_jobs=-1
)
model.fit(X_train_scaled, y_train)
print("Best C:", model.C_)

Best C: [10. 10. 10. 10. 10. 10.]


In [11]:
# `model` is already fitted: LogisticRegressionCV refits on the full training
# set with the selected C as part of fit() (refit=True is the default).
# Calling fit() again would only repeat the whole cross-validation, so the
# fitted estimator is simply given its downstream name.
best_model = model

In [12]:
# Predict a class for every test row and save it next to the row's ID.
y_test_pred = best_model.predict(X_test_scaled)

submission = test[['ID']].assign(**{'class': y_test_pred})
submission.to_csv('submission.csv', index=False)