In [338]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

# Read the raw training and test tables from disk.
train_df = pd.read_csv("hacktrain.csv")
test_df = pd.read_csv("hacktest.csv")

# Keep only the columns whose name does not start with "Unnamed".
keep_train = ~train_df.columns.str.contains('^Unnamed')
keep_test = ~test_df.columns.str.contains('^Unnamed')
train_df = train_df.loc[:, keep_train]
test_df = test_df.loc[:, keep_test]

# NDVI readings are the training columns carrying the "_N" suffix.
ndvi_columns = [name for name in train_df.columns if name.endswith('_N')]

# Base features
def add_base_features(df):
    """Add row-wise summary statistics of the NDVI columns to ``df``.

    The columns listed in the module-level ``ndvi_columns`` are reduced
    along each row. The new columns are written into ``df`` itself (in the
    order listed below) and the same object is returned.
    """
    ndvi = df[ndvi_columns]
    row_min = ndvi.min(axis=1)
    row_max = ndvi.max(axis=1)

    # Insertion order of this dict is the order the columns are appended.
    summaries = {
        'ndvi_mean': ndvi.mean(axis=1),
        'ndvi_std': ndvi.std(axis=1),
        'ndvi_min': row_min,
        'ndvi_max': row_max,
        'ndvi_range': row_max - row_min,
        'ndvi_median': ndvi.median(axis=1),
        'ndvi_skew': ndvi.skew(axis=1),
        # Last NDVI column minus the first one.
        'ndvi_trend': ndvi.iloc[:, -1] - ndvi.iloc[:, 0],
    }
    for column_name, values in summaries.items():
        df[column_name] = values
    return df

# Season features
def add_seasonal_features(df):
    """Add per-season mean and standard deviation of the NDVI columns.

    The month is parsed from characters 4-5 of the part of each NDVI column
    name before the first underscore. Columns are grouped into seasons by
    that month, and for every season that received at least one column a
    ``<season>_mean`` and ``<season>_std`` column is written into ``df``.
    The same ``df`` object is returned.
    """
    season_of_month = {
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer',
        8: 'monsoon', 9: 'monsoon',
        10: 'autumn',
        11: 'winter', 12: 'winter', 1: 'winter', 2: 'winter',
    }
    # Fixed key order determines the order in which columns are appended.
    buckets = {
        name: [] for name in ('spring', 'summer', 'monsoon', 'autumn', 'winter')
    }

    for column in ndvi_columns:
        date_part = column.split('_')[0]
        season = season_of_month.get(int(date_part[4:6]))
        # Months outside 1-12 belong to no season and are left out.
        if season is not None:
            buckets[season].append(column)

    for season, members in buckets.items():
        if not members:
            continue
        df[f'{season}_mean'] = df[members].mean(axis=1)
        df[f'{season}_std'] = df[members].std(axis=1)
    return df

# NDVI trend/difference
def add_trend_features(df):
    """Add differences between consecutive NDVI columns and their summary.

    For each adjacent pair in the module-level ``ndvi_columns`` a
    ``diff_<i>`` column (later minus earlier) is written into ``df``,
    followed by ``trend_mean`` and ``trend_std`` computed row-wise over
    those differences. The same ``df`` object is returned.
    """
    diff_names = []
    for position, (earlier, later) in enumerate(zip(ndvi_columns, ndvi_columns[1:])):
        label = f'diff_{position}'
        df[label] = df[later] - df[earlier]
        diff_names.append(label)

    diffs = df[diff_names]
    df['trend_mean'] = diffs.mean(axis=1)
    df['trend_std'] = diffs.std(axis=1)
    return df

# Moving average smoothing
def add_smooth_features(df):
    """Add row-wise averages of 3-step rolling statistics over the NDVI columns.

    For every row, a rolling window of width 3 (``min_periods=1``) is moved
    across the columns listed in the module-level ``ndvi_columns``:

    * ``ndvi_movavg3`` is the mean of the rolling means.
    * ``ndvi_movstd3`` is the mean of the rolling standard deviations
      (windows holding a single value yield NaN and are skipped by the mean).

    Both columns are written into ``df`` and the same object is returned.
    """
    # Roll down the rows of the transposed frame instead of passing axis=1:
    # column-wise rolling via the ``axis`` argument is deprecated in recent
    # pandas releases, and the transpose gives the same numbers.
    windows = df[ndvi_columns].T.rolling(window=3, min_periods=1)

    # After the transpose each original row is a column, so reducing over
    # axis=0 yields one value per original row, indexed like ``df``.
    df['ndvi_movavg3'] = windows.mean().mean(axis=0)
    df['ndvi_movstd3'] = windows.std().mean(axis=0)
    return df

# Apply all feature engineering
# Every helper writes its new columns into the frame it receives, so the
# return values do not need to be rebound.
for df in [train_df, test_df]:
    add_base_features(df)
    add_seasonal_features(df)
    add_trend_features(df)
    add_smooth_features(df)

# Prepare X/y
y = train_df['class']
feature_cols = train_df.drop(columns=['ID', 'class']).columns
X = train_df[feature_cols]
X_test = test_df[feature_cols]

# Drop low-variance features (standard deviation below 0.01 on the training
# data). The drops return new frames instead of using inplace=True, which
# avoids modifying a selection of train_df/test_df and the resulting
# SettingWithCopyWarning.
low_var_cols = X.columns[X.std() < 0.01]
X = X.drop(columns=low_var_cols)
X_test = X_test.drop(columns=low_var_cols)

# NOTE(review): the imputer, scaler and PCA below are fitted on the whole
# training set before the hold-out split and the cross-validation, so both
# reported scores are measured on rows the preprocessing has already seen.

# Impute missing values from the 3 nearest neighbours
imputer = KNNImputer(n_neighbors=3)
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

# Normalize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# PCA (keep enough components for 95% of the variance)
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Polynomial features (degree=2) on top of the PCA components
poly = PolynomialFeatures(degree=2, include_bias=False)
X_pca_poly = poly.fit_transform(X_pca)
X_test_pca_poly = poly.transform(X_test_pca)

# Encode y as integers; `le` is reused to decode the test predictions
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split train/val (80/20, stratified by class)
X_train, X_val, y_train, y_val = train_test_split(
    X_pca_poly, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Logistic regression (final)
# NOTE(review): newer scikit-learn releases appear to deprecate the
# `multi_class` argument; confirm against the installed version.
model = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    C=50,
    max_iter=3000,
    class_weight='balanced',
    random_state=42
)
model.fit(X_train, y_train)

# Validation accuracy on the held-out 20%
y_val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)
print("🧪 Validation Accuracy:", round(val_acc * 100, 2), "%")

# Cross-validation (5 stratified, shuffled folds over the full training set)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_pca_poly, y_encoded, cv=skf, scoring='accuracy')
print("📊 Cross-Validated Accuracy:", round(np.mean(cv_scores) * 100, 2), "%")

# Final predictions
# NOTE(review): `model` was fitted on X_train only (the 80% split) and is not
# refitted here; presumably cross_val_score works on clones and leaves it
# untouched. Confirm whether refitting on all rows is wanted.
y_test_pred = model.predict(X_test_pca_poly)
y_test_labels = le.inverse_transform(y_test_pred)

# Save submission
submission = pd.DataFrame({'ID': test_df['ID'], 'class': y_test_labels})
submission.to_csv("ndvi_submission_final.csv", index=False)
print("✅ Submission saved as: ndvi_submission_final.csv")


  df['ndvi_movavg3'] = df[ndvi_columns].rolling(window=3, axis=1, min_periods=1).mean().mean(axis=1)
  df['ndvi_movstd3'] = df[ndvi_columns].rolling(window=3, axis=1, min_periods=1).std().mean(axis=1)
  df['ndvi_movavg3'] = df[ndvi_columns].rolling(window=3, axis=1, min_periods=1).mean().mean(axis=1)
  df['ndvi_movstd3'] = df[ndvi_columns].rolling(window=3, axis=1, min_periods=1).std().mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=low_var_cols, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(columns=low_var_cols, inplace=True)


🧪 Validation Accuracy: 94.25 %




📊 Cross-Validated Accuracy: 94.19 %
✅ Submission saved as: ndvi_submission_final.csv


