In [47]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv('hacktrain.csv')

In [49]:
# Feature Engineering
def create_temporal_features(df):
    date_cols = [col for col in df.columns if '_N' in col]
    date_cols.sort(reverse=True)

    for i in range(len(date_cols)-1):
        df[f'diff_{i}'] = df[date_cols[i]] - df[date_cols[i+1]]

    df['row_mean'] = df[date_cols].mean(axis=1)
    df['row_std'] = df[date_cols].std(axis=1)
    df['row_min'] = df[date_cols].min(axis=1)
    df['row_max'] = df[date_cols].max(axis=1)

    return df

df = create_temporal_features(df)

numeric_cols = df.select_dtypes(include=[np.number]).columns
imputer = IterativeImputer(random_state=42)
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

In [50]:
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [51]:
# Drop ID and separate target
X = df.drop(columns=['ID', 'Unnamed: 0', 'class'], errors='ignore')
y_raw = df['class']
le = LabelEncoder()
y = le.fit_transform(y_raw)

In [52]:
# pca
pca = PCA(n_components=0.95, random_state=42)
X = pca.fit_transform(X)

In [53]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Logistic Regression model
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [54]:
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Classification Report on Test Set:
              precision    recall  f1-score   support

        farm       0.48      0.64      0.55       168
      forest       0.97      0.73      0.83      1232
       grass       0.27      0.62      0.38        39
  impervious       0.78      0.78      0.78       134
     orchard       0.03      0.83      0.05         6
       water       0.29      0.71      0.42        21

    accuracy                           0.72      1600
   macro avg       0.47      0.72      0.50      1600
weighted avg       0.88      0.72      0.78      1600



In [55]:
test_df = pd.read_csv('hacktest.csv')
test_ids = test_df['ID']
test_df = create_temporal_features(test_df)
test_df[numeric_cols] = imputer.transform(test_df[numeric_cols])
test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])
X_final = test_df.drop(columns=['ID', 'Unnamed: 0'], errors='ignore')
X_final = pca.transform(X_final)

In [56]:
# predictions 
final_preds = lr.predict(X_final)
final_labels = le.inverse_transform(final_preds)

In [57]:
submission = pd.DataFrame({'ID': test_ids, 'class': final_labels})
submission.to_csv('submission5.csv', index=False)
print("\n✅ submission5.csv saved!")


✅ submission5.csv saved!


In [58]:
df4 = pd.read_csv('submission5.csv')
df4.head()

Unnamed: 0,ID,class
0,1,orchard
1,2,forest
2,3,orchard
3,4,orchard
4,5,forest


In [59]:
value = df4['class'].value_counts()
print(value)

class
forest        1132
farm           599
impervious     395
grass          311
orchard        225
water          183
Name: count, dtype: int64
