In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in os.walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training CSV into `df` and show it as the cell output.
TRAIN_CSV = "/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv"
df = pd.read_csv(TRAIN_CSV)
df

In [None]:
# Descriptive statistics for the columns of the training frame.
df.describe()

In [None]:
# Column dtypes and non-null counts; a quick first look at where values are missing.
df.info()

In [None]:
# Number of missing values per column before any imputation.
df.isnull().sum()

 # **A lot of null data is present in our dataset, so we need to handle it (missing values are filled with column means below)**

In [None]:
# Impute missing values: each numeric column's NaNs are replaced by that column's mean.
# NOTE(review): after this step the numeric columns presumably contain no NaNs, so the
# row-wise interpolation applied to the NDVI columns later has nothing left to fill —
# confirm which imputation strategy is actually intended.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

In [None]:
# Re-count missing values per column to verify the imputation above.
df.isnull().sum()

In [None]:
# Display the training frame after missing values were filled.
df

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a second
# time raises KeyError because the column is already gone. The test frame is handled
# the same way further down in the notebook.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Display the training frame after dropping the 'Unnamed: 0' column.
df

In [None]:
# Class distribution: bar chart of samples per class, most frequent class first.
import seaborn as sns
import matplotlib.pyplot as plt

class_counts = df['class'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='class', order=class_counts.index, ax=ax)
ax.set_title('Class Distribution')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Share of each class, expressed in percent.
class_pct = df['class'].value_counts(normalize=True) * 100
print("\nClass percentages:\n", class_pct)

In [None]:
# NDVI columns are the ones whose name contains '_N'; the part of the name before the
# first underscore is parsed as a YYYYMMDD date.
ndvi_cols = [name for name in df.columns if '_N' in name]
date_tokens = [name.split('_')[0] for name in ndvi_cols]
dates = pd.to_datetime(date_tokens, format='%Y%m%d')

first_date = dates.min()
last_date = dates.max()

print(f"Time span: {first_date} to {last_date}")
print(f"Total duration: {last_date - first_date}")
print(f"Number of time points: {len(dates)}")
print(f"Approximate frequency: {pd.infer_freq(dates)}")

In [None]:
# Summary statistics of the NDVI columns, printed as plain text.
ndvi_summary = df[ndvi_cols].describe()
print(ndvi_summary)

In [None]:
# One boxplot per sampled time point (every 5th NDVI column), grouped by class.
sample_dates = ndvi_cols[::5]
for date_col in sample_dates:
    date_label = date_col.split("_")[0]
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x='class', y=date_col, ax=ax)
    ax.set_title(f'NDVI Distribution for {date_label}')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

In [None]:
# Overall smallest and largest value across all NDVI columns.
ndvi_values = df[ndvi_cols]
ndvi_low = ndvi_values.min().min()
ndvi_high = ndvi_values.max().max()
print(f"NDVI min: {ndvi_low}")
print(f"NDVI max: {ndvi_high}")

In [None]:
# Single boxplot over every NDVI value (all columns stacked into one Series).
stacked_ndvi = df[ndvi_cols].stack()
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=stacked_ndvi, ax=ax)
ax.set_title('NDVI Value Distribution')
plt.show()

In [None]:
# IQR method: a value is an outlier when it lies more than 1.5 * IQR outside the
# quartiles of its own column; a sample is flagged if any of its NDVI values is.
ndvi_values = df[ndvi_cols]
Q1, Q3 = ndvi_values.quantile(0.25), ndvi_values.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

outlier_mask = ndvi_values.lt(lower_fence) | ndvi_values.gt(upper_fence)
outliers = outlier_mask.any(axis=1)
print(f"Number of outlier samples (IQR method): {outliers.sum()}")

In [None]:
# Cap every NDVI value to a fixed interval (adjust based on your sensor's specs).
# The bounds are deliberately wider than the theoretical range.
phys_min, phys_max = -3000, 3000
df[ndvi_cols] = df[ndvi_cols].clip(lower=phys_min, upper=phys_max)

In [None]:
# Expected NDVI ranges per class (adjust these based on your EDA).
class_limits = {
    'water': (-2500, 800),       # Water can have negative reflectance
    'impervious': (-1000, 2000),  # Urban areas
    'farm': (500, 3000),         # Active vegetation
    'forest': (1000, 3500),      # Dense vegetation
    'grass': (300, 2500),        # Sparse vegetation
    'orchard': (800, 3200)       # Perennial crops
}

# Diagnostic only: count how many NDVI values of each class fall outside its expected
# range. The features are deliberately NOT clipped per class: rewriting features as a
# function of the target label leaks the label into the inputs (e.g. every 'forest'
# value would be forced to >= 1000), which inflates validation scores and cannot be
# reproduced on the unlabeled test set.
out_of_range_counts = {}
for class_name, (c_min, c_max) in class_limits.items():
    class_values = df.loc[df['class'] == class_name, ndvi_cols]
    outside = class_values.lt(c_min) | class_values.gt(c_max)
    out_of_range_counts[class_name] = int(outside.to_numpy().sum())

pd.Series(out_of_range_counts, name='values_outside_expected_range')

In [None]:
# Fraction of samples belonging to each class.
class_share = df['class'].value_counts(normalize=True)
print(class_share)

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report

# df.drop(columns=['ID'], inplace=True)

# Encode the class labels as integers. The 'class' column is overwritten with the
# codes, so label_encoder is needed later to map predictions back to the labels.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['class'])
df['class'] = encoded_labels

# NDVI columns are the ones whose name contains '_N'.
ndvi_cols = [name for name in df.columns if '_N' in name]

# Fill gaps along each row (axis=1), in both directions.
interpolated = df[ndvi_cols].interpolate(axis=1, limit_direction='both')
df[ndvi_cols] = interpolated

# Any value still missing is replaced by the median of its own row. The transpose
# lets fillna match each row's median by the row's index label.
row_medians = df[ndvi_cols].median(axis=1)
df[ndvi_cols] = df[ndvi_cols].T.fillna(row_medians).T

In [None]:
# Train-test split
# NOTE(review): X keeps every column except the label, so an identifier column such
# as 'ID' (if present) is used as a feature — confirm this is intended. Changing the
# feature set here requires the prediction cell to use the same columns.
X = df.drop(columns=['class'])
y = df['class']

# Split first, stratified on the label so both parts keep the class proportions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling (important for logistic regression).
# The scaler is fit on the training split only; fitting it on all rows before the
# split would leak the hold-out set's mean/variance into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Train logistic regression.
# `multi_class` is omitted on purpose: it is deprecated in recent scikit-learn
# releases, and with the lbfgs solver a multiclass target is fit as a multinomial
# model by default, so the fitted model is unchanged.
model = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
    C=1.0,
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate on the hold-out split.
y_pred = model.predict(X_test)

# Report rows are labelled with the original class names instead of integer codes.
target_names = [str(cls) for cls in label_encoder.classes_]

print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(target_names))),
    target_names=target_names
))

In [None]:
# Read the test CSV into `test_data` and show its (rows, columns) shape.
TEST_CSV = "/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv"
test_data = pd.read_csv(TEST_CSV)
test_data.shape

In [None]:


# Drop the 'Unnamed: 0' column if present. Other columns (e.g. an 'ID' column) are
# kept in the test frame.
test_data = test_data.drop(columns=['Unnamed: 0'], errors='ignore')

# NDVI columns only
ndvi_cols = [col for col in test_data.columns if '_N' in col]

# Interpolate missing values row-wise, then fall back to the row median.
test_data[ndvi_cols] = test_data[ndvi_cols].interpolate(axis=1, limit_direction='both')
test_data[ndvi_cols] = test_data[ndvi_cols].T.fillna(test_data[ndvi_cols].median(axis=1)).T

# Apply the same physical-range clip that the training features received
# (phys_min / phys_max are defined in the training clipping cell), so the model
# sees test values on the same bounded scale it was trained on.
# NOTE(review): training NaNs were filled with column means while test NaNs are
# interpolated row-wise — confirm whether the two should use the same strategy.
test_data[ndvi_cols] = test_data[ndvi_cols].clip(lower=phys_min, upper=phys_max)



In [None]:
# The model was trained on standardized features, so the test features must go
# through the same fitted scaler; predicting on the raw frame feeds values on a
# completely different scale. Columns are selected in the order the scaler was
# fit on so the feature layout matches training exactly.
feature_cols = list(scaler.feature_names_in_)
X_submit = scaler.transform(test_data[feature_cols])

# NOTE(review): this overwrites the hold-out labels `y_test` from the evaluation
# cell; the name is kept because the decoding cell below reads `y_test`.
y_test = model.predict(X_submit)

In [None]:
# Display the preprocessed test frame.
test_data

In [None]:
# Map the integer predictions held in y_test back to the original class labels
# using the encoder fitted on the training labels, then display them.
y_decoded = label_encoder.inverse_transform(y_test)
y_decoded

In [None]:
# Build the submission table: one row per test sample with its predicted class.
# The identifiers are read from the test frame itself; the bare name `ID` was never
# defined in this notebook and raised NameError on a fresh kernel.
# NOTE(review): assumes the test CSV has an 'ID' column — confirm against the data.
result = pd.DataFrame({
    'ID': test_data['ID'].to_numpy(),
    'class': y_decoded
})

In [None]:
# Display the submission table.
result

In [None]:
# Write the submission to the current working directory, without the DataFrame index.
result.to_csv("submission.csv", index=False) 