# Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading the Train DataSet

In [None]:
# Load the hackathon training set from the Kaggle input directory.
train_csv_path = '/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv'
train_data = pd.read_csv(train_csv_path)

In [None]:
# Show the freshly loaded training frame as the cell output.
train_data

In [None]:
# 'Unnamed: 0' and 'ID' are not needed as features, so remove them in place
# and preview the remaining columns.
train_data.drop(['Unnamed: 0', 'ID'], axis=1, inplace=True)
train_data.head()

In [None]:
# Print the column dtypes and non-null counts to spot columns with missing values.
train_data.info()

In [None]:
#we can see that all of the NDVI columns contain some null values,
#so we will fill the nulls in each column with that column's mean

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_data.describe()

In [None]:
# Collect the NDVI feature columns, i.e. the columns whose name ends with '_N'.
ndvi_columns = list(filter(lambda name: name.endswith('_N'), train_data.columns))
ndvi_columns

In [None]:
# Replace the missing values of every NDVI column with that column's own mean,
# then preview the first ten rows.
ndvi_column_means = train_data[ndvi_columns].mean()
train_data[ndvi_columns] = train_data[ndvi_columns].fillna(ndvi_column_means)
train_data.head(10)

In [None]:
# Per-column count of missing entries, to confirm that no null values are left.
train_data.isna().sum()

In [None]:
# Count how many rows carry each class label.
train_data['class'].value_counts()

In [None]:
# the forest class dominates heavily, so we need to balance the data
# the balancing is done by passing class_weight='balanced' to LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [None]:
label_encoder = LabelEncoder()
train_data['class'] = label_encoder.fit_transform(train_data['class'])
#label encoded the target variable

In [None]:
X = train_data.drop(columns=['class'])
y = train_data['class']
# preparing training data(features and target)

In [None]:
# Display the feature frame (train_data without the 'class' column).
X

In [None]:
# Display the target used for training.
y

## Feature engineering

In [None]:
X_df = pd.DataFrame(X, columns=X.columns)
X_df['NDVI_mean'] = X_df.mean(axis=1)  #average vegetation level
X_df['NDVI_std'] = X_df.std(axis=1)    #seasonal variation in NDVI
X_df['NDVI_max'] = X_df.max(axis=1)    #peak vegitation level
X_df['NDVI_min'] = X_df.min(axis=1)    #lowest NVDI
X_df['NDVI_range'] = X_df['NDVI_max'] - X_df['NDVI_min']  #NVDI spread
X_df['NDVI_median'] = X_df.median(axis=1) #central tendency,less affected by outliers
X_df['NDVI_q25'] = X_df.quantile(0.25, axis=1)  #lower quartile
X_df['NDVI_q75'] = X_df.quantile(0.75, axis=1)  #upper quartile
X_df['NDVI_iqr'] = X_df['NDVI_q75'] - X_df['NDVI_q25']  #interquartile range
X_df['NDVI_skew'] = X_df.skew(axis=1)    #measures asymmetry of vegetation curve

X_df.head()

In [None]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X_df, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
X_test.head()

In [None]:
print(X_train.shape)
print(X_test.shape)

In [None]:
#we splitted the train data in 80-20 ratio so that we can use it for validation of our model


In [None]:
y_test.head()

In [None]:
print(y_train.shape)
print(y_test.shape)

In [None]:
#after splitting scale the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model and its training

In [None]:
model = LogisticRegression(multi_class='multinomial' , solver='lbfgs',max_iter=1000,C=100,penalty='l2',class_weight='balanced')
model.fit(X_train_scaled,y_train) 
# fitting LR(multinomial)

In [None]:
y_pred = model.predict(X_test_scaled)
y_pred
# predictions made on the X_test(validation)

In [None]:
y_pred_class = label_encoder.inverse_transform(y_pred)
pd.Series(y_pred_class).value_counts()

In [None]:
#we have applied class_weight='balanced' but still the forest is dominating because in the data
#we have forest in large nubmer as LR isn't powerful enough to distinguish classes well just
#by adjusting weights

In [None]:
#verifying the model by f1 and f2 scores and accuracy

from sklearn.metrics import accuracy_score, f1_score, fbeta_score, classification_report


acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
f2 = fbeta_score(y_test, y_pred, beta=2, average='weighted')
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print(f" Accuracy: {acc:.4f}")
print(f" F1 Score: {f1:.4f}")
print(f" F2 Score: {f2:.4f}")
print("\n Classification Report:\n", report)

## Reading the Test DataSet

In [None]:
test_data = pd.read_csv('/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv')
test_data

In [None]:
test_data.isnull().sum()  #no null

In [None]:
test_data.drop(columns=['Unnamed: 0'],inplace=True)
test_data.head(10) #we will drop id later

In [None]:
ID=test_data['ID'] #we will store the ids of these 2845 entries (for the submission csv)
test_data.drop(['ID'],axis=1,inplace=True) #now drop ID

In [None]:
test_df = pd.DataFrame(test_data, columns=test_data.columns)
test_df['NDVI_mean'] = test_df.mean(axis=1)
test_df['NDVI_std'] = test_df.std(axis=1)
test_df['NDVI_max'] = test_df.max(axis=1)
test_df['NDVI_min'] = test_df.min(axis=1)
test_df['NDVI_range'] = test_df['NDVI_max'] - test_df['NDVI_min']
test_df['NDVI_median'] = test_df.median(axis=1)
test_df['NDVI_q25'] = test_df.quantile(0.25, axis=1)
test_df['NDVI_q75'] = test_df.quantile(0.75, axis=1)
test_df['NDVI_iqr'] = test_df['NDVI_q75'] - test_df['NDVI_q25']
test_df['NDVI_skew'] = test_df.skew(axis=1)

test_df.head()

#performing the same feature engineering as train data

## Use the model to predict the class for the Test DataSet

In [None]:
test_data_scaled=scaler.transform(test_df)
y_test = model.predict(test_data_scaled)
y_test

#scaling the test_data

In [None]:
y_decoded = label_encoder.inverse_transform(y_test)
#decoding the values into original class names

In [None]:
# Display the decoded predictions for the test set.
y_decoded

In [None]:
# Pair every stored test ID with its decoded predicted class.
submission_columns = {'ID': ID, 'class': y_decoded}
submission = pd.DataFrame(submission_columns)

In [None]:
# Display the submission frame before writing it to disk.
submission

In [None]:
# Write the submission to 'submission.csv' in the current directory, without the index column.
submission.to_csv('submission.csv',index=False)
