# Forest Cover Type Prediction - Neural Network

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Baseline experiment: one hidden layer of 128 units with logistic activation.
# NOTE(review): this cell has no execution count and its recorded accuracy
# (0.87683) differs from the 0.89019 recorded under the following cell, which
# uses the same seeded configuration. The output here may be stale -- re-run
# to confirm.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='logistic',
    hidden_layer_sizes=(128,),
    learning_rate='constant',
    max_iter=1000,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))




NN Validation Accuracy: 0.87683
              precision    recall  f1-score   support

           1       0.86      0.89      0.88     12710
           2       0.90      0.88      0.89     16998
           3       0.85      0.90      0.87      2145
           4       0.84      0.79      0.82       165
           5       0.72      0.66      0.69       570
           6       0.76      0.73      0.75      1042
           7       0.92      0.89      0.90      1231

    accuracy                           0.88     34861
   macro avg       0.84      0.82      0.83     34861
weighted avg       0.88      0.88      0.88     34861



In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: one hidden layer of 128 units, logistic activation, adam solver.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='logistic',
    hidden_layer_sizes=(128,),
    learning_rate='constant',
    max_iter=1000,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.89019
              precision    recall  f1-score   support

           1       0.90      0.87      0.89     12710
           2       0.89      0.92      0.91     16998
           3       0.89      0.88      0.88      2145
           4       0.86      0.78      0.82       165
           5       0.80      0.63      0.71       570
           6       0.78      0.80      0.79      1042
           7       0.92      0.90      0.91      1231

    accuracy                           0.89     34861
   macro avg       0.86      0.82      0.84     34861
weighted avg       0.89      0.89      0.89     34861



In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: same single 128-unit hidden layer, but with tanh activation.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(128,),
    learning_rate='constant',
    max_iter=1000,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.89444
              precision    recall  f1-score   support

           1       0.89      0.89      0.89     12710
           2       0.91      0.91      0.91     16998
           3       0.90      0.87      0.89      2145
           4       0.84      0.77      0.80       165
           5       0.75      0.71      0.73       570
           6       0.76      0.83      0.80      1042
           7       0.94      0.92      0.93      1231

    accuracy                           0.89     34861
   macro avg       0.85      0.84      0.85     34861
weighted avg       0.89      0.89      0.89     34861



In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: single 128-unit hidden layer with relu activation.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='relu',
    hidden_layer_sizes=(128,),
    learning_rate='constant',
    max_iter=1000,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.87212
              precision    recall  f1-score   support

           1       0.89      0.84      0.86     12710
           2       0.87      0.92      0.89     16998
           3       0.85      0.87      0.86      2145
           4       0.85      0.78      0.81       165
           5       0.75      0.59      0.66       570
           6       0.76      0.69      0.72      1042
           7       0.90      0.90      0.90      1231

    accuracy                           0.87     34861
   macro avg       0.84      0.80      0.82     34861
weighted avg       0.87      0.87      0.87     34861



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: tanh activation with learning_rate switched to 'adaptive'.
# NOTE(review): the recorded output of this run (accuracy 0.89444 and the full
# report) is identical to the learning_rate='constant' tanh run. scikit-learn
# documents `learning_rate` as only used when solver='sgd', so it is
# presumably ignored with solver='adam' -- confirm before drawing conclusions.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(128,),
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.89444
              precision    recall  f1-score   support

           1       0.89      0.89      0.89     12710
           2       0.91      0.91      0.91     16998
           3       0.90      0.87      0.89      2145
           4       0.84      0.77      0.80       165
           5       0.75      0.71      0.73       570
           6       0.76      0.83      0.80      1042
           7       0.94      0.92      0.93      1231

    accuracy                           0.89     34861
   macro avg       0.85      0.84      0.85     34861
weighted avg       0.89      0.89      0.89     34861



Adding a second hidden layer.

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: two hidden layers (128 and 64 units) with tanh activation.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(128, 64),
    # NOTE(review): presumably ignored unless solver='sgd' -- confirm.
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.92028
              precision    recall  f1-score   support

           1       0.92      0.93      0.92     12710
           2       0.94      0.93      0.93     16998
           3       0.90      0.90      0.90      2145
           4       0.74      0.82      0.78       165
           5       0.78      0.80      0.79       570
           6       0.83      0.83      0.83      1042
           7       0.92      0.92      0.92      1231

    accuracy                           0.92     34861
   macro avg       0.86      0.87      0.87     34861
weighted avg       0.92      0.92      0.92     34861



Adding a third hidden layer and setting the regularization factor alpha explicitly to try to increase accuracy (max_iter stays at 1000 in this cell).

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: three hidden layers (128, 64, 32) with an explicit alpha.
# NOTE(review): alpha=0.0001 looks like scikit-learn's documented default, and
# max_iter is still 1000 here, so the only effective change from the two-layer
# run may be the extra layer -- confirm.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(128, 64, 32),
    alpha=0.0001,
    # NOTE(review): presumably ignored unless solver='sgd' -- confirm.
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.92373
              precision    recall  f1-score   support

           1       0.93      0.92      0.92     12710
           2       0.93      0.94      0.94     16998
           3       0.91      0.91      0.91      2145
           4       0.82      0.76      0.79       165
           5       0.77      0.80      0.78       570
           6       0.82      0.86      0.84      1042
           7       0.93      0.93      0.93      1231

    accuracy                           0.92     34861
   macro avg       0.87      0.87      0.87     34861
weighted avg       0.92      0.92      0.92     34861



Increasing alpha and the hidden-layer widths, and also increasing max iterations, to improve accuracy.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: wider three-layer network (256, 128, 64) with alpha=0.0005 and
# an iteration cap of 1500.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(256, 128, 64),
    alpha=0.0005,
    # NOTE(review): presumably ignored unless solver='sgd' -- confirm.
    learning_rate='adaptive',
    max_iter=1500,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.93032
              precision    recall  f1-score   support

           1       0.93      0.93      0.93     12710
           2       0.94      0.95      0.94     16998
           3       0.92      0.90      0.91      2145
           4       0.83      0.81      0.82       165
           5       0.83      0.78      0.81       570
           6       0.84      0.87      0.86      1042
           7       0.95      0.93      0.94      1231

    accuracy                           0.93     34861
   macro avg       0.89      0.88      0.89     34861
weighted avg       0.93      0.93      0.93     34861



Adding the early-stopping parameter to reduce overfitting, increasing alpha further, adding one more layer, and also increasing max iterations to improve accuracy.

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: four hidden layers (256, 128, 128, 64), alpha=0.001, iteration
# cap of 2000, and early stopping enabled.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(256, 128, 128, 64),
    alpha=0.001,
    early_stopping=True,
    # NOTE(review): presumably ignored unless solver='sgd' -- confirm.
    learning_rate='adaptive',
    max_iter=2000,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.92846
              precision    recall  f1-score   support

           1       0.93      0.93      0.93     12710
           2       0.94      0.94      0.94     16998
           3       0.92      0.91      0.91      2145
           4       0.86      0.81      0.83       165
           5       0.78      0.80      0.79       570
           6       0.84      0.86      0.85      1042
           7       0.94      0.93      0.94      1231

    accuracy                           0.93     34861
   macro avg       0.89      0.88      0.88     34861
weighted avg       0.93      0.93      0.93     34861



In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: same four-layer early-stopping network, with alpha lowered to
# 0.0005.
df = pd.read_csv('covtype.csv')

# Draw a stratified 30% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.30, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(256, 128, 128, 64),
    alpha=0.0005,
    early_stopping=True,
    # NOTE(review): presumably ignored unless solver='sgd' -- confirm.
    learning_rate='adaptive',
    max_iter=2000,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.92849
              precision    recall  f1-score   support

           1       0.93      0.93      0.93     12710
           2       0.94      0.94      0.94     16998
           3       0.91      0.92      0.92      2145
           4       0.91      0.75      0.82       165
           5       0.84      0.75      0.79       570
           6       0.86      0.84      0.85      1042
           7       0.95      0.91      0.93      1231

    accuracy                           0.93     34861
   macro avg       0.91      0.86      0.88     34861
weighted avg       0.93      0.93      0.93     34861



No significant improvement, so I tried training the model on a 40% subsample of the whole dataset to see the results.

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: three-layer (256, 128, 64) network without early stopping,
# trained on a larger 40% subsample.
# NOTE(review): the validation split is carved out of the subsample, so it
# grows with train_size and is not the same set of rows used to score the
# 30% runs.
df = pd.read_csv('covtype.csv')

# Draw a stratified 40% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.40, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(256, 128, 64),
    alpha=0.0005,
    # NOTE(review): presumably ignored unless solver='sgd' -- confirm.
    learning_rate='adaptive',
    max_iter=1500,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.93759
              precision    recall  f1-score   support

           1       0.95      0.93      0.94     16947
           2       0.94      0.96      0.95     22664
           3       0.93      0.91      0.92      2860
           4       0.83      0.80      0.81       220
           5       0.84      0.82      0.83       759
           6       0.85      0.88      0.87      1390
           7       0.92      0.96      0.94      1641

    accuracy                           0.94     46481
   macro avg       0.89      0.89      0.89     46481
weighted avg       0.94      0.94      0.94     46481



Since increasing the subsample size led to an increase in accuracy, I'm increasing the subsample size further.

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: same three-layer (256, 128, 64) network on a 50% subsample.
# NOTE(review): the validation split is carved out of the subsample, so it
# grows with train_size and is not the same set of rows used to score the
# smaller-subsample runs.
df = pd.read_csv('covtype.csv')

# Draw a stratified 50% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.50, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(256, 128, 64),
    alpha=0.0005,
    # NOTE(review): presumably ignored unless solver='sgd' -- confirm.
    learning_rate='adaptive',
    max_iter=1500,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.94322
              precision    recall  f1-score   support

           1       0.94      0.95      0.94     21184
           2       0.95      0.95      0.95     28330
           3       0.94      0.94      0.94      3576
           4       0.84      0.83      0.84       275
           5       0.86      0.82      0.84       949
           6       0.88      0.88      0.88      1737
           7       0.95      0.93      0.94      2051

    accuracy                           0.94     58102
   macro avg       0.91      0.90      0.90     58102
weighted avg       0.94      0.94      0.94     58102



In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Experiment: same three-layer (256, 128, 64) network on an 80% subsample.
# NOTE(review): the validation split is carved out of the subsample, so it
# grows with train_size and is not the same set of rows used to score the
# smaller-subsample runs. The 20% of rows bound to `_` is never used here.
df = pd.read_csv('covtype.csv')

# Draw a stratified 80% subsample of the rows; the remainder is discarded.
df_sub, _ = train_test_split(
    df, train_size=0.80, stratify=df['Cover_Type'], random_state=42
)

# Separate the label column from the feature columns.
y = df_sub['Cover_Type']
X = df_sub.drop(columns='Cover_Type')

# Stratified 80/20 train/validation split of the subsample.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

nn_model = MLPClassifier(
    solver='adam',
    activation='tanh',
    hidden_layer_sizes=(256, 128, 64),
    alpha=0.0005,
    # NOTE(review): presumably ignored unless solver='sgd' -- confirm.
    learning_rate='adaptive',
    max_iter=1500,
    random_state=42,
)
nn_model.fit(X_train_scaled, y_train)

# Score the fitted network on the validation split.
y_pred_nn = nn_model.predict(X_val_scaled)
print(f"NN Validation Accuracy: {accuracy_score(y_val, y_pred_nn):.5f}")
print(classification_report(y_val, y_pred_nn))


NN Validation Accuracy: 0.95195
              precision    recall  f1-score   support

           1       0.95      0.95      0.95     33894
           2       0.96      0.96      0.96     45328
           3       0.96      0.93      0.94      5721
           4       0.79      0.86      0.82       439
           5       0.87      0.85      0.86      1519
           6       0.89      0.92      0.91      2779
           7       0.96      0.96      0.96      3282

    accuracy                           0.95     92962
   macro avg       0.91      0.92      0.91     92962
weighted avg       0.95      0.95      0.95     92962

