## Loading Data

In [2]:
import pandas as pd

# Load the dataset (replace 'file_path.csv' with the actual path)
df = pd.read_csv(r"D:\Downloads\Projects-20240722T093004Z-001\Projects\forest_cover_prediction\forest_cover_prediction\train.csv")

# Display basic info
print(df.info())

# Show the first few rows
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 56 columns):
 #   Column                              Non-Null Count  Dtype
---  ------                              --------------  -----
 0   Id                                  15120 non-null  int64
 1   Elevation                           15120 non-null  int64
 2   Aspect                              15120 non-null  int64
 3   Slope                               15120 non-null  int64
 4   Horizontal_Distance_To_Hydrology    15120 non-null  int64
 5   Vertical_Distance_To_Hydrology      15120 non-null  int64
 6   Horizontal_Distance_To_Roadways     15120 non-null  int64
 7   Hillshade_9am                       15120 non-null  int64
 8   Hillshade_Noon                      15120 non-null  int64
 9   Hillshade_3pm                       15120 non-null  int64
 10  Horizontal_Distance_To_Fire_Points  15120 non-null  int64
 11  Wilderness_Area1                    15120 non-null  int64
 12  Wild

In [3]:
# Check for missing values
print(df.isnull().sum())


Id                                    0
Elevation                             0
Aspect                                0
Slope                                 0
Horizontal_Distance_To_Hydrology      0
Vertical_Distance_To_Hydrology        0
Horizontal_Distance_To_Roadways       0
Hillshade_9am                         0
Hillshade_Noon                        0
Hillshade_3pm                         0
Horizontal_Distance_To_Fire_Points    0
Wilderness_Area1                      0
Wilderness_Area2                      0
Wilderness_Area3                      0
Wilderness_Area4                      0
Soil_Type1                            0
Soil_Type2                            0
Soil_Type3                            0
Soil_Type4                            0
Soil_Type5                            0
Soil_Type6                            0
Soil_Type7                            0
Soil_Type8                            0
Soil_Type9                            0
Soil_Type10                           0


In [4]:
# Display summary statistics
print(df.describe(include="all"))


                Id     Elevation        Aspect         Slope  \
count  15120.00000  15120.000000  15120.000000  15120.000000   
mean    7560.50000   2749.322553    156.676653     16.501587   
std     4364.91237    417.678187    110.085801      8.453927   
min        1.00000   1863.000000      0.000000      0.000000   
25%     3780.75000   2376.000000     65.000000     10.000000   
50%     7560.50000   2752.000000    126.000000     15.000000   
75%    11340.25000   3104.000000    261.000000     22.000000   
max    15120.00000   3849.000000    360.000000     52.000000   

       Horizontal_Distance_To_Hydrology  Vertical_Distance_To_Hydrology  \
count                      15120.000000                    15120.000000   
mean                         227.195701                       51.076521   
std                          210.075296                       61.239406   
min                            0.000000                     -146.000000   
25%                           67.000000         

## Separate feature and target

In [5]:
# Assume 'Cover_Type' is the target variable
X = df.drop('Cover_Type', axis=1)
y = df['Cover_Type']



In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


## Train Test Split

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42)


## Model Training

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

# Predictions
y_pred = lr.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.67      0.69      0.68       421
           2       0.65      0.54      0.59       438
           3       0.63      0.52      0.57       428
           4       0.77      0.88      0.82       449
           5       0.71      0.80      0.75       416
           6       0.61      0.63      0.62       432
           7       0.91      0.91      0.91       440

    accuracy                           0.71      3024
   macro avg       0.71      0.71      0.71      3024
weighted avg       0.71      0.71      0.71      3024



In [10]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

print("Random Forest Accuracy:", rf.score(X_test, y_test))


Random Forest Accuracy: 0.8809523809523809


In [12]:
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score


In [15]:
# ✅ Encode labels: 1–7 to 0–6
y_train_encoded = y_train - 1
y_test_encoded = y_test - 1

# Initialize and train XGBoost model
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=7,  # Now it's 7 classes: 0–6
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

xgb_model.fit(X_train, y_train_encoded)

# Predict
y_pred_encoded = xgb_model.predict(X_test)

# ✅ Decode predictions back: 0–6 → 1–7
y_pred = y_pred_encoded + 1

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))













Accuracy: 0.8432539682539683
Classification Report:
               precision    recall  f1-score   support

           1       0.76      0.76      0.76       421
           2       0.79      0.65      0.71       438
           3       0.82      0.79      0.80       428
           4       0.92      0.98      0.95       449
           5       0.85      0.95      0.90       416
           6       0.81      0.82      0.81       432
           7       0.93      0.96      0.94       440

    accuracy                           0.84      3024
   macro avg       0.84      0.84      0.84      3024
weighted avg       0.84      0.84      0.84      3024

