In [5]:
# PHASE 1: DATA PREPROCESSING
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Load the Data
# Resolve the CSV location instead of depending on a single machine-specific
# path. Search order: the DEFORESTATION_DATA environment variable (if set),
# a copy in the current working directory, then the originally hard-coded
# Downloads location.
data_candidates = [
    Path(candidate)
    for candidate in (
        os.environ.get("DEFORESTATION_DATA"),
        "deforestation_dataset.csv",
        r"C:\Users\kunal\Downloads\deforestation_dataset.csv",
    )
    if candidate
]
data_path = next((path for path in data_candidates if path.is_file()), None)
if data_path is None:
    searched = ", ".join(str(path) for path in data_candidates)
    raise FileNotFoundError(
        f"deforestation_dataset.csv not found (looked in: {searched}). "
        "Set the DEFORESTATION_DATA environment variable to the file's location."
    )
df = pd.read_csv(data_path)

# 2. Inspect the dataset
df.info()  # info() prints its own summary and returns None, so it is not wrapped in print()
print(df.head())
print(df.isnull().sum())

# 3. Handle missing values
# Numeric gaps are filled with each column's median; any row that still has a
# missing value afterwards (i.e. in a non-numeric column) is dropped.
# NOTE(review): the medians are computed over the whole dataset, before the
# train/test split, so held-out rows influence the imputed values. Consider
# imputing after the split if that matters for the evaluation.
df = df.fillna(df.median(numeric_only=True))
df = df.dropna()  # Drop rows where categorical NA remain

# 4. Encode categorical variables
# Every listed column is cast to string and replaced by integer codes. The
# fitted encoder for each column is kept in le_dict so codes can be mapped
# back to their original labels later.
# NOTE(review): the string cast means numeric-looking values would be ordered
# as text (e.g. "10" before "2") — confirm both columns are truly categorical.
categorical_cols = ['Deforestation_Policy_Strictness', 'Corruption_Index']
le_dict = {
    name: LabelEncoder().fit(df[name].astype(str))
    for name in categorical_cols
}
for name, encoder in le_dict.items():
    df[name] = encoder.transform(df[name].astype(str))

# 5. Feature selection
# You can change the target variable here:
target = 'Tree_Cover_Loss_percent'

# Columns that are never used as model inputs; everything else in df is a
# feature (original column order is preserved).
non_feature_cols = ['Country', 'Year', 'Forest_Loss_Area_km2', 'Tree_Cover_Loss_percent']
features = df.columns.drop(non_feature_cols, errors='ignore').tolist()

X, y = df[features], df[target]

# 6. Split data into train/test
# The split is done BEFORE scaling so that the scaler's mean and standard
# deviation are learned from the training rows only. Fitting the scaler on the
# full dataset would leak test-set statistics into training and make the
# reported test metrics optimistic. The split itself (same random_state, same
# number of rows) selects the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# 7. Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn scaling parameters from training data only
X_test = scaler.transform(X_test_raw)        # reuse the training parameters on held-out data

# Retained for any later code that still expects the fully scaled matrix; it is
# now produced with the training-set scaling parameters.
X_scaled = scaler.transform(X)

# PHASE 2: MODEL BUILDING AND EVALUATION
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Train SVM (linear kernel)
# Fit a linear-kernel support vector regressor on the training split and
# predict the held-out split.
svm_model = SVR(kernel='linear').fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# 2. Evaluation metrics
# All three scores compare the held-out targets with the predictions; RMSE is
# derived from MSE.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)
print(f"MAE: {mae}\nMSE: {mse}\nRMSE: {rmse}\nR^2: {r2}")

# 3. Feature importance (linear kernel SVM)
# Only attempted when the fitted model exposes coefficients. The absolute
# value of each coefficient is used as the feature's importance and the
# features are plotted from largest to smallest.
if hasattr(svm_model, 'coef_'):
    coef_magnitudes = np.abs(svm_model.coef_[0])
    feature_importance = pd.Series(coef_magnitudes, index=features)
    ranked_importance = feature_importance.sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(10, 5))
    ranked_importance.plot(kind='bar', ax=ax)
    ax.set_title('Feature Importance from SVM')
    plt.show()

# PHASE 2.2: HYPERPARAMETER TUNING (Optional)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# The search space is split into two grids: gamma is only varied for the
# rbf/poly kernels. Crossing gamma with the linear kernel would just repeat
# the same linear fit once per gamma value.
params = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]
grid = GridSearchCV(SVR(), params, cv=5, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print("Best Parameters:", grid.best_params_)
# The scorer is negated MSE, so flip the sign to report a plain MSE.
print("Best CV MSE:", -grid.best_score_)

# Score the tuned model on the held-out split so it can be compared with the
# baseline linear model; grid.predict uses the best estimator refitted on the
# whole training split.
y_pred_tuned = grid.predict(X_test)
tuned_mae = mean_absolute_error(y_test, y_pred_tuned)
tuned_mse = mean_squared_error(y_test, y_pred_tuned)
tuned_rmse = np.sqrt(tuned_mse)
tuned_r2 = r2_score(y_test, y_pred_tuned)
print(f"Tuned MAE: {tuned_mae}\nTuned MSE: {tuned_mse}\nTuned RMSE: {tuned_rmse}\nTuned R^2: {tuned_r2}")

# PHASE 3: FEATURE ANALYSIS & INTERPRETATION
# Correlation heatmap
# Pairwise correlations between every model feature and the target, drawn as
# an annotated heatmap.
corr_matrix = df[features + [target]].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

# Plots for relationships
# One scatter plot per feature, each on its own figure, showing the feature
# against the target.
for feature_name in features:
    fig, ax = plt.subplots()
    sns.scatterplot(x=df[feature_name], y=df[target], ax=ax)
    ax.set_title(f"{feature_name} vs {target}")
    plt.show()

# PHASE 4: REPORTING
# Save predictions/results
# Pair each held-out target value with the model's prediction and write the
# two columns to CSV without the row index.
results = y_test.to_frame(name='True').assign(Predicted=y_pred)
results.to_csv('svm_predictions.csv', index=False)


FileNotFoundError: [Errno 2] No such file or directory: 'deforestation_dataset.csv'

In [None]:
Project Report Structure
1. Introduction
Brief context on global deforestation issues.

Objectives: Predict and analyze factors influencing deforestation using SVM.

2. Data Preprocessing
Missing-value strategy: imputed numeric columns with their median, then dropped any rows that still had missing categorical values.

Categorical variables encoded using label encoding.

Standardized all model features (zero mean, unit variance), since SVMs are sensitive to feature scale.

3. Model Development & Evaluation
Trained SVM with linear kernel to predict Tree_Cover_Loss_percent.

Metrics:

MAE: [add value]

MSE: [add value]

RMSE: [add value]

R^2: [add value]

Hyperparameter tuning improved performance ([add if relevant]).

4. Feature Importance & Analysis
Top features influencing deforestation (list from plot).

Correlation analysis shows [highlight relationships].

GDP, population, and policy strictness are expected to emerge as key factors ([confirm against the feature-importance plot and correlation heatmap]).

5. Visualization Highlights
Show/attach plots: Feature importance, heatmaps, scatterplots.

6. Key Findings & Insights
Deforestation is most strongly influenced by [list top features with interpretations].

Countries with high GDP but weak policy enforcement see higher loss ([confirm against your results before reporting]).

Rainfall and population trends correlate with forest loss ([confirm against the correlation heatmap and scatter plots]).

7. Recommendations
Enforce stricter policies, address corruption, and promote sustainable practices.

Invest in education and alternatives for communities dependent on logging.

Monitor trends using predictive analytics for proactive intervention.

8. Conclusion
SVM models can uncover important trends and support better policy decisions for deforestation mitigation.

Deliverables:

Python script or Jupyter Notebook

Report (fill in the sections above with your outcomes and visuals)

Attach svm_predictions.csv as sample output

To Run This Project:

Ensure the dataset file (deforestation_dataset.csv) is available at the location the data-loading step reads it from.

Copy the notebook or script code, run each cell in order, and paste the resulting metric values and plots into the report template.

 