In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Load the forest-fires dataset; the path is resolved relative to the working directory.
data = pd.read_csv('forestfires.csv')

In [18]:
# Missing-value count per column, smallest first (the recorded run shows 0 everywhere).
data.isna().sum().sort_values()

month            0
monthsep         0
monthoct         0
monthnov         0
monthmay         0
monthmar         0
monthjun         0
monthjul         0
monthjan         0
monthfeb         0
monthdec         0
monthaug         0
monthapr         0
daywed           0
daytue           0
daythu           0
daysun           0
daysat           0
daymon           0
dayfri           0
area             0
rain             0
wind             0
RH               0
temp             0
ISI              0
DC               0
DMC              0
FFMC             0
day              0
size_category    0
siz              0
dtype: int64

In [14]:
# Binary target: 'small' -> 0, 'large' -> 1 (surrounding whitespace is stripped first).
data['siz'] = (
    data['size_category']
    .str.strip()
    .map({'small': 0, 'large': 1})
)

In [15]:
# Inspect the encoded target column.
data.loc[:, 'siz']

0      0
1      0
2      0
3      0
4      0
      ..
512    1
513    1
514    1
515    0
516    0
Name: siz, Length: 517, dtype: int64

# Preprocessing the data (scaling and one-hot encoding) and building models


In [25]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Encode the target: 'small' -> 0, 'large' -> 1.
# Whitespace is stripped first so this cell yields the same encoding as the earlier
# `siz` cell; without the strip, padded labels would silently map to NaN.
data['siz'] = data['size_category'].str.strip().map({'small': 0, 'large': 1})

# Rows whose label could not be encoded are removed from X and y with the SAME mask.
# Dropping them from y alone would leave X longer than y and make train_test_split fail.
labelled = data['siz'].notna()

# Select features and target variable.
# NOTE(review): `area` stays in X. If `size_category` was derived from `area`, this is
#   target leakage and inflates every score below -- confirm against the dataset docs.
# NOTE(review): X appears to hold both the raw `month` column and pre-built `month*` /
#   `day*` indicator columns, so month information is probably encoded twice -- confirm.
X = data.loc[labelled].drop(['size_category', 'siz'], axis=1)
y = data.loc[labelled, 'siz'].astype('int64')

# Check the number of samples before the split
print("Number of samples in X:", len(X))
print("Number of samples in y:", len(y))

# Split the data into training and testing sets.
# stratify=y keeps the small/large class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Work on explicit copies: the imputation below assigns into these frames, and writing
# into the objects returned by the split can raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Check the number of samples after the split
print("Number of samples in X_train:", len(X_train))
print("Number of samples in X_test:", len(X_test))
print("Number of samples in y_train:", len(y_train))
print("Number of samples in y_test:", len(y_test))

# Define numeric and categorical features
numeric_features = X.select_dtypes(include=['int', 'float']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Scale the numeric columns and one-hot encode `month`.
# NOTE(review): only `month` is encoded. Any other column in `categorical_features`
#   (presumably `day`) is discarded by ColumnTransformer's default remainder='drop'
#   -- confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), ['month']),
    ])

# Impute missing numeric values with the column mean.
# The imputer is fitted on the training partition only, then applied to the test partition.
imputer = SimpleImputer(strategy='mean')
X_train[numeric_features] = imputer.fit_transform(X_train[numeric_features])
X_test[numeric_features] = imputer.transform(X_test[numeric_features])

# Standardize and one-hot encode the features (fit on train, reuse on test)
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)







Number of samples in X: 517
Number of samples in y: 517
Number of samples in X_train: 413
Number of samples in X_test: 104
Number of samples in y_train: 413
Number of samples in y_test: 104




# Create and train the SVM model using different kernels

## Linear Kernel

In [None]:

# Linear-kernel SVM with default hyper-parameters; fit() returns the fitted estimator.
svm_model = SVC(kernel='linear').fit(X_train_processed, y_train)

# Predicted labels for the held-out test partition
y_pred = svm_model.predict(X_test_processed)

In [21]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Scalar metrics for the current `y_pred`, all computed against `y_test`
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the results in a fixed order
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", classification_rep),
):
    print(label, value)

Accuracy: 0.875
Precision: 1.0
Recall: 0.5357142857142857
F1-score: 0.6976744186046512

Confusion Matrix:
 [[76  0]
 [13 15]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92        76
           1       1.00      0.54      0.70        28

    accuracy                           0.88       104
   macro avg       0.93      0.77      0.81       104
weighted avg       0.89      0.88      0.86       104



## Sigmoid kernel

In [33]:

# Sigmoid-kernel SVM with default hyper-parameters.
# In the recorded run it does worse than the linear kernel
# (accuracy 0.79 vs 0.875, positive-class recall 0.21 vs 0.54).
svm_model = SVC(kernel='sigmoid').fit(X_train_processed, y_train)

# Predicted labels for the held-out test partition
y_pred = svm_model.predict(X_test_processed)


In [34]:
# Scalar metrics for the current `y_pred`, all computed against `y_test`
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the results in a fixed order
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", classification_rep),
):
    print(label, value)

Accuracy: 0.7884615384615384
Precision: 1.0
Recall: 0.21428571428571427
F1-score: 0.35294117647058826

Confusion Matrix:
 [[76  0]
 [22  6]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      1.00      0.87        76
           1       1.00      0.21      0.35        28

    accuracy                           0.79       104
   macro avg       0.89      0.61      0.61       104
weighted avg       0.84      0.79      0.73       104



## RBF (Gaussian) kernel

In [35]:

# RBF (Gaussian) kernel SVM with default hyper-parameters.
# In the recorded run it does worse than the linear kernel
# (accuracy 0.80 vs 0.875, positive-class recall 0.25 vs 0.54).
svm_model = SVC(kernel='rbf').fit(X_train_processed, y_train)

# Predicted labels for the held-out test partition
y_pred = svm_model.predict(X_test_processed)


In [36]:
# Scalar metrics for the current `y_pred`, all computed against `y_test`
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the results in a fixed order
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", classification_rep),
):
    print(label, value)

Accuracy: 0.7980769230769231
Precision: 1.0
Recall: 0.25
F1-score: 0.4

Confusion Matrix:
 [[76  0]
 [21  7]]

Classification Report:
               precision    recall  f1-score   support

           0       0.78      1.00      0.88        76
           1       1.00      0.25      0.40        28

    accuracy                           0.80       104
   macro avg       0.89      0.62      0.64       104
weighted avg       0.84      0.80      0.75       104



# Using Grid Search CV to find best parameters

In [37]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 4 values of C x 3 kernels x 2 gamma settings = 24 combinations
param_grid = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Base estimator whose parameters the search overrides
svm_model = SVC()

# Exhaustive search scored by accuracy with 5-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training partition only
grid_search.fit(X_train_processed, y_train)

# Keep the winning estimator and report its configuration
best_svm_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Predicted labels for the held-out test partition, from the best model
y_pred = best_svm_model.predict(X_test_processed)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}


In [38]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Scalar metrics for the current `y_pred`, all computed against `y_test`
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display the results in a fixed order
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", classification_rep),
):
    print(label, value)

Accuracy: 0.9423076923076923
Precision: 0.9583333333333334
Recall: 0.8214285714285714
F1-score: 0.8846153846153847

Confusion Matrix:
 [[75  1]
 [ 5 23]]

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96        76
           1       0.96      0.82      0.88        28

    accuracy                           0.94       104
   macro avg       0.95      0.90      0.92       104
weighted avg       0.94      0.94      0.94       104



   ### Accuracy: Overall accuracy rose from 0.875 (default linear kernel) to 0.942 after tuning, so a higher proportion of test instances is classified correctly.
   ### Precision: Positive-class (1) precision dipped slightly from 1.0 to 0.9583 (one false positive) but remains high: when the model predicts positive, it is correct most of the time.
   ### Recall: Positive-class recall increased substantially, from 0.5357 to 0.8214, meaning the model now identifies a much larger proportion of the actual positive instances.
   ### F1-score: The positive-class F1-score increased from 0.70 to 0.88, indicating a better balance between precision and recall.