<a href="https://colab.research.google.com/github/MichaelMcCarey/WildfiresProject/blob/main/WildfiresProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import pandas as pd
import numpy as np

# Raw-CSV locations of the wildfire datasets hosted in the project's GitHub repo.
training = 'https://raw.githubusercontent.com/MichaelMcCarey/WildfiresProject/main/wildfires_training.csv'
test = 'https://raw.githubusercontent.com/MichaelMcCarey/WildfiresProject/main/wildfires_test.csv'

# Read each CSV into its own DataFrame (training first, then test).
training_data = pd.read_csv(filepath_or_buffer=training)
test_data = pd.read_csv(filepath_or_buffer=test)

In [2]:
# importing algorithms from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics

In [3]:
# Sanity check that both CSVs loaded: show the first rows of each frame,
# followed by each frame's (rows, columns) shape, one item per line.
print(
    training_data.head(),
    test_data.head(),
    training_data.shape,
    test_data.shape,
    sep='\n',
)

  fire  year  temp  humidity  rainfall  drought_code  buildup_index  day  \
0   no  2015    28        59       0.0          8.06           3.47    1   
1   no  2010    30        61       1.3          8.17           4.03    2   
2   no  2009    26        83      13.1          8.08           3.59    3   
3   no  2017    25        87       2.5          7.18           2.42    4   
4   no  2014    28        77       0.0         14.98           4.63    5   

   month  wind_speed  
0      6          19  
1      6          13  
2      6          22  
3      6          15  
4      6          18  
  fire  year  temp  humidity  rainfall  drought_code  buildup_index  day  \
0   no  2015    33        68       4.5          9.12           5.09   19   
1  yes  2009    28        56       0.0         38.17          21.21   12   
2   no  2017    30        64       0.6         15.38           6.24   24   
3   no  2007    23        74       8.3          7.36           2.27   14   
4   no  2017    31       

In [4]:
# Column names split into model inputs (independent variables) and the
# target (dependent variable). Both are lists, so selecting with them
# returns a DataFrame.
independent_cols = [
    'year',
    'temp',
    'humidity',
    'rainfall',
    'drought_code',
    'buildup_index',
    'day',
    'month',
    'wind_speed',
]
dependent_cols = ['fire']

In [16]:
# Training split: feature matrix X and target y, both selected by column list.
X_training, y_training = training_data[independent_cols], training_data[dependent_cols]

In [17]:
# Test split: feature matrix X and target y, both selected by column list.
X_test, y_test = test_data[independent_cols], test_data[dependent_cols]

In [18]:
# Sanity check of the splits: print the shape of each X matrix, then each y,
# one per line.
print(
    X_training.shape,
    X_test.shape,
    y_training.shape,
    y_test.shape,
    sep='\n',
)

(154, 9)
(50, 9)
(154, 1)
(50, 1)


In [19]:
# Baseline: logistic regression with scikit-learn's default settings,
# fitted on the training split.
model = LogisticRegression().fit(X_training, y_training)

# Predict on both splits so training and test accuracy can be compared.
predictions_training, predictions_test = model.predict(X_training), model.predict(X_test)

# Accuracy of the predictions against the true labels of each split.
accuracy_training = metrics.accuracy_score(y_true=y_training, y_pred=predictions_training)
accuracy_test = metrics.accuracy_score(y_true=y_test, y_pred=predictions_test)

print('Accuracy on training data: ', accuracy_training)
print('Accuracy on test data: ', accuracy_test)

Accuracy on training data:  0.8961038961038961
Accuracy on test data:  0.88


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [71]:
# The default fit above warned that y was passed as a column vector and that the
# solver reached its iteration limit. Flatten the targets to 1-D and raise
# max_iter as suggested; this refit is used as the actual "default" accuracy.
#
# y_training / y_test are DataFrames (selected with a list of columns), and a
# DataFrame has no .ravel() method, so convert to a NumPy array first.
# np.asarray(...).ravel() also works if y is already a 1-D array, so this cell
# can be re-run safely and works on a fresh kernel.
y_training = np.asarray(y_training).ravel()
y_test = np.asarray(y_test).ravel()

# new model fixed
model = LogisticRegression(max_iter=10000)  # this is high but it finally got rid of the message for all future code
model.fit(X_training, y_training)

# predictions
predictions_training = model.predict(X_training)
predictions_test = model.predict(X_test)

# accuracy on the predictions
accuracy_training = metrics.accuracy_score(y_training, predictions_training)
accuracy_test = metrics.accuracy_score(y_test, predictions_test)

print(f'Accuracy on training data:  {accuracy_training:.3f}') # using this to limit accuracy results to 3 decimals for neatness
print('Accuracy on test data: ', accuracy_test)

Accuracy on training data:  0.909
Accuracy on test data:  0.9


In [73]:
# Inverse regularisation strengths to try. A wide range was chosen to explore how
# C affects accuracy; the best results were around 1, so 0.5 and 5 were added
# later to see whether they did even better.
C_values = [0.01, 0.1, 0.5, 1, 5, 10, 100]
# The solver options offered by scikit-learn's LogisticRegression.
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

# Print how every (C, solver) combination performs on the training and test data.
for c in C_values:
    for solver in solvers:
        # max_iter is not treated as a hyperparameter: it is kept consistent with
        # the default logistic regression above.
        # random_state is fixed because 'sag', 'saga' and 'liblinear' shuffle the
        # data; without a seed their accuracies can change from run to run.
        model = LogisticRegression(C=c, solver=solver, max_iter=10000, random_state=42)
        model.fit(X_training, y_training)
        accuracy_training = metrics.accuracy_score(y_training, model.predict(X_training))
        accuracy_test = metrics.accuracy_score(y_test, model.predict(X_test))
        print(f"C={c}, solver='{solver}', Accuracy on training data: {accuracy_training:.3f}, Accuracy on test data: {accuracy_test:.3f}")

C=0.01, solver='lbfgs', Accuracy on training data: 0.883, Accuracy on test data: 0.840
C=0.01, solver='liblinear', Accuracy on training data: 0.883, Accuracy on test data: 0.840
C=0.01, solver='newton-cg', Accuracy on training data: 0.883, Accuracy on test data: 0.840
C=0.01, solver='newton-cholesky', Accuracy on training data: 0.870, Accuracy on test data: 0.820
C=0.01, solver='sag', Accuracy on training data: 0.870, Accuracy on test data: 0.820
C=0.01, solver='saga', Accuracy on training data: 0.870, Accuracy on test data: 0.820
C=0.1, solver='lbfgs', Accuracy on training data: 0.896, Accuracy on test data: 0.820
C=0.1, solver='liblinear', Accuracy on training data: 0.896, Accuracy on test data: 0.820
C=0.1, solver='newton-cg', Accuracy on training data: 0.890, Accuracy on test data: 0.820
C=0.1, solver='newton-cholesky', Accuracy on training data: 0.883, Accuracy on test data: 0.820
C=0.1, solver='sag', Accuracy on training data: 0.883, Accuracy on test data: 0.820
C=0.1, solver='sa