In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import io
import os
from PIL import Image
import pipeline_utilities as p_util
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Activation, Dropout, BatchNormalization


In [2]:
# Load the validation images. Each sub-folder of valid_dir is one class;
# the folder named 'wildfire' is labelled 1 and every other folder 0.
# (Named valid_dir rather than `dir` so the builtin dir() is not shadowed.)
valid_dir = '../Project_3/Resources/valid/'
X_valid = []
y_valid = []
for direct in os.listdir(valid_dir):
    print("Loading validation dataset {}".format(direct))
    for filename in os.listdir(os.path.join(valid_dir, direct)):
        img_path = os.path.join(valid_dir, direct, filename)
        # The context manager closes the underlying file handle as soon as
        # the pixels have been decoded, instead of leaking one handle per image.
        with Image.open(img_path) as raw_img:
            # Force 3 channels so every sample matches the (32, 32, 3) model
            # input, even if a file is grayscale or carries an alpha channel.
            img = raw_img.convert('RGB').resize((32, 32))
        # Scale pixel values from [0, 255] to [0, 1].
        img = np.array(img) / 255
        X_valid.append(img)
        y_valid.append(1 if direct == 'wildfire' else 0)

Loading validation dataset nowildfire
Loading validation dataset wildfire


In [3]:
# Load the test images. Each sub-folder of test_dir is one class;
# the folder named 'wildfire' is labelled 1 and every other folder 0.
# (Named test_dir rather than `dir` so the builtin dir() is not shadowed.)
test_dir = '../Project_3/Resources/test/'
X_test = []
y_test = []
for direct in os.listdir(test_dir):
    print("Loading testing dataset {}".format(direct))
    for filename in os.listdir(os.path.join(test_dir, direct)):
        img_path = os.path.join(test_dir, direct, filename)
        # The context manager closes the underlying file handle as soon as
        # the pixels have been decoded, instead of leaking one handle per image.
        with Image.open(img_path) as raw_img:
            # Force 3 channels so every sample matches the (32, 32, 3) model
            # input, even if a file is grayscale or carries an alpha channel.
            img = raw_img.convert('RGB').resize((32, 32))
        # Scale pixel values from [0, 255] to [0, 1].
        img = np.array(img) / 255
        X_test.append(img)
        y_test.append(1 if direct == 'wildfire' else 0)

Loading testing dataset nowildfire
Loading testing dataset wildfire


In [4]:
# Load the training images. Each sub-folder of train_dir is one class;
# the folder named 'wildfire' is labelled 1 and every other folder 0.
# (Named train_dir rather than `dir` so the builtin dir() is not shadowed.)
train_dir = '../Project_3/Resources/train/'
X_train = []
y_train = []
for direct in os.listdir(train_dir):
    print("Loading training dataset {}".format(direct))
    for filename in os.listdir(os.path.join(train_dir, direct)):
        img_path = os.path.join(train_dir, direct, filename)
        try:
            # Opening is inside the try as well, so a file that cannot even be
            # opened is skipped the same way as one that fails while decoding.
            with Image.open(img_path) as raw_img:
                # Force 3 channels so every sample matches the (32, 32, 3)
                # model input.
                img = raw_img.convert('RGB').resize((32, 32))
            # Scale pixel values from [0, 255] to [0, 1].
            img = np.array(img) / 255
        except (OSError, ValueError):
            # Best-effort load: report the unreadable/corrupt file and move on.
            # Only image/IO errors are caught, so KeyboardInterrupt and
            # programming errors are no longer silently swallowed.
            print(img_path)
            continue
        X_train.append(img)
        y_train.append(1 if direct == 'wildfire' else 0)

Loading training dataset nowildfire
../Project_3/Resources/train/nowildfire\-114.152378,51.027198.jpg
Loading training dataset wildfire


In [5]:
# Turn the Python lists built by the loader cells into NumPy arrays,
# one (features, labels) pair per split.
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)
X_valid, y_valid = np.array(X_valid), np.array(y_valid)

# Display the training tensor's shape as the cell output.
X_train.shape

(30249, 32, 32, 3)

In [6]:
# Step 1: Build the CNN model
# Binary classifier over 32x32 RGB images (label 1 = wildfire, 0 = anything else).
cnn_model = Sequential([
    # Declare the input shape with an explicit Input object. Passing
    # input_shape= to the first Conv2D makes Keras emit a warning
    # recommending exactly this form for Sequential models.
    keras.Input(shape=(32, 32, 3)),

    # Convolution stage 1 (the first conv uses the default 'valid' padding).
    Conv2D(32, (3, 3), activation='relu'),
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),

    # Convolution stage 2
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),

    # Convolution stage 3
    Conv2D(256, (3, 3), activation='relu', padding='same'),
    Conv2D(256, (3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),

    # Final convolutions, no pooling before flattening
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    Conv2D(128, (3, 3), activation='relu', padding='same'),

    Flatten(),

    # Fully connected head
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    BatchNormalization(),

    Dense(1, activation='sigmoid')  # Single sigmoid unit: probability of the positive (wildfire) class
])

# Step 2: Compile the model
cnn_model.compile(optimizer='adam',
              loss='binary_crossentropy',  # Loss for two-class problems with a single sigmoid output
              metrics=['accuracy'])

cnn_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [7]:
# Train the model
# batch_size was previously defined but never handed to fit(), so editing it
# had no effect; it is now passed explicitly. 32 is also Keras' default, so
# the number of steps per epoch is unchanged.
batch_size = 32
epochs = 5
history = cnn_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    batch_size=batch_size,
    epochs=epochs
)

Epoch 1/5
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 229ms/step - accuracy: 0.8755 - loss: 0.3040 - val_accuracy: 0.9302 - val_loss: 0.1732
Epoch 2/5
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 225ms/step - accuracy: 0.9226 - loss: 0.1957 - val_accuracy: 0.9173 - val_loss: 0.2291
Epoch 3/5
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 226ms/step - accuracy: 0.9286 - loss: 0.1843 - val_accuracy: 0.9376 - val_loss: 0.1976
Epoch 4/5
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 224ms/step - accuracy: 0.9351 - loss: 0.1740 - val_accuracy: 0.9341 - val_loss: 0.1870
Epoch 5/5
[1m946/946[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m212s[0m 224ms/step - accuracy: 0.9389 - loss: 0.1631 - val_accuracy: 0.9314 - val_loss: 0.1759


In [8]:
# Score the trained CNN on the held-out test split; the returned list holds
# the loss followed by the compiled metric (accuracy).
cnn_model.evaluate(x=X_test, y=y_test)

[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 62ms/step - accuracy: 0.8988 - loss: 0.2381


[0.15787330269813538, 0.9357040524482727]

In [9]:
# Collapse every image into a single 1-D feature vector (one row per sample)
# for the p_util model generators used in the following cells.
X_train_flatten = X_train.reshape(len(X_train), -1)
X_test_flatten = X_test.reshape(len(X_test), -1)
X_valid_flatten = X_valid.reshape(len(X_valid), -1)

# Print the shapes: the three flattened feature matrices, then the label vectors.
for arr in (X_train_flatten, X_test_flatten, X_valid_flatten, y_train, y_test, y_valid):
    print(arr.shape)


(30249, 3072)
(6299, 3072)
(6300, 3072)
(30249,)
(6299,)
(6300,)


In [10]:
# Create and fit a Logistic Regression model on the flattened pixel features,
# training on the train split and scoring on the test split.
# NOTE(review): the recorded output shows the solver stopping at its iteration
# limit. No max_iter is passed from this call, so raising it presumably has to
# happen inside pipeline_utilities - TODO confirm.
random_state = 1
p_util.logistic_regression_model_generator(X_train_flatten, X_test_flatten, y_train, y_test, random_state)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Training Data Score: 0.8973519785778042
Logistic Regression Testing Data Score: 0.8866486743927607
Logistic Regression Predictions Accuracy Score: 0.8866486743927607
              precision    recall  f1-score   support

           1       0.90      0.90      0.90      3479
           0       0.87      0.87      0.87      2820

    accuracy                           0.89      6299
   macro avg       0.89      0.89      0.89      6299
weighted avg       0.89      0.89      0.89      6299

Logistic Regression Balanced Accuracy Score: 0.8853606950721553
Logistic Regression roc_auc_score: 0.9523178325379856


In [11]:
# Trying a random forest model on wildfire image data (flattened pixels),
# training on the train split and scoring on the test split.
# The recorded output shows a ~1.00 training score against ~0.91 on test,
# i.e. the forest fits the training set almost perfectly.
random_state = 1
n_estimators = 100
p_util.random_forest_model_generator(X_train_flatten, X_test_flatten, y_train, y_test, random_state, n_estimators)


Random Forest Training Data Score: 0.9999669410559027
Random Forest Testing Data Score: 0.9107794888077473
Random Forest Predictions Accuracy Score: 0.9107794888077473
              precision    recall  f1-score   support

           1       0.94      0.89      0.92      3479
           0       0.88      0.93      0.90      2820

    accuracy                           0.91      6299
   macro avg       0.91      0.91      0.91      6299
weighted avg       0.91      0.91      0.91      6299

Random Forest Balanced Accuracy Score: 0.9129155887707195


In [12]:
# Trying a decision tree model on the flattened pixels (train split -> test split).
# NOTE(review): no random_state is passed here, unlike the other p_util calls;
# whether the helper seeds the tree itself is not visible from this notebook.
p_util.decision_tree_model_generator(X_train_flatten, X_test_flatten, y_train, y_test)

Decision Tree Training Data Score: 1.0
Decision Tree Testing Data Score: 0.8402921098587077
Decision Tree Predictions Accuracy Score: 0.8402921098587077
              precision    recall  f1-score   support

           1       0.87      0.84      0.85      3479
           0       0.81      0.84      0.83      2820

    accuracy                           0.84      6299
   macro avg       0.84      0.84      0.84      6299
weighted avg       0.84      0.84      0.84      6299

Decision Tree Balanced Accuracy Score: 0.8405062594411454


In [13]:
# Extra Trees model on the flattened pixels.
# NOTE(review): this call fits on the *validation* split (X_valid_flatten /
# y_valid), not the training split used by the logistic-regression, random
# forest and decision tree cells, so the "Training Data Score" in the output
# refers to the validation images. Confirm this is intentional (e.g. to cut
# run time) or switch to X_train_flatten / y_train.
random_state = 1
p_util.extra_trees_model_generator(X_valid_flatten, X_test_flatten, y_valid, y_test, random_state)

Extra Trees Training Data Score: 1.0
Extra Trees Testing Data Score: 0.9115732655977139
Extra Trees Predictions Accuracy Score: 0.9115732655977139
              precision    recall  f1-score   support

           1       0.92      0.92      0.92      3479
           0       0.90      0.90      0.90      2820

    accuracy                           0.91      6299
   macro avg       0.91      0.91      0.91      6299
weighted avg       0.91      0.91      0.91      6299

Extra Trees Balanced Accuracy Score: 0.9104099775960729


In [7]:
%%time
# AdaBoost model on the flattened pixels (train split -> test split).
# The cell is timed with %%time; the recorded run took several minutes.
random_state = 1
p_util.ada_boost_model_generator(X_train_flatten, X_test_flatten, y_train, y_test, random_state)

Ada Boosting Training Data Score: 0.8812192138583094
Ada Boosting Testing Data Score: 0.891093824416574
Ada Boosting Predictions Accuracy Score: 0.891093824416574
              precision    recall  f1-score   support

           1       0.91      0.89      0.90      3479
           0       0.87      0.89      0.88      2820

    accuracy                           0.89      6299
   macro avg       0.89      0.89      0.89      6299
weighted avg       0.89      0.89      0.89      6299

Ada Boosting Balanced Accuracy Score: 0.890996944177731
CPU times: total: 4min 38s
Wall time: 5min 50s


In [9]:
# Gradient Boosting model on the flattened pixels.
# Fitting on the full training split took too long to run, so this call fits
# on the smaller validation split instead; the "Training Data Score" in the
# output therefore refers to the validation images.
random_state = 1
p_util.gradient_boost_model_generator(X_valid_flatten, X_test_flatten, y_valid, y_test, random_state)

Gradient Boosting Training Data Score: 0.923633839135178
Gradient Boosting Testing Data Score: 0.9193522781393872
Gradient Boosting Predictions Accuracy Score: 0.9193522781393872
              precision    recall  f1-score   support

           1       0.94      0.91      0.93      3479
           0       0.89      0.93      0.91      2820

    accuracy                           0.92      6299
   macro avg       0.92      0.92      0.92      6299
weighted avg       0.92      0.92      0.92      6299

Gradient Boosting Balanced Accuracy Score: 0.920374170045603


In [10]:
%%time
# SVM with a linear kernel on the flattened pixels.
# NOTE(review): like the gradient-boosting cell, this fits on the validation
# split rather than the training split - presumably to keep run time
# manageable; confirm.
kernel_type = 'linear'
p_util.svm_model_generator(X_valid_flatten, X_test_flatten, y_valid, y_test, kernel_type)

SVM Training Data Score: 0.9561904761904761
SVM Testing Data Score: 0.8490236545483411
SVM Predictions Accuracy Score: 0.8490236545483411
              precision    recall  f1-score   support

           1       0.84      0.90      0.87      3479
           0       0.86      0.79      0.82      2820

    accuracy                           0.85      6299
   macro avg       0.85      0.84      0.85      6299
weighted avg       0.85      0.85      0.85      6299

SVM Balanced Accuracy Score: 0.8433058329714864
CPU times: total: 1min 12s
Wall time: 1min 15s


In [16]:
# Section 3 - Get NYT articles on wildfires

# Import dependencies
import requests
import json
from dotenv import load_dotenv
from collections import Counter
import spacy
# Load the small English language pipeline for spaCy. The resulting `nlp`
# object is the module-level tagger that most_common_adjs() calls.
# The "en_core_web_sm" package must already be installed in the environment.
nlp = spacy.load("en_core_web_sm")

In [17]:
# Load environment variables and New York Times API key
load_dotenv()
api_key = os.getenv("NYTIMES_API_KEY")

# Fail here with a clear message: a missing key would otherwise only show up
# later as a TypeError when the query URL is assembled by concatenation.
if not api_key:
    raise RuntimeError(
        "NYTIMES_API_KEY is not set; add it to your .env file or environment."
    )

# Show only the type so the key itself never ends up in the notebook output.
type(api_key)

str

In [18]:
# New York Times Article Search API URL. It ends with "?" so the query
# string can be appended to it directly.
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?"

# Search for articles that mention wildfire
query = "wildfire"

In [19]:
# Build query URL
from urllib.parse import urlencode

# urlencode percent-escapes the key and the search term, so multi-word or
# punctuated queries still produce a valid URL. `url` already ends with "?".
query_url = url + urlencode({"api-key": api_key, "q": query})

In [20]:
# Request articles. The timeout keeps the cell from hanging forever on a
# stalled connection.
response = requests.get(query_url, timeout=30)

# Surface HTTP failures (bad key, rate limiting, ...) right here instead of as
# a confusing KeyError on the lookups below.
response.raise_for_status()
articles = response.json()

# The "response" property in articles contains the actual articles,
# stored as a list under its "docs" key.
articles_list = articles["response"]["docs"]

In [21]:
# Print each stored article's web_url and gather its text fields.
# For every article, the snippet is stored first and the lead paragraph second,
# so texts holds two consecutive entries per article.
TEXT_FIELDS = ("snippet", "lead_paragraph")

texts = []
for article in articles_list:
    print(article["web_url"])
    for field in TEXT_FIELDS:
        texts.append(article[field])

# Show the first collected text as the cell output.
texts[0]

https://www.nytimes.com/2024/11/20/nyregion/new-york-wildfires-drought.html
https://www.nytimes.com/2024/11/24/opinion/wildfires-new-york-new-jersey-prescribed-burn.html
https://www.nytimes.com/2024/11/18/opinion/wildfires-new-jersey-new-york-climate-change.html
https://www.nytimes.com/video/weather/100000009824708/inwood-hill-new-york-fire.html
https://www.nytimes.com/video/weather/100000009818664/wildfires-northeast-us.html
https://www.nytimes.com/2024/11/11/nyregion/jennings-creek-wildfire-ny-nj.html
https://www.nytimes.com/2024/11/12/nyregion/jennings-creek-wildfire-nj-ny-wind.html
https://www.nytimes.com/2024/11/10/nyregion/orange-county-fire-sterling-forest.html
https://www.nytimes.com/video/us/100000009812041/california-wildfires.html
https://www.nytimes.com/2024/11/04/weather/california-wind-wildfires.html


'The New York region is unlikely to ever have as many brush fires as out West. But residents need to be ready for more droughts.'

In [22]:
# The most_common_adjs function tokenizes the text, builds a list of all the
# adjectives in it, and returns the most common adjective with its frequency.
def most_common_adjs(text):
    """
    Finds and returns the most common adjective in the given text.

    Relies on the module-level spaCy pipeline ``nlp`` to tokenize and
    part-of-speech tag the text. Adjectives are compared case-insensitively
    (they are lower-cased before counting).

    Args:
        text (str): The input text from which adjectives will be extracted.

    Returns:
        tuple or None: ``(adjective, frequency)`` for the most frequent
        adjective, or ``None`` when the text contains no adjectives.

    Example:
    >>> text = "The quick brown fox jumps over the lazy dog. The fast fox is brown."
    >>> most_common_adjs(text)
    ('brown', 2)
    """
    # Tokenizes text and parse each token
    doc = nlp(text)

    # Creates a list with all the adjectives in the text
    adjs = [token.text.lower() for token in doc if token.pos_ == 'ADJ']

    # No adjectives: say so explicitly rather than relying on a bare except
    # around an IndexError, which would also hide unrelated failures.
    if not adjs:
        return None

    # Retrieves the most frequent adjective in the adjectives list using Counter
    return Counter(adjs).most_common(1)[0]


In [23]:
# Run most_common_adjs over every collected text; entries are
# (adjective, count) tuples, or None for texts without adjectives.
common_adjs = list(map(most_common_adjs, texts))

# Print the common adjectives.
print(common_adjs)

[('unlikely', 1), None, ('prescribed', 1), ('active', 1), ('hard', 1), None, None, None, ('dry', 1), ('dry', 1), None, ('vast', 1), ('rugged', 1), None, ('latest', 1), ('old', 1), ('strong', 1), ('strong', 1), ('multiple', 1), ('dangerous', 1)]


In [24]:
# Trying conversational memory
from langchain_google_genai import ChatGoogleGenerativeAI


In [25]:
# Name of the Gemini model used for the LLMs in this notebook.
GEMINI_MODEL = "gemini-1.5-flash"

# Load environment variables, then read the Gemini API key from the
# environment and keep it in a variable.
load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

In [26]:
# Build the Gemini chat model.
llm = ChatGoogleGenerativeAI(
    model=GEMINI_MODEL,
    google_api_key=GEMINI_API_KEY,
    temperature=0.3,
)

# The question to ask, as a plain string.
query = "What are three common causes of wildfire?"

# Send the question through invoke() and show the text of the reply.
result = llm.invoke(query)
print(result.content)


Three common causes of wildfires are:

1. **Lightning strikes:**  Natural ignition sources, especially during dry thunderstorms.
2. **Human carelessness:** This is the most common cause, encompassing things like discarded cigarettes, unattended campfires, equipment sparks (e.g., from chainsaws or power lines), and even carelessly tossed glass that can magnify sunlight.
3. **Arson:** Deliberately set fires, often for malicious reasons.



In [27]:
# Follow-up question sent as an independent call: invoke() receives only this
# string, so nothing from the previous question accompanies it (the recorded
# reply asks for more context).
followup_prompt = "What preventive precautions should be taken?"
result = llm.invoke(followup_prompt)
print(result.content)

Preventive precautions depend heavily on *what* you're trying to prevent.  To give you helpful advice, I need more context.  Are you trying to prevent:

* **A specific illness or disease?** (e.g., the flu, COVID-19, heart disease, cancer)  Please specify the illness.
* **An accident or injury?** (e.g., a car accident, a fall, a fire) Please specify the type of accident.
* **Damage to property?** (e.g., theft, flooding, fire) Please specify the type of damage.
* **Financial problems?** (e.g., debt, job loss) Please specify the financial concern.
* **Something else entirely?** Please describe the situation.


Once you provide more information, I can offer specific and relevant preventive precautions.



In [28]:
# Ask the model about marketing opportunities tied to regional wildfire risk.
# NOTE: "opportunties" is misspelled in the prompt; it is left untouched
# because the recorded output was produced from this exact text.
marketing_prompt = "Based on the risk of wildfire occurring in a specific region, what marketing opportunties exist?"
result = llm.invoke(marketing_prompt)
print(result.content)

The risk of wildfire in a specific region presents several marketing opportunities, depending on the target audience and the type of business.  These opportunities can be broadly categorized into:

**1.  Mitigation and Prevention:**

* **Home Improvement & Construction:**  Marketing fire-resistant roofing materials, landscaping services (creating defensible space), window and door upgrades, sprinkler systems, and other home hardening products and services.  Focus on safety and peace of mind.
* **Insurance:**  Marketing wildfire insurance policies, emphasizing coverage for property damage, liability, and loss of use. Highlight the specific risks in the region and the tailored protection offered.
* **Government & Community Programs:**  Marketing government grants, subsidies, and community wildfire preparedness programs.  Focus on accessibility and ease of application.


**2.  Response and Recovery:**

* **Emergency Preparedness Supplies:**  Marketing emergency kits, generators, water pur

In [23]:
# Additional imports for conversational memory.
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory

In [24]:
# Build the Gemini chat model.
llm = ChatGoogleGenerativeAI(
    model=GEMINI_MODEL,
    google_api_key=GEMINI_API_KEY,
    temperature=0.3,
)

# Memory object handed to the chain, and the conversation chain itself.
# verbose=True makes the chain print the formatted prompt on every call.
# NOTE(review): the recorded output shows warnings pointing at these two
# lines - presumably deprecation notices; confirm against the installed
# LangChain version.
buffer = ConversationBufferMemory()
conversation = ConversationChain(llm=llm, verbose=True, memory=buffer)

# Two questions asked in order through the same chain; the second one is
# phrased as a follow-up to the first.
questions = [
    "What are three common causes of wildfire?",
    "What preventive measures should be taken?",
]

for position, query in enumerate(questions):
    # Blank line between answers (not before the first one).
    if position:
        print()

    # Pass the query to the predict method and print the result.
    result = conversation.predict(input=query)
    print(result)

  buffer = ConversationBufferMemory()
  conversation = ConversationChain(llm=llm, verbose=True, memory=buffer)




[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Current conversation:

Human: What are three common causes of wildfire?
AI:[0m

[1m> Finished chain.[0m
Human: What are three common causes of wildfire?

AI:  Three common causes of wildfires are lightning strikes, human negligence, and equipment malfunctions.

Let's break those down a bit further:

* **Lightning Strikes:**  These are a natural cause, often occurring during dry thunderstorms where little or no rain reaches the ground. The intense heat of a lightning bolt can ignite dry brush, grass, and trees, especially in areas with abundant flammable material.  The exact mechanism involves the superheated air rapidly expanding and igniting nearby combustible matte