# **Loading the data**

In [1]:
# Read data from /Users/grantrobinett/2024/coding/txdot_interpreted_fields/test_improvements/dataLocal/crash_data_1.csv
import pandas as pd

# Read only the first 100,000 rows, decoding as windows-1252 to avoid encoding errors
try:
    raw_data_1 = pd.read_csv('./dataLocal/crash_data_1.csv', encoding='windows-1252', skiprows=0, nrows=100000, low_memory=False)
except UnicodeDecodeError:
    print("Error: Could not read the CSV file with encoding 'windows-1252'")
    # Re-raise: without the frame, the drop() below would only fail with a
    # confusing NameError that hides the real cause.
    raise


# List of columns to be predicted and therefore removed from the dataset
columns_to_predict = [
    "Road_Relat_ID", "Intrsct_Relat_ID", "Road_Cls_ID", "Harm_Evnt_ID",
    "FHE_Collsn_ID", "Obj_Struck_ID", "Phys_Featr_1_ID", "Phys_Featr_2_ID",
    "Bridge_Detail_ID", "Othr_Factr_ID", "Road_Part_Adj_ID", "Investigator_Narrative"
]

# Drop the columns to be predicted from the dataset; raw_data_1 keeps them so
# that a target column can still be read from it later.
data_features = raw_data_1.drop(columns=columns_to_predict)

## **Checking the data**

In [62]:
# Sanity check: none of the target columns should have survived the drop
leaked_targets = set(columns_to_predict).intersection(data_features.columns)
print("Does the dataset have any columns from the list of columns to predict?")
print(bool(leaked_targets))

# Rows x columns of the feature frame
print("\nSize of the dataset:")
print(data_features.shape)

# Per-column dtypes, looked up once and reused for the unique-dtype listing
column_dtypes = data_features.dtypes
print("\nType of each column in the dataset:")
print(column_dtypes)

# Distinct dtypes present across all columns
print("\nUnique types of columns in the dataset:")
print(column_dtypes.unique())


Does the dataset have any columns from the list of columns to predict?
False

Size of the dataset:
(100000, 162)

Type of each column in the dataset:
Crash_ID                   int64
Crash_Fatal_Fl            object
Cmv_Involv_Fl             object
Schl_Bus_Fl               object
Rr_Relat_Fl               object
                          ...   
Investigat_Service_ID    float64
Investigat_DA_ID         float64
Damage_1                  object
Damage_2                  object
Damage_3                  object
Length: 162, dtype: object

Unique types of columns in the dataset:
[dtype('int64') dtype('O') dtype('float64')]


**More thorough look**

In [63]:
# For each dtype (float, integer, object) print the matching column names
# followed by how many columns have that dtype
for dtype_name in ('float64', 'int64', 'object'):
    matching_columns = data_features.select_dtypes(include=[dtype_name]).columns
    print(matching_columns)
    print(len(matching_columns))

# Print summary statistics of the dataset
print(data_features.describe())

Index(['Rpt_Latitude', 'Rpt_Longitude', 'Rpt_Sec_Rdwy_Sys_ID',
       'Rpt_Sec_Road_Part_ID', 'Rpt_Ref_Mark_Offset_Amt', 'Road_Type_ID',
       'Investigat_Area_ID', 'Investigat_District_ID', 'Investigat_Region_ID',
       'Latitude', 'Longitude', 'Dfo', 'Control', 'Section', 'Milepoint',
       'Ref_Mark_Displ', 'Street_Nbr_2', 'Control_2', 'Section_2',
       'Milepoint_2', 'Hwy_Dsgn_Lane_ID', 'Hwy_Dsgn_Hrt_ID', 'Hp_Shldr_Left',
       'Hp_Shldr_Right', 'Hp_Median_Width', 'Base_Type_ID', 'Nbr_Of_Lane',
       'Row_Width_Usual', 'Roadbed_Width', 'Surf_Width', 'Surf_Type_ID',
       'Curb_Type_Left_ID', 'Curb_Type_Right_ID', 'Shldr_Type_Left_ID',
       'Shldr_Width_Left', 'Shldr_Use_Left_ID', 'Shldr_Type_Right_ID',
       'Shldr_Width_Right', 'Shldr_Use_Right_ID', 'Median_Type_ID',
       'Median_Width', 'Rural_Urban_Type_ID', 'Func_Sys_ID', 'Adt_Curnt_Amt',
       'Adt_Curnt_Year', 'Adt_Adj_Curnt_Amt', 'Pct_Single_Trk_Adt',
       'Pct_Combo_Trk_Adt', 'Trk_Aadt_Pct', 'Curve_Type_ID',

### *At least the values come in pretty good formats, I do not see any columns that have incorrect types at first glance. Let's check for missing data*

In [64]:
# Count the missing values of every column once and reuse the result below
missing_counts = data_features.isnull().sum()
print("\nMissing values in the dataset:")
print(missing_counts)

# Names of the columns that contain at least one missing value
columns_with_missing = data_features.columns[missing_counts > 0]
print("\nColumns with missing values in the dataset:")
print(columns_with_missing)

# How many columns contain at least one missing value
print("\nNumber of columns with missing values in the dataset:")
print(columns_with_missing.size)

# "column_name: missing_value_count" for the affected columns only; the row
# limit is lifted while printing so pandas does not abbreviate with an ellipsis
pd.set_option('display.max_rows', None)
print("\nColumns with missing values and their missing value count:")
print(missing_counts[missing_counts > 0])
pd.reset_option('display.max_rows')


# Fully duplicated rows
print("\nNumber of duplicate rows in the dataset:")
print(data_features.duplicated().sum())


Missing values in the dataset:
Crash_ID                     0
Crash_Fatal_Fl               0
Cmv_Involv_Fl                0
Schl_Bus_Fl                  0
Rr_Relat_Fl                  0
                         ...  
Investigat_Service_ID     5690
Investigat_DA_ID         65394
Damage_1                 85968
Damage_2                 98391
Damage_3                 99747
Length: 162, dtype: int64

Columns with missing values in the dataset:
Index(['Case_ID', 'Local_Use', 'Rpt_Latitude', 'Rpt_Longitude', 'Rpt_Hwy_Num',
       'Rpt_Hwy_Sfx', 'Rpt_Block_Num', 'Rpt_Street_Pfx', 'Rpt_Street_Name',
       'Rpt_Street_Sfx',
       ...
       'Poscrossing_ID', 'WDCode_ID', 'Standstop', 'Yield', 'MPO_ID',
       'Investigat_Service_ID', 'Investigat_DA_ID', 'Damage_1', 'Damage_2',
       'Damage_3'],
      dtype='object', length=113)

Number of columns with missing values in the dataset:
113

Columns with missing values and their missing value count:
Case_ID                         7670
Local_Use

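### *As we can see, there are some columns with missing values for every row loaded into the dataset. I'm going to make the executive decision to drop these. Note that the next cell uses `dropna(axis=1)`, which drops every column containing any missing value (162 → 49 columns), not only the completely empty ones.*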

In [2]:
# Keep only the columns that are complete, i.e. drop a column as soon as it
# contains a single missing value
data_features_no_missing = data_features.dropna(axis='columns', how='any')

# Report the shape that is left after the drop
print("\nSize of the dataset after dropping columns with missing values:", data_features_no_missing.shape, sep="\n")


Size of the dataset after dropping columns with missing values:
(100000, 49)


In [29]:
# NOTE(review): this whole cell is commented out. It is the gentler alternative
# to dropping every incomplete column: dropna(axis=1, how='all') removes a
# column only when all of its values are missing. The output shown under this
# cell presumably comes from an earlier run made while the code was active.
# # # Drop columns with missing values = 100,000. Only drop columns with exactly 100,000 missing values. Save the result to a new dataset
# data_features_dropped = data_features.dropna(axis=1, how='all')

# # Print size of the new dataset after dropping columns with missing values = 100,000
# print("\nSize of the dataset after dropping columns with all missing values:")
# print(data_features_dropped.shape)

# # Check for missing values in the new dataset
# print("\nMissing values in the new dataset:")
# print(data_features_dropped.isnull().sum())

# # Print columns with missing values in the new dataset
# print("\nColumns with missing values in the new dataset:")
# print(data_features_dropped.columns[data_features_dropped.isnull().any()])
# # Print number of columns with missing values in the new dataset
# print("\nNumber of columns with missing values in the new dataset:")
# print(data_features_dropped.columns[data_features_dropped.isnull().any()].size)


Size of the dataset after dropping columns with all missing values:
(100000, 133)

Missing values in the new dataset:
Crash_ID                     0
Crash_Fatal_Fl               0
Cmv_Involv_Fl                0
Schl_Bus_Fl                  0
Rr_Relat_Fl                  0
                         ...  
Investigat_Service_ID     5690
Investigat_DA_ID         65394
Damage_1                 85968
Damage_2                 98391
Damage_3                 99747
Length: 133, dtype: int64

Columns with missing values in the new dataset:
Index(['Case_ID', 'Local_Use', 'Rpt_Latitude', 'Rpt_Longitude', 'Rpt_Hwy_Num',
       'Rpt_Hwy_Sfx', 'Rpt_Block_Num', 'Rpt_Street_Pfx', 'Rpt_Street_Name',
       'Rpt_Street_Sfx', 'Rpt_Street_Desc', 'Rpt_Sec_Rdwy_Sys_ID',
       'Rpt_Sec_Hwy_Num', 'Rpt_Sec_Hwy_Sfx', 'Rpt_Sec_Road_Part_ID',
       'Rpt_Sec_Block_Num', 'Rpt_Sec_Street_Pfx', 'Rpt_Sec_Street_Name',
       'Rpt_Sec_Street_Sfx', 'Rpt_Ref_Mark_Offset_Amt',
       'Rpt_Ref_Mark_Dist_Uom', 'Rpt_Ref_Mark

### *With the worst NA columns gone, we can go to processing numerical and categorical data*

In [3]:
# Normalize numerical columns in the dataset to be between 0 and 1 (min-max scaling)
data_features_normalized = data_features_no_missing.copy()
numerical_columns = data_features_normalized.select_dtypes(include=['float64', 'int64']).columns

# Per-column minimum and value range, computed once
column_min = data_features_normalized[numerical_columns].min()
column_range = data_features_normalized[numerical_columns].max() - column_min
# A constant column has a zero range, and 0/0 would fill it with NaN.
# Dividing by 1 instead maps such a column to all zeros.
column_range = column_range.replace(0, 1)

data_features_normalized[numerical_columns] = (data_features_normalized[numerical_columns] - column_min) / column_range

# Print summary statistics of the normalized dataset
print("\nSummary statistics of the normalized dataset:")
print(data_features_normalized.describe())


Summary statistics of the normalized dataset:
            Crash_ID  Rpt_CRIS_Cnty_ID    Rpt_City_ID  Rpt_Rdwy_Sys_ID  \
count  100000.000000     100000.000000  100000.000000    100000.000000   
mean        0.097667          0.424156       0.218259         0.588024   
std         0.071995          0.274482       0.388937         0.439538   
min         0.000000          0.000000       0.000000         0.000000   
25%         0.051605          0.221344       0.014403         0.111111   
50%         0.090245          0.395257       0.025805         0.777778   
75%         0.128279          0.632411       0.044109         1.000000   
max         1.000000          1.000000       1.000000         1.000000   

       Rpt_Road_Part_ID  Crash_Speed_Limit   Wthr_Cond_ID  Light_Cond_ID  \
count     100000.000000      100000.000000  100000.000000  100000.000000   
mean           0.062915           0.500487       0.848073       0.250896   
std            0.209654           0.202539       0.232913 

In [4]:
# One-hot encode the categorical columns and report how much wider the frame gets
shape_before_encoding = data_features_normalized.shape
print("\nSize of the dataset before one-hot encoding:")
print(shape_before_encoding)

# get_dummies is called with its defaults on the whole normalized frame
data_features_encoded = pd.get_dummies(data_features_normalized)

shape_after_encoding = data_features_encoded.shape
print("\nSize of the dataset after one-hot encoding:")
print(shape_after_encoding)


Size of the dataset before one-hot encoding:
(100000, 49)



Size of the dataset after one-hot encoding:
(100000, 4981)


### *For now, basic normalization and basic one-hot encoding have been done. Let's see how models do.*

## **Splitting the Data**

In [5]:
# Shape of the encoded feature matrix that goes into the train/test split
print("\nSize of the dataset:", data_features_encoded.shape, sep="\n")


Size of the dataset:
(100000, 4981)


In [6]:
# To train our first model, we will be predicting the column "Road_Cls_ID"
# We will split the dataset into features (X) and target (y)

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# The target comes from `raw_data_1`, which still holds the "Road_Cls_ID"
# column that was dropped from the feature frame.

# Shift the labels down by one (assumes the IDs are one-based - TODO confirm)
adjusted_labels = raw_data_1["Road_Cls_ID"].sub(1)

# Map the shifted labels to consecutive integer codes; label_encoder.classes_
# therefore holds the shifted values, not the original IDs
label_encoder = LabelEncoder().fit(adjusted_labels)

# One-hot encode the integer codes for use with categorical cross-entropy
encoded_target = to_categorical(label_encoder.transform(adjusted_labels))

# 80/20 split, stratified on the target and seeded so it can be reproduced
X_train, X_test, y_train, y_test = train_test_split(
    data_features_encoded,
    encoded_target,
    test_size=0.2,
    random_state=42,
    stratify=encoded_target,
)

# `X_train` / `y_train` are used for fitting, `X_test` / `y_test` for evaluation.

2024-05-12 14:45:21.408022: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [86]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

# Fully connected classifier: four ReLU hidden layers (256-128-64-32) with
# dropout after the second and third, and a softmax output per class
model = Sequential([
    Dense(256, input_dim=X_train.shape[1], activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(y_train.shape[1], activation='softmax'),
])

# Categorical cross-entropy matches the one-hot encoded target
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 50 epochs is only an upper bound because of early stopping; 20% of the
# training rows are held out for validation
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Score on the held-out test split
print("\nModel evaluation:")
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50

Model evaluation:
Loss: 0.38776895403862
Accuracy: 0.8810499906539917


In [87]:
# # Save the model
# model.save('models/ANN_model_v1.keras')

# Export the current `model` as a directory (save_format='tf') instead of a
# single .keras file
savedmodel_dir = 'models/ANN_v2'
model.save(savedmodel_dir, save_format='tf')

INFO:tensorflow:Assets written to: models/ANN_v2/assets


INFO:tensorflow:Assets written to: models/ANN_v2/assets


In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

# Same 256-128-64-32 layout as before, but every hidden block is
# Dense (L2-regularized) -> BatchNormalization -> Dropout
hidden_layer_sizes = (256, 128, 64, 32)

model = Sequential()
for layer_index, units in enumerate(hidden_layer_sizes):
    # Only the first layer declares the input width
    first_layer_kwargs = {'input_dim': X_train.shape[1]} if layer_index == 0 else {}
    model.add(Dense(units, activation='relu',
                    kernel_regularizer=regularizers.l2(0.01),
                    **first_layer_kwargs))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

# Output layer: one softmax unit per class
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 50 epochs is only an upper bound because of early stopping
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Score on the held-out test split
print("\nModel evaluation:")
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50

Model evaluation:
Loss: 0.7414984107017517
Accuracy: 0.7882000207901001


In [28]:
# Persist the regularized network as a single .keras file
keras_archive_path = 'models/ANN_model_v2.keras'
model.save(keras_archive_path)

In [30]:
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch

def build_model(hp):
    """Build and compile one candidate classifier for the tuner.

    Args:
        hp: Hyperparameter container supplied by the tuner; only its
            ``Int`` and ``Float`` methods are used here.

    Returns:
        A compiled ``keras.Sequential`` model. The input width is taken from
        the notebook-level ``X_train`` and the output width from ``y_train``.

    Tuned values: width of the first layer, number of extra hidden layers
    (1-4), width of each of those layers, one dropout rate registered under
    the single name 'dropout' for all of them, and the Adam learning rate.
    """
    first_layer_units = hp.Int('input_units', min_value=32, max_value=256, step=32)
    network = keras.Sequential()
    network.add(layers.Dense(first_layer_units, activation='relu',
                             input_dim=X_train.shape[1]))

    hidden_layer_count = hp.Int('n_layers', 1, 4)
    for layer_index in range(hidden_layer_count):
        hidden_units = hp.Int(f'dense_{layer_index}_units', min_value=32, max_value=256, step=32)
        network.add(layers.Dense(hidden_units, activation='relu'))
        dropout_rate = hp.Float('dropout', min_value=0.0, max_value=0.5, default=0.25, step=0.05)
        network.add(layers.Dropout(dropout_rate))

    # One softmax unit per class
    network.add(layers.Dense(y_train.shape[1], activation='softmax'))

    learning_rate = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'])

    return network

# Random-search configuration
tuner_settings = {
    'objective': 'val_accuracy',
    'max_trials': 20,            # number of hyperparameter combinations to try
    'executions_per_trial': 1,   # models built and fit per combination
    'directory': 'my_dir',       # where the tuning logs are stored
    'project_name': 'keras_tuning',
}
tuner = RandomSearch(build_model, **tuner_settings)

# Show what will be searched before starting
tuner.search_space_summary()

# Each trial trains for 10 epochs with 20% of the training rows held out
tuner.search(X_train, y_train, epochs=10, validation_split=0.2)

# Best hyperparameter set found by the search
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Rebuild a fresh model from those hyperparameters and train it for 50 epochs
model = tuner.hypermodel.build(best_hps)
model.fit(X_train, y_train, epochs=50, validation_split=0.2)

Trial 20 Complete [00h 05m 29s]
val_accuracy: 0.9089375138282776

Best val_accuracy So Far: 0.9100000262260437
Total elapsed time: 01h 11m 35s
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1400f84c0>

In [31]:
# Retrieve the best model found during the search.
# NOTE(review): this is the tuner's own best model, not the `model` that was
# rebuilt and trained for 50 epochs in the previous cell - confirm that this
# is the one meant to be kept.
best_model = tuner.get_best_models(num_models=1)[0]

# Save under models/ so that the later load of 'models/tuning_1.keras' finds
# the file (it was previously written to the working directory instead).
best_model.save('models/tuning_1.keras')

In [5]:
# Import the necessary libraries
from tensorflow import keras
from tensorflow.keras.models import load_model

# Load the best model from a file
loaded_model = keras.models.load_model('models/tuning_1.keras')

# #print metrics
# print("\nModel evaluation:")
# loss, accuracy = loaded_model.evaluate(X_test, y_test)
# print(f"Loss: {loss}")
# print(f"Accuracy: {accuracy}")

# Print neural network summary
print("\nNeural network summary:")
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
loaded_model.summary()

# Re-export the loaded model as a directory (save_format='tf')
loaded_model.save('models/tuned', save_format='tf')






Neural network summary:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 224)               1115968   
                                                                 
 dense_1 (Dense)             (None, 32)                7200      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 192)               6336      
                                                                 
 dropout_1 (Dropout)         (None, 192)               0         
                                                                 
 dense_3 (Dense)             (None, 128)               24704     
                                                                 
 dropout_2 (Dropout)         (N

INFO:tensorflow:Assets written to: models/tuned/assets


In [24]:
# Read data from /Users/grantrobinett/2024/coding/txdot_interpreted_fields/test_improvements/dataLocal/crash_data_1.csv
import pandas as pd

# Skip the first 100,000 data rows (the ones the training cell reads; the
# header line 0 is kept) and read the next 100 rows as a small hold-out sample.
# NOTE(review): this cell reuses the variable names of the training pipeline
# (raw_data_1, data_features, ...), so running it overwrites those frames.
try:
    raw_data_1 = pd.read_csv('./dataLocal/crash_data_1.csv', encoding='windows-1252', skiprows=range(1, 100001), nrows=100, low_memory=False)
except UnicodeDecodeError:
    print("Error: Could not read the CSV file with encoding 'windows-1252'")
    # Re-raise: continuing without the frame would only produce a NameError below
    raise


# List of columns to be predicted and therefore removed from the dataset
columns_to_predict = [
    "Road_Relat_ID", "Intrsct_Relat_ID", "Road_Cls_ID", "Harm_Evnt_ID",
    "FHE_Collsn_ID", "Obj_Struck_ID", "Phys_Featr_1_ID", "Phys_Featr_2_ID",
    "Bridge_Detail_ID", "Othr_Factr_ID", "Road_Part_Adj_ID", "Investigator_Narrative"
]

# Drop the columns to be predicted from the dataset
data_features = raw_data_1.drop(columns=columns_to_predict)

# Drop columns with any missing values
data_features_no_missing = data_features.dropna(axis=1)

# Print size of the dataset after dropping columns with missing values
print("\nSize of the dataset after dropping columns with missing values:")
print(data_features_no_missing.shape)

# Normalize numerical columns in the dataset to be between 0 and 1.
# NOTE(review): min and max come from these 100 rows, and the surviving /
# one-hot columns are derived from these rows only, so the result is
# presumably not aligned with the matrix the models were trained on - verify
# before feeding it to a model.
data_features_normalized = data_features_no_missing.copy()
numerical_columns = data_features_normalized.select_dtypes(include=['float64', 'int64']).columns
column_min = data_features_normalized[numerical_columns].min()
column_range = data_features_normalized[numerical_columns].max() - column_min
# A constant column (likely with only 100 rows) has a zero range, and 0/0
# would fill it with NaN. Dividing by 1 instead maps it to all zeros.
column_range = column_range.replace(0, 1)
data_features_normalized[numerical_columns] = (data_features_normalized[numerical_columns] - column_min) / column_range

# Print summary statistics of the normalized dataset
print("\nSummary statistics of the normalized dataset:")
print(data_features_normalized.describe())

# Encode categorical columns in the dataset using one-hot encoding
# Print size of the dataset before one-hot encoding
print("\nSize of the dataset before one-hot encoding:")
print(data_features_normalized.shape)
data_features_encoded = pd.get_dummies(data_features_normalized)   # One-hot encoding
# Print size of the dataset after one-hot encoding
print("\nSize of the dataset after one-hot encoding:")
print(data_features_encoded.shape)

# Normalize the test data.
# NOTE(review): the frame is already normalized and encoded at this point, so
# this second pass should not change any value - confirm and consider removing.
test_features_normalized = data_features_encoded.copy()
encoded_min = test_features_normalized[numerical_columns].min()
encoded_range = (test_features_normalized[numerical_columns].max() - encoded_min).replace(0, 1)
test_features_normalized[numerical_columns] = (test_features_normalized[numerical_columns] - encoded_min) / encoded_range

# One-hot encode the test data
test_features_encoded = pd.get_dummies(test_features_normalized)

# Print size of the test data
print("\nSize of the test data:")
print(test_features_encoded.shape)

# Save pandas dataframe
test_features_encoded.to_csv('dataLocal/test_data.csv', index=False)




Size of the dataset after dropping columns with missing values:
(100, 55)

Summary statistics of the normalized dataset:
         Crash_ID  Rpt_CRIS_Cnty_ID  Rpt_City_ID  Rpt_Rdwy_Sys_ID  \
count  100.000000        100.000000   100.000000       100.000000   
mean     0.152728          0.493420     0.088460         0.642222   
std      0.087821          0.399352     0.252361         0.439317   
min      0.000000          0.000000     0.000000         0.000000   
25%      0.146319          0.000000     0.000000         0.152778   
50%      0.146576          0.372294     0.019044         1.000000   
75%      0.146775          0.917749     0.035782         1.000000   
max      1.000000          1.000000     1.000000         1.000000   

       Rpt_Road_Part_ID  Crash_Speed_Limit  Rpt_Sec_Rdwy_Sys_ID  \
count        100.000000         100.000000           100.000000   
mean           0.040000           0.558553             0.913333   
std            0.130009           0.244825             

In [21]:
# Number of rows in the test split
print(X_test.shape[0])

20000


In [35]:
import pandas as pd
import requests
import json

# Print class labels
print("Class labels:")
print(label_encoder.classes_)

# Take the first 100 rows of X_test (the variable name says 10 for historical
# reasons; it is kept so nothing else that refers to it breaks)
X_test_10 = X_test.head(100)

# Convert DataFrame to a list of lists for TensorFlow Serving
data = X_test_10.values.tolist()

# Save the data to a file
with open('dataLocal/request_data_raw.json', 'w') as f:
    json.dump(data, f)

# Format the POST request data
request_data = json.dumps({"instances": data})

# Export the request data to a file
with open('dataLocal/request_data.json', 'w') as f:
    f.write(request_data)

# Specify the TensorFlow Serving REST API endpoint
tf_serving_url = 'http://localhost:8501/v1/models/model:predict'  # Adjust the model name if necessary

# # Specify the TensorFlow Serving REST API endpoint
# tf_serving_url = "https://test-inference.herokuapp.com/v1/models/ANN_v2:predict"  # Adjust the model name if necessary

# Send the POST request. Without a timeout, requests waits indefinitely when
# the server accepts the connection but never answers.
response = requests.post(tf_serving_url, data=request_data, timeout=60)

# Check the response
if response.status_code == 200:
    # Load the JSON response
    predictions = response.json()['predictions']
    # Do something with the predictions
    print(predictions)
else:
    # NOTE: `predictions` is not assigned on this path
    print("Error:", response.status_code, response.text)

Class labels:
[0 1 2 3 4 5 6 7 8]
[[0.932618082, 0.0673819259, 1.19697754e-08, 1.29104189e-18, 3.36653622e-11, 2.6256805e-14, 3.69258306e-18, 6.09361921e-17, 3.08450522e-17], [0.0146303605, 0.0123871742, 0.00430872804, 9.00775485e-05, 0.967940629, 6.49367712e-06, 0.000626048248, 2.56044541e-06, 7.97442863e-06], [0.01033141, 0.978080273, 0.0115832603, 3.47133877e-09, 4.71151452e-06, 3.12500248e-07, 8.09354472e-10, 1.68631803e-10, 1.20626469e-11], [0.70582211, 0.294176251, 1.65792346e-06, 3.35033824e-15, 3.15312287e-09, 1.21390042e-11, 4.49851151e-15, 5.71287605e-14, 5.55513133e-15], [1.44981303e-20, 7.70826283e-24, 9.4596078e-25, 1.99813797e-21, 1.52110058e-14, 4.90598495e-19, 1.20928876e-24, 2.20192708e-31, 1.0], [0.942840159, 0.0571598373, 2.26926899e-09, 7.41073e-21, 1.48105616e-12, 5.59922183e-16, 2.40655565e-20, 7.21464242e-19, 1.68937229e-19], [0.0133731468, 0.011689947, 0.00236482709, 3.4065929e-06, 0.972421, 5.42751764e-07, 0.000146282298, 1.75549218e-07, 7.27250438e-07], [0.004

In [36]:
import numpy as np

# Turn the list of per-class probability rows into an array
predictions_array = np.asarray(predictions)

# For every row, the column with the highest probability is the predicted class index
predicted_classes = predictions_array.argmax(axis=1)

# Show the predicted class indices
print(predicted_classes)

[0 4 1 0 8 0 4 4 4 4 2 1 1 1 8 1 4 0 2 3 1 8 1 1 4 1 0 0 8 4 0 4 4 2 4 1 1
 2 0 4 0 4 1 4 4 0 1 4 2 2 4 0 4 4 8 1 8 4 4 0 4 1 0 0 1 2 4 1 8 3 8 3 2 1
 8 1 4 4 0 4 4 1 4 4 0 8 0 3 1 0 3 2 0 4 4 4 8 2 0 4]


In [37]:
# # Check first 100 values of y_test
# print(y_test[:100])
# Decode the one-hot encoded labels

# # Print y classes
# print(label_encoder.classes_)

from sklearn.metrics import accuracy_score, classification_report


# Compare exactly as many test rows as there are predictions. This assumes the
# predictions were made for the leading rows of X_test, in order.
n_predictions = len(predicted_classes)

# Decode the one-hot encoded labels
decoded_labels = label_encoder.inverse_transform(np.argmax(y_test[:n_predictions], axis=1))

# Decode the predictions through the same encoder so that both sides are in
# the same label space. Comparing decoded labels with raw argmax indices is
# only correct when the encoder's classes happen to be 0..k-1.
decoded_predictions = label_encoder.inverse_transform(predicted_classes)

# Calculate accuracy
accuracy = accuracy_score(decoded_labels, decoded_predictions)
print(f"Accuracy: {accuracy:.2f}")

# Generate a classification report
report = classification_report(decoded_labels, decoded_predictions)
print(report)

Accuracy: 0.88
              precision    recall  f1-score   support

           0       1.00      0.91      0.95        22
           1       0.81      0.77      0.79        22
           2       0.70      0.78      0.74         9
           3       0.80      0.80      0.80         5
           4       0.88      0.94      0.91        31
           8       1.00      1.00      1.00        11

    accuracy                           0.88       100
   macro avg       0.86      0.87      0.86       100
weighted avg       0.88      0.88      0.88       100



In [24]:
# Read data from /Users/grantrobinett/2024/coding/txdot_interpreted_fields/test_improvements/dataLocal/crash_data_1.csv and save 100,000 rows to a new file
import pandas as pd

# Read the first 100,000 rows of the dataset
try:
    raw_data_1 = pd.read_csv('./dataLocal/crash_data_1.csv', encoding='windows-1252', nrows=100000, low_memory=False)
except UnicodeDecodeError:
    print("Error: Could not read the CSV file with encoding 'windows-1252'")
    # Re-raise: the exports below need the frame and would otherwise fail
    # with a NameError that hides the real cause
    raise

# Save the first 100,000 rows to a new file
raw_data_1.to_csv('dataLocal/crash_data_100k.csv', index=False)

# Also save it as a XML file
raw_data_1.to_xml('dataLocal/crash_data_100k.xml')

In [26]:
# Read new XML file and load first 1000 rows to a pandas DataFrame
import xml.etree.ElementTree as ET
import pandas as pd

# Load the exported XML document and grab its root element
tree = ET.parse('dataLocal/crash_data_100k.xml')
root = tree.getroot()

# Every child of the root is one record and every grandchild one field:
# build a {tag: text} dictionary per record (all values stay text or None)
data = [{col.tag: col.text for col in row} for row in root]

# One DataFrame row per record
df = pd.DataFrame(data)

# Keep only the first 1000 rows for the request experiments
df_1000 = df.head(1000)

df_1000.head()


Unnamed: 0,index,Crash_ID,Crash_Fatal_Fl,Cmv_Involv_Fl,Schl_Bus_Fl,Rr_Relat_Fl,Medical_Advisory_Fl,Amend_Supp_Fl,Active_School_Zone_Fl,Crash_Date,...,Unkn_Injry_Cnt,Tot_Injry_Cnt,Death_Cnt,MPO_ID,Investigat_Service_ID,Investigat_DA_ID,Damage_1,Damage_2,Damage_3,Investigator_Narrative
0,0,18696335,N,N,N,N,N,N,N,01/07/2022,...,0,0,0,282.0,45.0,,POWER POLE,,,UNIT 1 TRAVELING SOUTHBOUND ON FM 740 ATTEMPTE...
1,1,18688350,N,N,N,N,N,N,N,01/06/2022,...,0,0,0,282.0,21.0,4.0,,,,UNIT 1 WAS TURNING NB ONTO 1200 8TH AVE FROM ...
2,2,18688437,N,N,N,N,N,N,N,01/09/2022,...,0,0,0,,21.0,,,,,UNIT 2 UNIT 3 UNIT 4 AND UNIT 5 WERE PARKED AT...
3,3,18688481,N,N,N,N,N,N,N,01/07/2022,...,0,0,0,28.0,175.0,,,,,UNIT 2 DRIVER STATED THAT HE WAS TURNING INTO ...
4,4,18688480,N,N,N,N,N,N,N,01/10/2022,...,0,0,0,90.0,118.0,,,,,UNIT 2 WAS IN THE LEFT LANE TRAVELING SOUTH ON...


In [27]:
# Write the (up to) 1000-row sample to disk as a JSON array of row records
sample_json_path = 'dataLocal/crash_data_1000.json'
df_1000.to_json(sample_json_path, orient='records')

In [32]:
import requests
import json

url = 'http://localhost:5000/predict'
# Build the request payload from the in-memory df_1000 frame (not from the
# JSON file on disk): round-trip through to_json/json.loads to obtain a plain
# list of row records
data = df_1000.to_json(orient='records')
data = json.loads(data)


# Without a timeout, requests waits indefinitely when the server accepts the
# connection but never answers
response = requests.post(url, json=data, timeout=60)

print('Status Code:', response.status_code)
try:
    print('Response:', response.json())
except ValueError:
    # The body was not valid JSON (e.g. an HTML error page); show the raw
    # text instead of failing while reporting the result
    print('Response:', response.text)

Status Code: 400
Response: {'error': '{\n    "error": "Matrix size-incompatible: In[0]: [1000,6056], In[1]: [4981,256]\\n\\t [[{{node StatefulPartitionedCall/StatefulPartitionedCall/sequential_2/dense_10/Relu}}]]"\n}'}
