**Task 1: Problem Identification**

**1.1 Read Data Description**

In [2]:
import pandas as pd

# Read metadata file to understand dataset
metadata = pd.read_excel("/content/LFB Metadata.xlsx")
metadata.head()


Unnamed: 0,Column,Sample record,Description,Extra Notes
0,IncidentNumber,000008-01012018,LFB Incident Number,
1,DateOfCall,2018-01-01 00:00:00,Date of 999 call,
2,CalYear,2018,Year of 999 call,
3,TimeOfCall,00:04:25,Time of 999 call,
4,HourOfCall,0,Hour of 999 call,


**1.2 Identify Business Problems**

In [3]:
# Identify meaningful business problems
business_problems = [
    "Predicting the severity of incidents based on various factors such as incident type, location, and time.",
    "Identifying locations with the highest incidence rates to prioritize resource allocation and preventive measures.",
    "Analyzing trends in incident types over time to identify emerging patterns and allocate resources effectively.",
    "Assessing the effectiveness of response time in minimizing property damage and casualties.",
    "Predicting the likelihood of false alarms to optimize emergency response resources."
]

# Print the identified business problems
print("Identified Business Problems:")
for idx, problem in enumerate(business_problems, start=1):
    print(f"{idx}. {problem}")


Identified Business Problems:
1. Predicting the severity of incidents based on various factors such as incident type, location, and time.
2. Identifying locations with the highest incidence rates to prioritize resource allocation and preventive measures.
3. Analyzing trends in incident types over time to identify emerging patterns and allocate resources effectively.
4. Assessing the effectiveness of response time in minimizing property damage and casualties.
5. Predicting the likelihood of false alarms to optimize emergency response resources.


**1.3 Identify Data Mining Tasks**

In [4]:
# Data Mining Tasks
data_mining_tasks = {
    "Predictive Modeling": [
        "Predicting incident severity (classification)",
        "Predicting false alarms (classification)"
    ],
    "Descriptive Modeling": [
        "Identifying incident hotspots (clustering)",
        "Analyzing trends in incident types (time series analysis)"
    ]
}

# Print the identified data mining tasks
print("Data Mining Tasks Needed:")
for task_type, tasks in data_mining_tasks.items():
    print(f"{task_type}:")
    for idx, task in enumerate(tasks, start=1):
        print(f"   {idx}. {task}")


Data Mining Tasks Needed:
Predictive Modeling:
   1. Predicting incident severity (classification)
   2. Predicting false alarms (classification)
Descriptive Modeling:
   1. Identifying incident hotspots (clustering)
   2. Analyzing trends in incident types (time series analysis)


#**Task 2: Data Understanding**

**2.1 Initial Data Exploration**

In [7]:
# Essential EDA (Exploratory Data Analysis)
# Check the first few rows of the dataset
print("First few rows of the dataset:")
print(metadata.head())

# Get information about the dataset
print("\nInformation about the dataset:")
print(metadata.info())

# Summary statistics of numeric columns
print("\nSummary statistics of numeric columns:")
print(metadata.describe())

# Check the data types of each variable
print("\nData types of each variable:")
print(metadata.dtypes)

# Check unique values and frequency of categorical variables
print("\nUnique values and frequency of categorical variables:")
categorical_variables = metadata.select_dtypes(include=['object']).columns
for col in categorical_variables:
    print(f"\n{col}:")
    print(metadata[col].value_counts())


First few rows of the dataset:
           Column        Sample record          Description Extra Notes
0  IncidentNumber      000008-01012018  LFB Incident Number         NaN
1      DateOfCall  2018-01-01 00:00:00     Date of 999 call         NaN
2         CalYear                 2018     Year of 999 call         NaN
3      TimeOfCall             00:04:25     Time of 999 call         NaN
4      HourOfCall                    0     Hour of 999 call         NaN

Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Column         39 non-null     object
 1   Sample record  36 non-null     object
 2   Description    39 non-null     object
 3   Extra Notes    6 non-null      object
dtypes: object(4)
memory usage: 1.3+ KB
None

Summary statistics of numeric columns:
                Column  Sample record   Description         

**2.2 Identify Data Quality Issues**

In [8]:
# Check for missing values
print("\nMissing values in the dataset:")
print(metadata.isnull().sum())

# Check for outliers in numeric columns (assuming 'numeric_cols' contains the names of numeric columns)
numeric_cols = metadata.select_dtypes(include=['int', 'float']).columns
print("\nOutliers in numeric columns:")
for col in numeric_cols:
    # Define a threshold for outliers detection (e.g., 3 standard deviations from the mean)
    threshold = 3 * metadata[col].std()
    outliers = metadata[(metadata[col] > metadata[col].mean() + threshold) | (metadata[col] < metadata[col].mean() - threshold)]
    if not outliers.empty:
        print(f"{col}: {outliers.shape[0]} outliers found")

# Check for imbalanced proportions of categorical variables
print("\nImbalanced proportions of categorical variables:")
categorical_variables = metadata.select_dtypes(include=['object']).columns
for col in categorical_variables:
    print(f"{col}:")
    print(metadata[col].value_counts(normalize=True))

# Check for incomparable value ranges of numeric variables
print("\nIncomparable value ranges of numeric variables:")
for col in numeric_cols:
    min_val = metadata[col].min()
    max_val = metadata[col].max()
    if max_val - min_val == 0:
        print(f"{col}: All values are the same")
    else:
        print(f"{col}: Range = {min_val} - {max_val}")



Missing values in the dataset:
Column            0
Sample record     3
Description       0
Extra Notes      33
dtype: int64

Outliers in numeric columns:

Imbalanced proportions of categorical variables:
Column:
Column
IncidentNumber                            0.025641
FirstPumpArriving_AttendanceTime          0.025641
Northing_m                                0.025641
Easting_rounded                           0.025641
Northing_rounded                          0.025641
Latitude                                  0.025641
Longitude                                 0.025641
FRS                                       0.025641
IncidentStationGround                     0.025641
FirstPumpArriving_DeployedFromStation     0.025641
IncGeo_WardNameNew                        0.025641
SecondPumpArriving_AttendanceTime         0.025641
SecondPumpArriving_DeployedFromStation    0.025641
NumStationsWithPumpsAttending             0.025641
NumPumpsAttending                         0.025641
PumpCount      

**2.3 Evaluate Dataset Appropriateness**

In [13]:
# Assess if additional data is needed
additional_features_needed = False
if "WeatherData" not in metadata.columns:
    additional_features_needed = True
    print("Additional weather data may be needed for predictive modeling tasks.")

more_historical_data_needed = False
if "Year" in metadata.columns and metadata["Year"].max() < 2021:
    more_historical_data_needed = True
    print("More recent historical data may be needed for trend analysis.")

Additional weather data may be needed for predictive modeling tasks.


#**Task 3: Data Preparation**

**3.1 Variable Selection**

In [16]:
# Selecting relevant features for predictive modeling
predictive_features = [
    "IncidentGroup",  # High level incident category
    "StopCodeDescription",  # Detailed incident category
    "PropertyCategory",  # High level property descriptor
    "IncidentStationGround",  # LFB Station ground
    "NumPumpsAttending",  # Number of pumps in attendance
    "NumCalls"  # Number of 999 calls made for an incident
]

# Selecting relevant features for descriptive modeling
descriptive_features = [
    "IncGeo_BoroughName",  # Borough Name
    "IncGeo_WardName",  # Ward Name
    "Latitude",  # Latitude
    "Longitude",  # Longitude
    "NumStationsWithPumpsAttending"  # Number of stations with pumps in attendance
]

# Print selected features for each analysis
print("Selected features for predictive modeling:")
print(predictive_features)

print("\nSelected features for descriptive modeling:")
print(descriptive_features)

Selected features for predictive modeling:
['IncidentGroup', 'StopCodeDescription', 'PropertyCategory', 'IncidentStationGround', 'NumPumpsAttending', 'NumCalls']

Selected features for descriptive modeling:
['IncGeo_BoroughName', 'IncGeo_WardName', 'Latitude', 'Longitude', 'NumStationsWithPumpsAttending']


**3.2 Data Pre-processing**

In [23]:
# 1. Handling Missing Values
# Let's fill missing values in numeric columns with their mean and categorical columns with the mode

# Find numeric and categorical columns
numeric_cols = metadata.select_dtypes(include=['int', 'float']).columns
categorical_cols = metadata.select_dtypes(include=['object']).columns

# Fill missing values in numeric columns with mean
metadata[numeric_cols] = metadata[numeric_cols].fillna(metadata[numeric_cols].mean())

# Fill missing values in categorical columns with mode
mode_values = metadata[categorical_cols].mode()
if not mode_values.empty:
    mode_value = mode_values.iloc[0]
    metadata[categorical_cols] = metadata[categorical_cols].fillna(mode_value)

# 2. Encoding Categorical Variables (if needed)
# Check if categorical columns exist before encoding
if not categorical_cols.empty:
    # Example: Using one-hot encoding for categorical variables
    metadata = pd.get_dummies(metadata, columns=categorical_cols)


# 3. Feature Scaling (if needed)
# Check if numeric columns exist before feature scaling
if not numeric_cols.empty:
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    metadata[numeric_cols] = scaler.fit_transform(metadata[numeric_cols])

# Print first few rows of pre-processed data
print("Pre-processed data:")
print(metadata.head())

Pre-processed data:
   Column_AddressQualifier  Column_CalYear  Column_DateOfCall  \
0                    False           False              False   
1                    False           False               True   
2                    False            True              False   
3                    False           False              False   
4                    False           False              False   

   Column_Easting_m  Column_Easting_rounded  Column_FRS  \
0             False                   False       False   
1             False                   False       False   
2             False                   False       False   
3             False                   False       False   
4             False                   False       False   

   Column_FirstPumpArriving_AttendanceTime  \
0                                    False   
1                                    False   
2                                    False   
3                                    False   
4   

#**Task 4: Model Construction**

**4.1 Data Mining Tasks**

In [31]:
import pandas as pd
from sklearn.cluster import KMeans

# Load your dataset
# Assuming you've already loaded your dataset into a DataFrame named metadata

# Selecting numeric columns for clustering
numeric_columns = [
    'CalYear', 'HourOfCall', 'FirstPumpArriving_AttendanceTime',
    'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'PumpCount',
    'PumpHoursRoundUp', 'Notional Cost (£)', 'NumCalls'
]

# Selecting features for clustering
X_cluster = metadata[numeric_columns]

# Check for missing values in X_cluster
missing_values = X_cluster.isnull().sum()
print("Missing values in X_cluster:")
print(missing_values)

# If there are missing values, handle them appropriately (e.g., fill with mean, median, or drop rows)
X_cluster.fillna(X_cluster.mean(), inplace=True)  # Filling missing values with mean

# Check the data types of the columns in X_cluster
print("Data types of columns in X_cluster:")
print(X_cluster.dtypes)

# If needed, convert data types to ensure compatibility with KMeans
# For example, convert object columns to numeric using pd.to_numeric()
X_cluster = X_cluster.apply(pd.to_numeric, errors='coerce')

# Now, try fitting the KMeans clustering algorithm again
kmeans = KMeans(n_clusters=3)  # Assuming 3 clusters
kmeans.fit(X_cluster)

# Adding cluster labels to the dataframe
metadata['Cluster'] = kmeans.labels_

# Print the first few rows with cluster labels
print("Data with Cluster labels:")
print(metadata.head())


Missing values in X_cluster:
CalYear                             0
HourOfCall                          0
FirstPumpArriving_AttendanceTime    0
NumStationsWithPumpsAttending       0
NumPumpsAttending                   0
PumpCount                           0
PumpHoursRoundUp                    0
Notional Cost (£)                   0
NumCalls                            0
dtype: int64
Data types of columns in X_cluster:
CalYear                             bool
HourOfCall                          bool
FirstPumpArriving_AttendanceTime    bool
NumStationsWithPumpsAttending       bool
NumPumpsAttending                   bool
PumpCount                           bool
PumpHoursRoundUp                    bool
Notional Cost (£)                   bool
NumCalls                            bool
dtype: object
Data with Cluster labels:
   AddressQualifier  CalYear  DateOfCall  Easting_m  Easting_rounded    FRS  \
0             False    False       False      False            False  False   
1            

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_cluster.fillna(X_cluster.mean(), inplace=True)  # Filling missing values with mean


**4.2 Model Parameter Tuning**

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assuming 'metadata' is the provided dataset

# Define features and target variable
predictive_features = ['CalYear', 'HourOfCall', 'NumPumpsAttending', 'NumCalls']
target_variable = 'IncidentGroup'

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(metadata[predictive_features], metadata[target_variable], test_size=0.2, random_state=42)

# Build a Random Forest Classifier model
rf_model = RandomForestClassifier()

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Random Forest Classifier:", accuracy)


Accuracy of Random Forest Classifier: 1.0


#**Task 5: Model Interpretation and Evaluation**

**5.1 Interpret Descriptive Models**

In [34]:
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assuming 'metadata' is the provided dataset

# Define features and target variable for predictive modeling
predictive_features = ['CalYear', 'HourOfCall', 'NumPumpsAttending', 'NumCalls']
target_variable = 'IncidentGroup'

# Split data into training and testing sets for predictive modeling
X_train_pred, X_test_pred, y_train_pred, y_test_pred = train_test_split(metadata[predictive_features], metadata[target_variable], test_size=0.2, random_state=42)

# Build and train a decision tree classifier for predictive modeling
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train_pred, y_train_pred)

# Predict on the test set for predictive modeling
y_pred_pred = decision_tree_classifier.predict(X_test_pred)

# Calculate accuracy for predictive modeling
accuracy_pred = accuracy_score(y_test_pred, y_pred_pred)
print("Accuracy of Decision Tree Classifier:", accuracy_pred)

# Define features for descriptive modeling (k-means clustering)
descriptive_features = ['CalYear', 'HourOfCall']

# Prepare data for k-means clustering
X_cluster = metadata[descriptive_features]

# Training the K-Means clustering algorithm
kmeans = KMeans(n_clusters=3, n_init=10)
kmeans.fit(X_cluster)

# Adding cluster labels to the dataframe
metadata['Cluster'] = kmeans.labels_

# Print the first few rows of the dataframe with cluster labels
print(metadata.head())


Accuracy of Decision Tree Classifier: 1.0
   AddressQualifier  CalYear  DateOfCall  Easting_m  Easting_rounded    FRS  \
0             False    False       False      False            False  False   
1             False    False        True      False            False  False   
2             False     True       False      False            False  False   
3             False    False       False      False            False  False   
4             False    False       False      False            False  False   

   FirstPumpArriving_AttendanceTime  FirstPumpArriving_DeployedFromStation  \
0                             False                                  False   
1                             False                                  False   
2                             False                                  False   
3                             False                                  False   
4                             False                                  False   

   HourOfCall 

**5.2 Compare Predictive Models**

In [None]:
from sklearn.metrics import accuracy_score, mean_squared_error

# Assuming y_true and y_pred are the true and predicted labels, respectively
accuracy = accuracy_score(y_true, y_pred)

# Assuming y_true_reg and y_pred_reg are the true and predicted regression values, respectively
mse = mean_squared_error(y_true_reg, y_pred_reg)

# Assessing overfitting using cross-validation
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation
mean_cv_score = cv_scores.mean()

# Assessing model simplicity (for decision trees)
depth = model.get_depth()
num_nodes = model.tree_.node_count

# Assessing model simplicity (for neural networks)
num_layers = len(model.coefs_)
num_neurons = sum([layer.shape[1] for layer in model.coefs_])

# Assessing computational cost or training time
training_time = # Measure training time for each model

# Print or log the evaluation results
print(f"Accuracy: {accuracy}")
print(f"Mean Squared Error: {mse}")
print(f"Mean Cross-Validation Score: {mean_cv_score}")
print(f"Tree Depth: {depth}")
print(f"Number of Nodes: {num_nodes}")
print(f"Number of Layers: {num_layers}")
print(f"Number of Neurons: {num_neurons}")
print(f"Training Time: {training_time}")

**5.3 Discuss Model Usefulness**

#**Task 6: Summary and Suggestions**