summary statistics and models

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("cleaned_data_for_eda.csv")

In [5]:
#summary statistics
# Generate summary statistics for numerical columns
numerical_summary = df.describe()

# Generate summary statistics for non-numerical columns
categorical_summary = df.select_dtypes(exclude=['number']).describe()

# Combine numerical and categorical summaries
combined_summary = pd.concat([numerical_summary, categorical_summary], axis=1)

# Save the combined summary to a CSV file
combined_summary.to_csv('combined_summary.csv')

print("Combined summary statistics saved successfully.")


Combined summary statistics saved successfully.


In [6]:
# Get a list of non-numerical columns
non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()

# Print the list of non-numerical columns
print("Non-numerical columns:")
print(non_numerical_columns)


Non-numerical columns:
['Park Name', 'Cause of Death', 'DeathCategory', 'Intent', 'Outcome', 'Sex', 'Age Range', 'Activity', 'date', 'latlong', 'weather_sunrise', 'weather_sunset', 'weather_conditions', 'weather_description', 'weather_icon', 'weather_stations', 'weather_hour_0_conditions', 'weather_hour_0_icon', 'weather_hour_0_stations', 'weather_hour_1_conditions', 'weather_hour_1_icon', 'weather_hour_1_stations', 'weather_hour_2_conditions', 'weather_hour_2_icon', 'weather_hour_2_stations', 'weather_hour_3_conditions', 'weather_hour_3_icon', 'weather_hour_3_stations', 'weather_hour_4_conditions', 'weather_hour_4_icon', 'weather_hour_4_stations', 'weather_hour_5_conditions', 'weather_hour_5_icon', 'weather_hour_5_stations', 'weather_hour_6_conditions', 'weather_hour_6_icon', 'weather_hour_6_stations', 'weather_hour_7_conditions', 'weather_hour_7_icon', 'weather_hour_7_stations', 'weather_hour_8_conditions', 'weather_hour_8_icon', 'weather_hour_8_stations', 'weather_hour_9_conditions'

In [7]:
# Columns to keep explicitly
columns_to_keep = ['Park Name', 'Cause of Death', 'DeathCategory', 'Sex', 'Age Range', 'Activity', 'date']

# Get all numerical columns
numerical_columns = df.select_dtypes(include=['number']).columns.tolist()

# Combine the specified columns and numerical columns
final_columns_to_keep = columns_to_keep + numerical_columns

# Subset the DataFrame to keep only these columns
df = df[final_columns_to_keep]

# Check the resulting DataFrame
print("Columns retained:")
print(df.columns)

# Save the updated DataFrame to a file (optional)
df.to_csv('filtered_with_numericals.csv', index=False)


Columns retained:
Index(['Park Name', 'Cause of Death', 'DeathCategory', 'Sex', 'Age Range',
       'Activity', 'date', 'Unnamed: 0.1', 'Unnamed: 0', 'latitude',
       ...
       'weather_hour_23_feelslike', 'weather_hour_23_humidity',
       'weather_hour_23_dew', 'weather_hour_23_precip',
       'weather_hour_23_precipprob', 'weather_hour_23_snowdepth',
       'weather_hour_23_windspeed', 'weather_hour_23_winddir',
       'weather_hour_23_pressure', 'weather_hour_23_cloudcover'],
      dtype='object', length=312)


In [8]:
# Columns to convert to dummy variables
categorical_columns = ['Park Name', 'Cause of Death', 'DeathCategory', 'Sex', 'Age Range', 'Activity']

# Convert categorical columns to dummy variables
df_with_dummies = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Check the resulting DataFrame
print("Shape of DataFrame with dummy variables:", df_with_dummies.shape)

# Save the resulting DataFrame
df_with_dummies.to_csv('dataset_with_dummies.csv', index=False)


Shape of DataFrame with dummy variables: (4551, 623)


In [9]:
# Drop rows where the class (Cause of Death) occurs less than 2 times
class_counts = df['Cause of Death'].value_counts()
valid_classes = class_counts[class_counts >= 2].index

# Filter the dataset
df = df[df['Cause of Death'].isin(valid_classes)]

print(f"Remaining classes: {df['Cause of Death'].nunique()}")


Remaining classes: 41


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Target variable: Cause of Death
y = df['Cause of Death']

# Features: drop the target column
X = df.drop(columns=['Cause of Death'])

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize Logistic Regression with multinomial solver for multiclass classification
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500)

# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


ValueError: could not convert string to float: 'Anacostia Park'

In [12]:
from sklearn.preprocessing import LabelEncoder
y = LabelEncoder().fit_transform(df['Cause of Death'])


In [13]:
# Save the target column before encoding
y = df['Cause of Death']  # Retain the unmodified target column

# Encode all non-numeric columns
non_numeric_columns = df.select_dtypes(include=['object', 'category']).columns
print(f"Non-numerical columns detected: {list(non_numeric_columns)}")

df_encoded = pd.get_dummies(df, columns=non_numeric_columns, drop_first=True)
print("Shape of DataFrame after encoding:", df_encoded.shape)

# Separate features (X) and target variable (y)
X = df_encoded.drop(columns=[col for col in df_encoded.columns if col.startswith('Cause of Death_')], errors='ignore')

# If needed, encode the target variable separately
from sklearn.preprocessing import LabelEncoder
y_encoded = LabelEncoder().fit_transform(y)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

# Logistic Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=500)
model.fit(X_train, y_train)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred = model.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, zero_division=1))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Non-numerical columns detected: ['Park Name', 'Cause of Death', 'DeathCategory', 'Sex', 'Age Range', 'Activity', 'date']
Shape of DataFrame after encoding: (4545, 3411)




KeyboardInterrupt: 

In [18]:
# Save the target column before encoding
y = df['Cause of Death']  # Retain the unmodified target column

# Encode all non-numeric columns
non_numeric_columns = df.select_dtypes(include=['object', 'category']).columns
print(f"Non-numerical columns detected: {list(non_numeric_columns)}")

df_encoded = pd.get_dummies(df, columns=non_numeric_columns, drop_first=True)
print("Shape of DataFrame after encoding:", df_encoded.shape)

# Separate features (X) and target variable (y)
X = df_encoded.drop(columns=[col for col in df_encoded.columns if col.startswith('Cause of Death_')], errors='ignore')

# Encode the target variable
from sklearn.preprocessing import LabelEncoder
y_encoded = LabelEncoder().fit_transform(y)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

# KNN model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)  # You can adjust `n_neighbors` as needed
knn.fit(X_train, y_train)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred = knn.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Non-numerical columns detected: ['Park Name', 'Cause of Death', 'DeathCategory', 'Sex', 'Age Range', 'Activity', 'date']
Shape of DataFrame after encoding: (4545, 3411)
Accuracy Score: 0.24642464246424642

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.20      0.29         5
           1       0.17      0.33      0.22         9
           2       0.00      0.00      0.00         1
           3       0.14      0.25      0.18         8
           4       0.00      0.00      0.00         4
           6       0.30      0.50      0.37       169
           7       1.00      0.00      0.00         1
           8       0.18      0.20      0.19        98
           9       1.00      0.00      0.00         1
          10       0.00      0.00      0.00         4
          11       1.00      0.00      0.00         1
          12       1.00      0.00      0.00         1
          13       1.00      0.00      0.00         1
          14 

In [14]:
# Save the target column before encoding
y = df['Cause of Death']  # Retain the unmodified target column

#Fix Date!!

# Convert the date column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Extract year and month as separate columns
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month

# Decide whether to drop 'year' or keep both 'month' and 'year'
df.drop(columns=['date'], inplace=True)

# If you want to encode 'month' as categorical (optional):
df = pd.get_dummies(df, columns=['month'], drop_first=True)


# Encode all non-numeric columns
non_numeric_columns = df.select_dtypes(include=['object', 'category']).columns
print(f"Non-numerical columns detected: {list(non_numeric_columns)}")

df_encoded = pd.get_dummies(df, columns=non_numeric_columns, drop_first=True)
print("Shape of DataFrame after encoding:", df_encoded.shape)

# Separate features (X) and target variable (y)
X = df_encoded.drop(columns=[col for col in df_encoded.columns if col.startswith('Cause of Death_')], errors='ignore')

# Encode the target variable
from sklearn.preprocessing import LabelEncoder
y_encoded = LabelEncoder().fit_transform(y)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

# Random Forest model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of trees (n_estimators)
rf.fit(X_train, y_train)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred = rf.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Non-numerical columns detected: ['Park Name', 'Cause of Death', 'DeathCategory', 'Sex', 'Age Range', 'Activity']
Shape of DataFrame after encoding: (4545, 628)
Accuracy Score: 0.9317931793179318

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.89      0.89      0.89         9
           2       1.00      0.00      0.00         1
           3       0.78      0.88      0.82         8
           4       1.00      0.25      0.40         4
           6       0.82      0.98      0.89       169
           7       1.00      0.00      0.00         1
           8       0.96      0.98      0.97        98
           9       1.00      0.00      0.00         1
          10       1.00      0.25      0.40         4
          11       1.00      0.00      0.00         1
          12       1.00      0.00      0.00         1
          13       1.00      1.00      1.00         1
          14       1.00

In [22]:
df_encoded.to_csv('FINAL_DATASET.csv')

In [15]:
import numpy as np

unique, counts = np.unique(y_encoded, return_counts=True)
class_distribution = dict(zip(unique, counts))
print(class_distribution)


{0: 26, 1: 46, 2: 7, 3: 41, 4: 22, 5: 2, 6: 844, 7: 4, 8: 489, 9: 2, 10: 21, 11: 3, 12: 2, 13: 4, 14: 12, 15: 50, 16: 3, 17: 80, 18: 44, 19: 2, 20: 4, 21: 6, 22: 317, 23: 237, 24: 82, 25: 820, 26: 6, 27: 4, 28: 21, 29: 4, 30: 24, 31: 20, 32: 2, 33: 5, 34: 2, 35: 642, 36: 4, 37: 585, 38: 31, 39: 15, 40: 10}


In [16]:
# Add y to X with the desired column name
X_with_y = X.copy()  # Make a copy of X to avoid modifying the original
X_with_y['Cause of Death'] = y

# Confirm the result
print(X_with_y.head())  # Preview the updated DataFrame
print("Shape of the updated DataFrame:", X_with_y.shape)


   Unnamed: 0.1  Unnamed: 0   latitude   longitude  weather_tempmax  \
0             1         1.0  37.849927 -122.517752             59.6   
1             2         2.0  37.849927 -122.517752             59.6   
2             3         3.0  32.007155  -90.858514             45.3   
3             4         4.0  32.007155  -90.858514             45.3   
4             5         5.0  40.598479  -73.857927             26.0   

   weather_tempmin  weather_temp  weather_feelslikemax  weather_feelslikemin  \
0             41.2          49.0                  59.6                  37.4   
1             41.2          49.0                  59.6                  37.4   
2             21.7          34.4                  42.3                  21.7   
3             21.7          34.4                  42.3                  21.7   
4             13.9          19.6                  13.8                   1.0   

   weather_feelslike  ...  Activity_Snowmobiling  Activity_Snowshoeing  \
0               47

In [17]:
X_with_y.to_csv('Final_dataset.csv')

MAnifold learning --- t-sne and umap