In [1]:
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Load the raw training features and print a per-column summary
# (dtype and non-null count) to see which fields have missing values.
df = pd.read_csv(filepath_or_buffer='data/training/input.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146532 entries, 0 to 146531
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   date                       146532 non-null  object 
 1   state                      146532 non-null  object 
 2   city_or_county             146532 non-null  object 
 3   address                    136245 non-null  object 
 4   latitude                   146532 non-null  float64
 5   longitude                  146532 non-null  float64
 6   congressional_district     143942 non-null  float64
 7   state_house_district       127740 non-null  float64
 8   state_senate_district      132002 non-null  float64
 9   participant_age1           112187 non-null  float64
 10  participant_age_group1     133183 non-null  object 
 11  participant_gender1        137680 non-null  object 
 12  min_age_participants       112197 non-null  float64
 13  avg_age_participants       11

In [2]:
def get_decisionTree_dataset(input_dir: str, output_dir: str, scaler=None, fit_scaler: bool = True):
    """Load a feature CSV and a label CSV and return scaled features plus labels.

    Both files are read with ``pd.read_csv`` and inner-joined on their row
    index. Only the four count columns listed in ``feature_columns`` and the
    ``killed`` column are kept; rows with a missing value in any of those
    columns are dropped before scaling.

    Args:
        input_dir: Path of the CSV file with the feature columns (despite the
            name, this is passed straight to ``pd.read_csv``, so it is a file
            path, not a directory).
        output_dir: Path of the CSV file merged alongside it; the merged frame
            must contain a ``killed`` column.
        scaler: Optional object exposing ``fit_transform`` and ``transform``.
            When omitted, a new ``StandardScaler`` is created and fitted on
            this data, which is the original behaviour.
        fit_scaler: When True (default) the scaler is fitted on this data.
            Pass False together with a scaler that was already fitted on the
            training data to apply the training statistics to another split
            instead of re-estimating them.

    Returns:
        A ``(features, target)`` tuple: the scaled feature matrix (one column
        per entry of ``feature_columns``) and the ``killed`` values as an array.

    Raises:
        ValueError: If ``fit_scaler`` is False but no scaler was supplied.
        KeyError: If a required column is missing from the merged frame.
    """
    if scaler is None and not fit_scaler:
        raise ValueError("fit_scaler=False requires an already fitted scaler")

    features_df = pd.read_csv(input_dir)
    labels_df = pd.read_csv(output_dir)
    merged = pd.merge(features_df, labels_df, left_index=True, right_index=True)

    feature_columns = ['n_injured', 'n_arrested', 'n_unharmed', 'n_participants']

    # Non-mutating dropna: calling dropna(inplace=True) on a column subset of
    # another frame is the classic trigger for SettingWithCopyWarning.
    dataset = merged[feature_columns + ['killed']].dropna()

    features = dataset[feature_columns].values
    target = dataset['killed'].values

    if scaler is None:
        # Use the documented public location of StandardScaler; the name that
        # sklearn.discriminant_analysis exposes is only an incidental re-export.
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()

    # Standardize the features, either estimating the statistics here or
    # reusing the ones the caller's scaler already holds.
    if fit_scaler:
        features = scaler.fit_transform(features)
    else:
        features = scaler.transform(features)

    return features, target

# Build the scaled training features and the 'killed' labels used by the cells below.
input, output = get_decisionTree_dataset(
    input_dir='data/training/input.csv',
    output_dir='data/training/output.csv',
)

**Cross validation**

In [3]:
# Depth is capped at 4. random_state is fixed because scikit-learn permutes the
# features at every split, so ties between equally good splits would otherwise
# be broken differently from run to run and the report would not be reproducible.
clf = DecisionTreeClassifier(max_depth=4, random_state=42)

# Get the 5-fold cross-validation predictions for the training data
predictions = cross_val_predict(clf, input, output, cv=5)

# Compute and print the classification report
report = classification_report(output, predictions)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.96      0.94     93154
           1       0.88      0.75      0.81     34671

    accuracy                           0.90    127825
   macro avg       0.89      0.85      0.87    127825
weighted avg       0.90      0.90      0.90    127825



**Training del miglior modello individuato sull'intero training set e valutazione sul test set**

In [4]:
# Reload the training split and load the held-out test split.
# NOTE(review): called like this, get_decisionTree_dataset fits a separate
# StandardScaler on each call, so the test features are standardized with
# test-set statistics rather than the training ones - confirm this is intended.
input, output = get_decisionTree_dataset(
    input_dir='data/training/input.csv',
    output_dir='data/training/output.csv',
)
input_test, output_test = get_decisionTree_dataset(
    input_dir='data/testing/input.csv',
    output_dir='data/testing/output.csv',
)

In [5]:
# Create the Decision Tree classifier. random_state is fixed because
# scikit-learn permutes the features at every split, so ties between equally
# good splits would otherwise make the fitted tree vary between runs.
base_estimator = DecisionTreeClassifier(max_depth=4, random_state=42)
clf = base_estimator

# Train the classifier on the training data
clf.fit(input, output)

# Predict on the test data
output_pred = clf.predict(input_test)

# Compute and print the classification report
report = classification_report(output_test, output_pred)
print("Classification Report:\n", report)

# Confusion matrix: rows are the true classes, columns the predicted classes
cm = confusion_matrix(output_test, output_pred)
print("Confusion Matrix:\n", cm)

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.96      0.93     23305
           1       0.87      0.74      0.80      8656

    accuracy                           0.90     31961
   macro avg       0.89      0.85      0.87     31961
weighted avg       0.90      0.90      0.90     31961

Confusion Matrix:
 [[22373   932]
 [ 2246  6410]]
