In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Read the injury records from the CSV file in the working directory.
data = pd.read_csv('Injury_data.csv')

# Preview the top five rows as plain text to sanity-check columns and values.
print(data.head(n=5))

   Player_Age  Player_Weight  Player_Height  Previous_Injuries  \
0          24      66.251933     175.732429                  1   
1          37      70.996271     174.581650                  0   
2          32      80.093781     186.329618                  0   
3          28      87.473271     175.504240                  1   
4          25      84.659220     190.175012                  0   

   Training_Intensity  Recovery_Time  Likelihood_of_Injury  
0            0.457929              5                     0  
1            0.226522              6                     1  
2            0.613970              2                     1  
3            0.252858              4                     1  
4            0.577632              1                     1  


In [3]:
# Count missing entries per column before modelling.
print(data.isna().sum())

# Everything except the label column is used as a predictor.
X = data.drop(columns='Likelihood_of_Injury')
# The label the classifier learns to predict.
y = data['Likelihood_of_Injury']

# No categorical encoding step: the preview of the data shows only numeric columns.

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Player_Age              0
Player_Weight           0
Player_Height           0
Previous_Injuries       0
Training_Intensity      0
Recovery_Time           0
Likelihood_of_Injury    0
dtype: int64


In [4]:
# Random forest of 100 trees; the fixed seed makes training repeatable.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Train on the training split only; the test split stays unseen until evaluation.
model.fit(X_train, y_train)

In [5]:
# Predict a class label for every row of the held-out test features.
# y_pred is compared against y_test in the evaluation cell that follows.
y_pred = model.predict(X_test)

In [6]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Per-class precision, recall, F1 and support, plus overall accuracy and averages.
print(classification_report(y_true=y_test, y_pred=y_pred))

[[51 44]
 [41 64]]
              precision    recall  f1-score   support

           0       0.55      0.54      0.55        95
           1       0.59      0.61      0.60       105

    accuracy                           0.57       200
   macro avg       0.57      0.57      0.57       200
weighted avg       0.57      0.57      0.57       200



In [7]:
# Importance scores from the fitted forest, paired with the feature column labels.
importances = model.feature_importances_
feature_names = X.columns

# Tabulate name/score pairs and rank them from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table as plain text.
print(feature_importance_df)

              Feature  Importance
1       Player_Weight    0.249821
2       Player_Height    0.238023
4  Training_Intensity    0.233393
0          Player_Age    0.152475
5       Recovery_Time    0.089738
3   Previous_Injuries    0.036550


In [8]:
# Example of predicting for one new player with the model trained above.
# Values follow the feature column order of X: age, weight, height,
# previous injuries, training intensity, recovery time.
new_data = np.array([[25, 70, 180, 1, 0.5, 5]])  # Example input

# The model was fitted on a DataFrame, so attach the same column names before
# predicting. Passing the bare array would rely on implicit positional order and
# makes scikit-learn warn that X does not have valid feature names.
new_sample = pd.DataFrame(new_data, columns=X.columns)
prediction = model.predict(new_sample)
print(f'Predicted Likelihood of Injury: {prediction[0]}')

Predicted Likelihood of Injury: 1


