In [12]:
import pandas as pd
# Load your dataset
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Check for missing values
print(train_data.isnull().sum())
print(test_data.isnull().sum())

# Fill missing values with mean/mode or drop rows/columns
train_data = train_data.fillna(train_data.mean())
test_data = test_data.fillna(test_data.mean())

# Separate features and targets
train_features = train_data[['Lat', 'Long_']]
train_deaths_target = train_data['Deaths']
train_cfr_target = train_data['Case_Fatality_Ratio']

test_features = test_data[['Lat', 'Long_']]

Lat                      91
Long_                    91
Deaths                 1558
Case_Fatality_Ratio      44
dtype: int64
Lat      23
Long_    23
dtype: int64


In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Standardize the features
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)




In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Split the data into training and validation sets for deaths
X_train_deaths, X_val_deaths, y_train_deaths, y_val_deaths = train_test_split(train_features_scaled, train_deaths_target, test_size=0.2, random_state=42)

# Train the model for predicting deaths
deaths_model = RandomForestRegressor(n_estimators=100, random_state=42)
deaths_model.fit(X_train_deaths, y_train_deaths)

# Validate the model for deaths
val_predictions_deaths = deaths_model.predict(X_val_deaths)
rmse_deaths = np.sqrt(mean_squared_error(y_val_deaths, val_predictions_deaths))
print(f'Validation RMSE for Deaths: {rmse_deaths}')


Validation RMSE for Deaths: 37.215868083762146


In [19]:
# Split the data into training and validation sets for CFR
X_train_cfr, X_val_cfr, y_train_cfr, y_val_cfr = train_test_split(train_features_scaled, train_cfr_target, test_size=0.2, random_state=42)

# Train the model for predicting CFR
cfr_model = RandomForestRegressor(n_estimators=100, random_state=42)
cfr_model.fit(X_train_cfr, y_train_cfr)

# Validate the model for CFR
val_predictions_cfr = cfr_model.predict(X_val_cfr)
rmse_cfr = np.sqrt(mean_squared_error(y_val_cfr, val_predictions_cfr))
print(f'Validation RMSE for Case Fatality Ratio: {rmse_cfr}')


Validation RMSE for Case Fatality Ratio: 22.17294963505846


In [28]:
# Predict deaths and CFR for the test dataset
test_predictions_deaths = deaths_model.predict(test_features_scaled)
test_predictions_cfr = cfr_model.predict(test_features_scaled)

# Combine predictions into a DataFrame
test_predictions = pd.DataFrame({
    'Lat': test_data['Lat'],
    'Long_': test_data['Long_'],
    'Predicted_Deaths': test_predictions_deaths,
    'Predicted_Case_Fatality_Ratio': test_predictions_cfr
})

print(test_predictions)


            Lat       Long_  Predicted_Deaths  Predicted_Case_Fatality_Ratio
0     41.153300   20.168300         70.753051                       1.313687
1    -71.949900   23.347000         13.408019                       0.942462
2    -35.473500  149.012400        114.148019                       0.132549
3    -34.928500  138.600700         80.079972                       0.214274
4     47.516200   14.550100         72.132807                       0.604589
...         ...         ...               ...                            ...
999   49.213800   -2.135800        107.860244                       1.088025
1000  -7.946700  -14.355900         76.065155                       1.834367
1001  52.130700   -3.783700         72.607868                       1.025133
1002   6.423800  -66.589700         57.417136                       1.000845
1003  15.552727   48.516388        137.966037                       0.902442

[1004 rows x 4 columns]
