In [52]:
import pandas as pd
from helper import zip_gz, unzip_gz
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,LogisticRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score
import xgboost as xgb

In [53]:
# Read the flu/weather CSV from the data directory and preview the first rows.
flu_weather_csv = "../data/flu_weather_df.csv"
df = pd.read_csv(flu_weather_csv)
df.head()

Unnamed: 0,NAME,Year,Population,QUARTER,ILITOTAL,Population_Density,cloud_cover.afternoon,humidity.afternoon,precipitation.total,temperature.min,temperature.max,temperature.afternoon,temperature.night,temperature.evening,temperature.morning,pressure.afternoon,wind.max.speed,wind.max.direction
0,Alabama,2010,4785514,4,7488,94,43.596667,45.773333,2.153333,277.351667,290.433333,287.283333,282.62,283.508333,280.126667,1020.376667,4.173333,291.02
1,Alaska,2010,713982,4,202,1,41.683333,79.2,0.588333,256.83,262.808333,259.55,261.721667,258.753333,259.638333,1007.65,3.881667,41.938333
2,Arizona,2010,6407342,4,4088,56,16.743333,32.43,0.0,274.188333,290.601667,287.216667,278.275,287.215,274.833333,1020.833333,3.233333,146.158333
3,Arkansas,2010,2921998,4,640,56,8.358333,44.506667,0.013333,278.033333,289.666667,287.205,281.496667,285.348333,278.208333,1023.538333,4.621667,260.038333
4,California,2010,37319550,4,11502,239,34.735833,48.7425,0.935,280.821667,292.448333,288.748333,284.5025,290.601667,281.256667,1020.871667,3.58,198.761667


cloud_cover.afternoon    0.024689
4      humidity.afternoon    0.016217
13         wind.max.speed    0.014681
5     precipitation.total    0.009668

In [54]:
# Remove the columns regarded as noise, then preview the trimmed frame.
noise_columns = ['precipitation.total', 'wind.max.speed', 'humidity.afternoon']
df = df.drop(columns=noise_columns)
df.head()

Unnamed: 0,NAME,Year,Population,QUARTER,ILITOTAL,Population_Density,cloud_cover.afternoon,temperature.min,temperature.max,temperature.afternoon,temperature.night,temperature.evening,temperature.morning,pressure.afternoon,wind.max.direction
0,Alabama,2010,4785514,4,7488,94,43.596667,277.351667,290.433333,287.283333,282.62,283.508333,280.126667,1020.376667,291.02
1,Alaska,2010,713982,4,202,1,41.683333,256.83,262.808333,259.55,261.721667,258.753333,259.638333,1007.65,41.938333
2,Arizona,2010,6407342,4,4088,56,16.743333,274.188333,290.601667,287.216667,278.275,287.215,274.833333,1020.833333,146.158333
3,Arkansas,2010,2921998,4,640,56,8.358333,278.033333,289.666667,287.205,281.496667,285.348333,278.208333,1023.538333,260.038333
4,California,2010,37319550,4,11502,239,34.735833,280.821667,292.448333,288.748333,284.5025,290.601667,281.256667,1020.871667,198.761667


In [55]:
# Gather every distinct value of the NAME column, then sort the names.
all_states = [state_name for state_name in df['NAME'].unique()]

states_sorted = sorted(all_states)
print(states_sorted)

['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']


In [56]:
# Assign each state a 1-based integer code following the order of states_sorted,
# then store the code for every row in a new State_Number column.
state_codes = range(1, len(states_sorted) + 1)
state_to_number = dict(zip(states_sorted, state_codes))
df['State_Number'] = df['NAME'].map(state_to_number)

In [57]:
# Drop the NAME column (its values are already encoded in State_Number) and Year.
# The result is reassigned rather than using inplace=True: in-place mutation
# silently changes a frame that earlier cells have already displayed and
# prevents method chaining, with no performance benefit.
df = df.drop(columns=['NAME', 'Year'])
df.head()

Unnamed: 0,Population,QUARTER,ILITOTAL,Population_Density,cloud_cover.afternoon,temperature.min,temperature.max,temperature.afternoon,temperature.night,temperature.evening,temperature.morning,pressure.afternoon,wind.max.direction,State_Number
0,4785514,4,7488,94,43.596667,277.351667,290.433333,287.283333,282.62,283.508333,280.126667,1020.376667,291.02,1
1,713982,4,202,1,41.683333,256.83,262.808333,259.55,261.721667,258.753333,259.638333,1007.65,41.938333,2
2,6407342,4,4088,56,16.743333,274.188333,290.601667,287.216667,278.275,287.215,274.833333,1020.833333,146.158333,3
3,2921998,4,640,56,8.358333,278.033333,289.666667,287.205,281.496667,285.348333,278.208333,1023.538333,260.038333,4
4,37319550,4,11502,239,34.735833,280.821667,292.448333,288.748333,284.5025,290.601667,281.256667,1020.871667,198.761667,5


In [58]:
# The target is the ILITOTAL column; every other column is used as a feature.
y = df['ILITOTAL']
X = df.drop(columns=['ILITOTAL'])

# Hold out a test split (library-default proportions) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Configure and fit the XGBoost regressor.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 150,
    'random_state': 42,
}
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Predict on the held-out rows and report the R-squared score.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R-squared score: {r2:.4f}")

R-squared score: 0.6281


In [59]:
# Select one held-out row (position 7 of the test split) as a single-row DataFrame.
sample_position = 7
prediction = X_test.iloc[[sample_position]]
prediction

Unnamed: 0,Population,QUARTER,Population_Density,cloud_cover.afternoon,temperature.min,temperature.max,temperature.afternoon,temperature.night,temperature.evening,temperature.morning,pressure.afternoon,wind.max.direction,State_Number
944,2990231,3,64,45.0,294.07,305.33,303.128333,296.553333,302.898333,294.318333,1016.833333,98.333333,24


In [60]:
# Observed ILITOTAL at position 7 of the test split, kept as a one-element Series.
target_position = 7
y_test.iloc[[target_position]]

944    3681
Name: ILITOTAL, dtype: int64

In [61]:
# Run the fitted model on the single-row `prediction` frame and display the
# result, for comparison with the observed ILITOTAL of the same test row.
model_prediction = model.predict(prediction)
model_prediction

array([3776.5913], dtype=float32)

In [62]:
# Importance scores exposed by the fitted model, one per feature.
importance_scores = model.feature_importances_

# Pair each score with its column name from X (X must be a DataFrame).
unsorted_importances = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importance_scores}
)

# Rank features from most to least important; the original row labels are kept.
feature_importance_df = unsorted_importances.sort_values(
    by='Importance', ascending=False
)

# Show the ranked table as plain text.
print(feature_importance_df)

                  Feature  Importance
12           State_Number    0.274623
1                 QUARTER    0.183764
0              Population    0.106233
2      Population_Density    0.073887
4         temperature.min    0.064803
6   temperature.afternoon    0.046966
8     temperature.evening    0.041324
7       temperature.night    0.037319
10     pressure.afternoon    0.037057
11     wind.max.direction    0.036494
5         temperature.max    0.036308
9     temperature.morning    0.036262
3   cloud_cover.afternoon    0.024963
