<a href="https://colab.research.google.com/github/LadyJ101/CODSOFT/blob/main/Zindi3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab VM so files stored there can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import class_weight
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb

In [None]:
# Locations of the competition CSVs on the mounted Drive.
train_path = '/content/drive/MyDrive/Zindi proj/train.csv'
test_path = '/content/drive/MyDrive/Zindi proj/test.csv'

# Read both splits; the generator yields the frames in (train, test) order.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Show the first five training rows (rendered as the cell's rich output).
train_df.head(n=5)

Unnamed: 0,ID,user_id,confidence,predicted_intensity,community,district,prediction_time,indicator,indicator_description,time_observed,Target,forecast_length
0,ID_KwcTp_12,11,0.3,0.0,Tumfa,atiwa_west,2025-05-30 11:09:33,,,,MEDIUMRAIN,12
1,ID_K9vWT_12,17,0.3,0.0,Kwabeng,atiwa_west,2025-05-30 11:09:35,,,,HEAVYRAIN,12
2,ID_AIQg3_12,19,0.3,0.0,Akropong,atiwa_west,2025-05-30 11:09:47,,,,MEDIUMRAIN,12
3,ID_px4yf_12,23,0.3,0.0,Asamama,atiwa_west,2025-05-30 11:16:33,,,,HEAVYRAIN,12
4,ID_QYYmK_12,23,0.3,0.0,Asamama,atiwa_west,2025-05-30 11:16:55,,,,HEAVYRAIN,12


In [None]:
# Show the first five test rows (rendered as the cell's rich output).
test_df.head(n=5)

Unnamed: 0,ID,user_id,confidence,predicted_intensity,community,district,prediction_time,indicator,indicator_description,time_observed,forecast_length
0,ID_SbTdy_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:28,,,,24
1,ID_SBKYz_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:29,,,,24
2,ID_fAimg_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:30,,,,24
3,ID_2wBqC_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:31,,,,24
4,ID_NItox_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:32,,,,24


In [None]:
# 1. Inspect the columns that look constant in the head() preview
for col_name in ('confidence', 'predicted_intensity'):
    print(f"Unique values in '{col_name}':", train_df[col_name].unique())

# 2. Count missing values per column to see which columns are (nearly) empty
missing_per_column = train_df.isnull().sum()
print("\nNumber of missing values in each column:")
print(missing_per_column)

# 3. Cardinality of the identifier-like / categorical columns
#    (the leading newline only precedes the first line, as a section break)
for lead, plural, col_name in (
    ("\n", "users", "user_id"),
    ("", "communities", "community"),
    ("", "districts", "district"),
):
    print(f"{lead}Number of unique {plural}:", train_df[col_name].nunique())

# 4. Class balance of the target
target_counts = train_df['Target'].value_counts()
print("\nTarget Distribution:")
print(target_counts)

Unique values in 'confidence': [0.3 0.6 1. ]
Unique values in 'predicted_intensity': [0.   0.66 0.33 1.  ]

Number of missing values in each column:
ID                           0
user_id                      0
confidence                   0
predicted_intensity          0
community                    0
district                     0
prediction_time              0
indicator                10425
indicator_description    10582
time_observed            10856
Target                       0
forecast_length              0
dtype: int64

Number of unique users: 43
Number of unique communities: 38
Number of unique districts: 3

Target Distribution:
Target
NORAIN        9612
MEDIUMRAIN     761
HEAVYRAIN      315
SMALLRAIN      240
Name: count, dtype: int64


In [None]:
import pandas as pd

# Parse the timestamps so the .dt accessor below works
train_df['prediction_time'] = pd.to_datetime(train_df['prediction_time'])
test_df['prediction_time'] = pd.to_datetime(test_df['prediction_time'])

# --- 1. DROP USELESS COLUMNS ---
# These three columns are missing for almost every training row (see the null counts above).
# errors='ignore' is used on BOTH frames so the cell stays re-runnable: on a second run the
# columns are already gone and a plain drop() would raise KeyError.
cols_to_drop = ['indicator', 'indicator_description', 'time_observed']
train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

# --- 2. CREATE TIME-BASED FEATURES ---
for df in [train_df, test_df]:
    df['hour'] = df['prediction_time'].dt.hour
    df['day_of_week'] = df['prediction_time'].dt.dayofweek # Monday=0, Sunday=6
    df['month'] = df['prediction_time'].dt.month

# --- 3. HANDLE THE 'COMMUNITY' PROBLEM ---
# Strategy 1: Use District (only 3 categories, likely safe)
# It's already there! We will use 'district'.

# Strategy 2: Create a coarser grouping from the first word of the community name,
# so that test communities sharing a prefix with a training community share a group.
def _first_word(name):
    """Return the first whitespace-separated token of `name`, or 'Unknown'.

    Non-string values (e.g. NaN) and empty / whitespace-only strings have no
    first word; they map to 'Unknown' instead of raising IndexError.
    """
    if not isinstance(name, str):
        return 'Unknown'
    tokens = name.split()
    return tokens[0] if tokens else 'Unknown'


for df in [train_df, test_df]:
    df['region'] = df['community'].apply(_first_word)

# Let's see what we created for 'region'
print("Train 'region' value counts:")
print(train_df['region'].value_counts())
print("\nTest 'region' value counts:")
print(test_df['region'].value_counts())

# --- 4. HANDLE THE 'user_id' PROBLEM ---
# Replace the raw id with how often that user appears in the TRAINING data.
user_frequency = train_df['user_id'].value_counts().to_dict()
# Map this frequency to both train and test. Users that only appear in test get 0.
train_df['user_freq'] = train_df['user_id'].map(user_frequency)
test_df['user_freq'] = test_df['user_id'].map(user_frequency).fillna(0) # New users get 0

print("\nNew features added successfully. Ready for the next step.")

Train 'region' value counts:
region
Assin        1627
Akwaduuso    1431
FOSO         1179
Asamama      1139
Akropong     1077
odumasi       849
Tumfa         624
Kwabeng       521
Awenare       457
Abomosu       434
Mampamhwe     387
akwaduuso     250
ASSIN         219
Banso         148
mouso         123
Atonsu        111
Asunafo       110
Amonom         96
Apampatia      56
Foso           50
assin          34
asunafo         6
Name: count, dtype: int64

Test 'region' value counts:
region
Akwaduuso    774
odumasi      419
ASSIN        287
Asamama      199
Akropong     197
Tumfa        164
Assin        145
Mampamhwe    125
Awenare       76
Banso         76
Kwabeng       51
Asunafo       51
FOSO          46
Abomosu       41
Asonkore      20
jimiso        19
ODUMASI       12
Amonom        10
mouso          8
Atonsu         5
Domeabra       4
Dompim         3
Name: count, dtype: int64

New features added successfully. Ready for the next step.


In [None]:
# Normalise casing so spelling variants such as 'ASSIN' / 'Assin' / 'assin' become one region
for frame in (train_df, test_df):
    frame['region'] = frame['region'].str.lower()

# Confirm the merge by listing the distinct regions in each split
for split_name, frame in (("TRAIN", train_df), ("TEST", test_df)):
    print(f"Unique regions in {split_name} set:", frame['region'].unique())

Unique regions in TRAIN set: ['tumfa' 'kwabeng' 'akropong' 'asamama' 'akwaduuso' 'banso' 'awenare'
 'mouso' 'abomosu' 'foso' 'amonom' 'asunafo' 'apampatia' 'assin' 'odumasi'
 'mampamhwe' 'atonsu']
Unique regions in TEST set: ['assin' 'akwaduuso' 'awenare' 'amonom' 'asunafo' 'odumasi' 'mampamhwe'
 'akropong' 'foso' 'kwabeng' 'asamama' 'jimiso' 'asonkore' 'dompim'
 'atonsu' 'tumfa' 'abomosu' 'mouso' 'banso' 'domeabra']


In [None]:
# Candidate model inputs ('district' and 'region' are still text at this point)
features_to_use = [
    'confidence',
    'predicted_intensity',
    'district',
    'forecast_length',
    'hour',
    'day_of_week',
    'month',
    'region',
    'user_freq',
]

feature_preview = train_df[features_to_use]

# First rows of the selected columns
print("Preview of our chosen features:")
print(feature_preview.head())

# dtypes reveal which of them still need encoding
print("\nData types of our features:")
print(feature_preview.dtypes)

Preview of our chosen features:
   confidence  predicted_intensity    district  forecast_length  hour  \
0         0.3                  0.0  atiwa_west               12    11   
1         0.3                  0.0  atiwa_west               12    11   
2         0.3                  0.0  atiwa_west               12    11   
3         0.3                  0.0  atiwa_west               12    11   
4         0.3                  0.0  atiwa_west               12    11   

   day_of_week  month    region  user_freq  
0            4      5     tumfa         55  
1            4      5   kwabeng        153  
2            4      5  akropong        100  
3            4      5   asamama       1130  
4            4      5   asamama       1130  

Data types of our features:
confidence             float64
predicted_intensity    float64
district                object
forecast_length          int64
hour                     int32
day_of_week              int32
month                    int32
region       

In [None]:
# Encode the text columns 'district' and 'region' as integer codes.
label_encoders = {} # Fitted encoders, keyed by column name, kept for later reuse

# For each categorical column, fit a LabelEncoder on the TRAIN data only and apply it
# to both TRAIN and TEST.
for column in ['district', 'region']:
    le = LabelEncoder()
    # Fit the encoder on the training data
    le.fit(train_df[column])
    # Transform the training data into a new '<column>_encoded' column
    train_df[column + '_encoded'] = le.transform(train_df[column])

    # Transform the test data with a single precomputed lookup instead of calling
    # le.transform() once per row. A category's code is its position in le.classes_,
    # and categories never seen in training get the sentinel -1.
    code_by_category = {category: code for code, category in enumerate(le.classes_)}
    test_df[column + '_encoded'] = (
        test_df[column].map(code_by_category).fillna(-1).astype('int64')
    )

    # Store the encoder for later if needed
    label_encoders[column] = le

# Let's check the result for the 'district' column
print("Original 'district' values:", train_df['district'].unique())
print("Encoded 'district' values:", train_df['district_encoded'].unique())
print("\nPreview of the new encoded columns:")
print(train_df[['district', 'district_encoded', 'region', 'region_encoded']].head())

Original 'district' values: ['atiwa_west' 'assin_fosu' 'obuasi_east']
Encoded 'district' values: [1 0 2]

Preview of the new encoded columns:
     district  district_encoded    region  region_encoded
0  atiwa_west                 1     tumfa              16
1  atiwa_west                 1   kwabeng              12
2  atiwa_west                 1  akropong               1
3  atiwa_west                 1   asamama               5
4  atiwa_west                 1   asamama               5


In [None]:
# Model inputs, now using the integer-encoded versions of 'district' and 'region'
final_features = [
    'confidence', 'predicted_intensity', 'forecast_length',
    'hour', 'day_of_week', 'month',
    'user_freq', 'district_encoded', 'region_encoded',
]

# Feature matrices for both splits, plus the (still textual) training target
X_train, X_test = train_df[final_features], test_df[final_features]
y_train = train_df['Target']  # e.g. 'NORAIN', 'SMALLRAIN', ...

# Sanity-check shapes and take a quick look at the first rows
print("Final Training Data Shape:", X_train.shape)
print("Final Test Data Shape:", X_test.shape)

print("\nFirst 2 rows of X_train:")
print(X_train.head(2))

print("\nFirst 2 rows of y_train:")
print(y_train.head(2))

Final Training Data Shape: (10928, 9)
Final Test Data Shape: (2732, 9)

First 2 rows of X_train:
   confidence  predicted_intensity  forecast_length  hour  day_of_week  month  \
0         0.3                  0.0               12    11            4      5   
1         0.3                  0.0               12    11            4      5   

   user_freq  district_encoded  region_encoded  
0         55                 1              16  
1        153                 1              12  

First 2 rows of y_train:
0    MEDIUMRAIN
1     HEAVYRAIN
Name: Target, dtype: object


In [None]:
# Turn the textual target into integer class codes
label_encoder_target = LabelEncoder()
y_train_encoded = label_encoder_target.fit_transform(y_train)

# Show which integer each class name was assigned
print("Target Class Mapping:")
for code, class_name in enumerate(label_encoder_target.classes_):
    print(f"  {class_name} -> {code}")

# Compare the first few labels before and after encoding
preview_count = 10
print("\nFirst 10 original y_train values:")
print(y_train.head(preview_count).values)
print("First 10 encoded y_train_encoded values:")
print(y_train_encoded[:preview_count])

Target Class Mapping:
  HEAVYRAIN -> 0
  MEDIUMRAIN -> 1
  NORAIN -> 2
  SMALLRAIN -> 3

First 10 original y_train values:
['MEDIUMRAIN' 'HEAVYRAIN' 'MEDIUMRAIN' 'HEAVYRAIN' 'HEAVYRAIN' 'HEAVYRAIN'
 'HEAVYRAIN' 'HEAVYRAIN' 'HEAVYRAIN' 'HEAVYRAIN']
First 10 encoded y_train_encoded values:
[1 0 1 0 0 0 0 0 0 0]


In [None]:
# 'balanced' weights: the fewer samples a class has, the larger its weight.
observed_classes = np.unique(y_train_encoded)
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=observed_classes,
    y=y_train_encoded,
)

# Models take the weights as {class_code: weight}
class_weight_dict = {code: weight for code, weight in enumerate(class_weights)}

print("Calculated Class Weights (to balance the data):")
for code, label in enumerate(("HEAVYRAIN", "MEDIUMRAIN", "NORAIN", "SMALLRAIN")):
    print(f"{label} ({code}):", class_weight_dict[code])

Calculated Class Weights (to balance the data):
HEAVYRAIN (0): 8.673015873015872
MEDIUMRAIN (1): 3.590013140604468
NORAIN (2): 0.2842280482729921
SMALLRAIN (3): 11.383333333333333


In [None]:
# Class-weighted decision tree fitted on the full training set as a first baseline
baseline_model = DecisionTreeClassifier(class_weight=class_weight_dict, random_state=42)
baseline_model.fit(X_train, y_train_encoded)

# Score it on the very data it was trained on (an optimistic first check only)
train_predictions = baseline_model.predict(X_train)
baseline_report = classification_report(
    y_train_encoded,
    train_predictions,
    target_names=label_encoder_target.classes_,
)

print("Baseline Model Performance on Training Data:")
print(baseline_report)

Baseline Model Performance on Training Data:
              precision    recall  f1-score   support

   HEAVYRAIN       0.67      1.00      0.80       315
  MEDIUMRAIN       0.85      0.91      0.88       761
      NORAIN       1.00      0.97      0.99      9612
   SMALLRAIN       0.86      1.00      0.92       240

    accuracy                           0.97     10928
   macro avg       0.84      0.97      0.90     10928
weighted avg       0.98      0.97      0.97     10928



In [None]:
# Stratified 80/20 split of the training data: fit on 80%, validate on the held-out 20%
X_temp, X_val, y_temp, y_val = train_test_split(
    X_train,
    y_train_encoded,
    test_size=0.2,
    stratify=y_train_encoded,
    random_state=42,
)

# Fresh decision tree that only ever sees the 80% subset
val_model = DecisionTreeClassifier(class_weight=class_weight_dict, random_state=42)
val_model.fit(X_temp, y_temp)

# Per-class precision / recall on the held-out rows
val_predictions = val_model.predict(X_val)
val_report = classification_report(
    y_val,
    val_predictions,
    target_names=label_encoder_target.classes_,
)

print("Model Performance on UNSEEN Validation Data:")
print(val_report)

Model Performance on UNSEEN Validation Data:
              precision    recall  f1-score   support

   HEAVYRAIN       0.64      0.97      0.77        63
  MEDIUMRAIN       0.76      0.86      0.81       152
      NORAIN       0.99      0.96      0.98      1923
   SMALLRAIN       0.83      0.90      0.86        48

    accuracy                           0.96      2186
   macro avg       0.80      0.92      0.85      2186
weighted avg       0.96      0.96      0.96      2186



In [None]:
# Shared LightGBM settings so the validation model and the final model are identical
# apart from the rows they are fitted on.
lgb_params = dict(
    random_state=42,
    class_weight=class_weight_dict, # Use our calculated weights
    n_estimators=200,
    learning_rate=0.05,
)

# 1. Honest validation: fit on the 80% subset only, then score the held-out 20%.
#    Scoring a model fitted on ALL of X_train against X_val would be leakage, because
#    X_val is a subset of X_train, and the "unseen" report would be inflated.
lgb_val_model = lgb.LGBMClassifier(**lgb_params)
lgb_val_model.fit(X_temp, y_temp)
lgb_val_predictions = lgb_val_model.predict(X_val)
print("\nLightGBM Performance on UNSEEN Validation Data:")
print(classification_report(y_val, lgb_val_predictions, target_names=label_encoder_target.classes_))

# 2. Final model for the submission: refit on ALL training data (X_train, y_train_encoded)
print("Training LightGBM model... This might take a minute.")
lgb_model = lgb.LGBMClassifier(**lgb_params)
lgb_model.fit(X_train, y_train_encoded)
print("Training complete!")

Training LightGBM model... This might take a minute.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 108
[LightGBM] [Info] Number of data points in the train set: 10928, number of used features: 9
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Training complete!

LightGBM Performance on UNSEEN Validation Data:
              precision    recall  f1-score   support

   HEAVYRAIN       0.64      1.00      0.78        63
  MEDIUMRAIN       0.80      0.90      0.85       152
      NORAIN       1.00      0.97      0.98      1923
   SMALLRAIN       0.87      1.00      0.93        48

    accuracy                           0

In [None]:
# Predict integer class codes for the test rows with the LightGBM model
test_predictions_encoded = lgb_model.predict(X_test)

# Decode back to the original string labels
# (0->'HEAVYRAIN', 1->'MEDIUMRAIN', 2->'NORAIN', 3->'SMALLRAIN')
test_predictions = label_encoder_target.inverse_transform(test_predictions_encoded)

# Submission frame: the test IDs next to the predicted label
submission_df = test_df[['ID']].assign(Target=test_predictions)

# Write it out without the index column
submission_file_path = 'my_zindi_submission.csv'
submission_df.to_csv(submission_file_path, index=False)

print("Submission file created successfully!")
print("\nPreview of your submission file:")
print(submission_df.head())
print(f"\nFile saved as: {submission_file_path}")

Submission file created successfully!

Preview of your submission file:
            ID  Target
0  ID_SbTdy_24  NORAIN
1  ID_SBKYz_24  NORAIN
2  ID_fAimg_24  NORAIN
3  ID_2wBqC_24  NORAIN
4  ID_NItox_24  NORAIN

File saved as: my_zindi_submission.csv


In [None]:
# Trigger a browser download of the LightGBM submission from the Colab VM.
from google.colab import files

first_submission_csv = 'my_zindi_submission.csv'
files.download(first_submission_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 1. Define a new, simpler set of features - DROP THE REGION FEATURE
simple_features = ['confidence', 'predicted_intensity', 'forecast_length', 'hour', 'day_of_week', 'month', 'user_freq', 'district_encoded']

# Create the final training set with simple features
X_train_simple = train_df[simple_features]
X_test_simple = test_df[simple_features]

# 2. Fit a class-weighted Random Forest on the simple feature set
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42, class_weight=class_weight_dict, n_estimators=100)
rf_model.fit(X_train_simple, y_train_encoded)

# 3. Predict on the test set and decode the integer codes back to label strings
test_predictions_encoded = rf_model.predict(X_test_simple)
test_predictions = label_encoder_target.inverse_transform(test_predictions_encoded)

# 4. Create a new submission file
submission_df2 = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': test_predictions
})

submission_file_path_2 = 'my_improved_zindi_submission.csv'
submission_df2.to_csv(submission_file_path_2, index=False)

print("Improved submission file created successfully!")
print("\nPreview of your NEW submission file:")
print(submission_df2.head(10))

# 5. Best-effort download when running in Colab.
#    `except Exception` (not a bare `except:`) keeps the fallback for "not in Colab /
#    download failed" while letting KeyboardInterrupt and SystemExit propagate.
try:
    from google.colab import files
    files.download(submission_file_path_2)
    print("\nDownloading the improved file now...")
except Exception:
    print(f"\nPlease check your folder for the file: '{submission_file_path_2}'")

Improved submission file created successfully!

Preview of your NEW submission file:
            ID  Target
0  ID_SbTdy_24  NORAIN
1  ID_SBKYz_24  NORAIN
2  ID_fAimg_24  NORAIN
3  ID_2wBqC_24  NORAIN
4  ID_NItox_24  NORAIN
5  ID_vUGbL_24  NORAIN
6  ID_4p4al_24  NORAIN
7  ID_iLnGS_24  NORAIN
8  ID_Kz9sM_24  NORAIN
9  ID_rPSVB_24  NORAIN


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Downloading the improved file now...


In [None]:
# --- STEP 1: Import necessary libraries ---
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# --- STEP 2: Define our "base models" for the ensemble ---
base_models = [
    ('random_forest', RandomForestClassifier(random_state=42, class_weight=class_weight_dict, n_estimators=100)),
    ('lightgbm', lgb.LGBMClassifier(random_state=42, class_weight=class_weight_dict))
]

# --- STEP 3: Define the "meta-learner" that will combine the base models ---
meta_learner = LogisticRegression(random_state=42, max_iter=1000)

# --- STEP 4: Create the Stacking Classifier ---
# cv=5: the meta-learner is trained on 5-fold cross-validated base-model predictions.
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_learner, cv=5)

# --- STEP 5: Train the ensemble on our SIMPLE feature set (without the problematic 'region') ---
print("Training the advanced Stacking Ensemble... This will take a few minutes.")
stacking_model.fit(X_train_simple, y_train_encoded)
print("Ensemble training complete!")

# --- STEP 6: Predict on the test set and decode the integer codes back to label strings ---
test_predictions_encoded = stacking_model.predict(X_test_simple)
test_predictions = label_encoder_target.inverse_transform(test_predictions_encoded)

# --- STEP 7: Create the new submission file ---
submission_df3 = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': test_predictions
})

submission_file_path_3 = 'my_ensemble_zindi_submission.csv'
submission_df3.to_csv(submission_file_path_3, index=False)

print("ENSEMBLE submission file created successfully!")
print("\nPreview of your ENSEMBLE submission file:")
print(submission_df3.head(10))

# --- STEP 8: Best-effort download when running in Colab ---
# `except Exception` (not a bare `except:`) keeps the fallback for "not in Colab /
# download failed" while letting KeyboardInterrupt and SystemExit propagate.
try:
    from google.colab import files
    files.download(submission_file_path_3)
    print("\nDownloading the ENSEMBLE file now...")
except Exception:
    print(f"\nPlease check your folder for the file: '{submission_file_path_3}'")

Training the advanced Stacking Ensemble... This will take a few minutes.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000477 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 10928, number of used features: 8
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000783 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 87
[LightGBM] [Info] Number of data points in the train set: 8742, number of used features: 8
[LightGBM] [

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Downloading the ENSEMBLE file now...


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# First, ensure we have the right features: Use the SIMPLE feature set
simple_features = ['confidence', 'predicted_intensity', 'forecast_length', 'hour', 'day_of_week', 'month', 'user_freq', 'district_encoded']
X_train_simple = train_df[simple_features]
X_test_simple = test_df[simple_features]

# Identify which of our features are categorical for CatBoost
categorical_features = ['district_encoded'] # The only encoded categorical in this feature set
# CatBoost is given the positions of those columns within the feature matrix
categorical_feature_indices = [i for i, col in enumerate(X_train_simple.columns) if col in categorical_features]

print("Categorical feature indices for CatBoost:", categorical_feature_indices)


# Now train CatBoost
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(
    random_state=42,
    auto_class_weights='Balanced', # Let CatBoost weight the classes to counter the imbalance
    verbose=100, # Log progress every 100 iterations
    iterations=500,
    cat_features=categorical_feature_indices # Tell it which features are categorical
)

print("Training CatBoost model...")
catboost_model.fit(X_train_simple, y_train_encoded)
print("CatBoost training complete!")

# Predict. For this multiclass model predict() yields a column-shaped (n, 1) array, which
# makes LabelEncoder.inverse_transform emit a column_or_1d DataConversionWarning;
# flatten it to 1-D before decoding.
test_predictions_encoded = catboost_model.predict(X_test_simple).ravel()
test_predictions = label_encoder_target.inverse_transform(test_predictions_encoded)

# Create submission
submission_df_catboost = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': test_predictions
})

submission_file_catboost = 'my_catboost_zindi_submission.csv'
submission_df_catboost.to_csv(submission_file_catboost, index=False)

print("CatBoost submission file created!")
print(submission_df_catboost.head())

# Best-effort download when running in Colab.
# `except Exception` (not a bare `except:`) keeps the fallback while letting
# KeyboardInterrupt and SystemExit propagate.
try:
    from google.colab import files
    files.download(submission_file_catboost)
except Exception:
    print(f"File saved: {submission_file_catboost}")

Categorical feature indices for CatBoost: [7]
Training CatBoost model...
Learning rate set to 0.160568
0:	learn: 1.1526310	total: 82.2ms	remaining: 41s
100:	learn: 0.1732306	total: 4.92s	remaining: 19.4s
200:	learn: 0.1044917	total: 14.6s	remaining: 21.6s
300:	learn: 0.0811860	total: 21.7s	remaining: 14.4s
400:	learn: 0.0703833	total: 28.9s	remaining: 7.14s
499:	learn: 0.0639010	total: 31.7s	remaining: 0us
CatBoost training complete!
CatBoost submission file created!
            ID  Target
0  ID_SbTdy_24  NORAIN
1  ID_SBKYz_24  NORAIN
2  ID_fAimg_24  NORAIN
3  ID_2wBqC_24  NORAIN
4  ID_NItox_24  NORAIN


  y = column_or_1d(y, warn=True)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Trigger a browser download of the CatBoost submission from the Colab VM.
from google.colab import files

catboost_submission_csv = 'my_catboost_zindi_submission.csv'
files.download(catboost_submission_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# --- 1. CREATE A KILLER FEATURE ---
# The farmers' 'predicted_intensity' might align roughly with our classes.
# Let's create a feature that groups their prediction into a category similar to our target.

# Define rules to map their number to our categories
def map_intensity_to_target(x):
    """Bucket a numeric `predicted_intensity` value into a rain-class label.

    Exactly 0.0 maps to 'NORAIN'. Any other value gets the label of the first
    upper bound it does not exceed (<= 0.33 'SMALLRAIN', <= 0.66 'MEDIUMRAIN');
    everything that passes neither bound is 'HEAVYRAIN'.
    """
    if x == 0.0:
        return 'NORAIN'

    # (inclusive upper bound, label), checked in ascending order
    upper_bounds = ((0.33, 'SMALLRAIN'), (0.66, 'MEDIUMRAIN'))
    for bound, label in upper_bounds:
        if x <= bound:
            return label

    return 'HEAVYRAIN'

# Bucket the numeric intensity into a rain-class label for both splits
for frame in (train_df, test_df):
    frame['farmer_prediction'] = frame['predicted_intensity'].apply(map_intensity_to_target)

# Integer-encode the bucket labels: the encoder is fitted on train and reused for test.
# transform() raises on a label it has not seen during fitting.
le_farmer = LabelEncoder()
le_farmer.fit(train_df['farmer_prediction'])
train_df['farmer_prediction_encoded'] = le_farmer.transform(train_df['farmer_prediction'])
test_df['farmer_prediction_encoded'] = le_farmer.transform(test_df['farmer_prediction'])

# --- 2. CREATE THE ULTIMATE FEATURE SET ---
# The simple feature set plus the encoded farmer-prediction bucket
ultimate_features = [
    'confidence', 'predicted_intensity', 'forecast_length',
    'hour', 'day_of_week', 'month',
    'user_freq', 'district_encoded', 'farmer_prediction_encoded',
]

X_train_ultimate, X_test_ultimate = train_df[ultimate_features], test_df[ultimate_features]

# --- 3. TRAIN A TUNED RANDOM FOREST ---
rf_settings = {
    'random_state': 42,
    'class_weight': class_weight_dict,
    'n_estimators': 150,
    'max_depth': 20,
    'min_samples_split': 5,
    'n_jobs': -1,  # use every available CPU core
}
tuned_rf = RandomForestClassifier(**rf_settings)
tuned_rf.fit(X_train_ultimate, y_train_encoded)

# --- 4. PREDICT AND SUBMIT ---
test_predictions_encoded = tuned_rf.predict(X_test_ultimate)
test_predictions = label_encoder_target.inverse_transform(test_predictions_encoded)

submission_df_ultimate = test_df[['ID']].assign(Target=test_predictions)

submission_file_ultimate = 'my_ultimate_zindi_submission.csv'
submission_df_ultimate.to_csv(submission_file_ultimate, index=False)

print("ULTIMATE submission file created!")
print(submission_df_ultimate.head())

# Trigger the browser download (Colab only)
from google.colab import files
files.download(submission_file_ultimate)

ULTIMATE submission file created!
            ID  Target
0  ID_SbTdy_24  NORAIN
1  ID_SBKYz_24  NORAIN
2  ID_fAimg_24  NORAIN
3  ID_2wBqC_24  NORAIN
4  ID_NItox_24  NORAIN


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy as np

# 1. Get prediction PROBABILITIES from two models on the test set.
#    Each model is fed the feature set it was trained on: the tuned Random Forest uses
#    the ULTIMATE features, CatBoost uses the SIMPLE features.

# Probabilities from the tuned Random Forest
rf_proba = tuned_rf.predict_proba(X_test_ultimate)

# Probabilities from the CatBoost model; re-select the columns so they arrive in the
# same order CatBoost saw during training.
X_test_simple_catboost = test_df[simple_features]
catboost_proba = catboost_model.predict_proba(X_test_simple_catboost)

# Averaging column by column is only meaningful when both models order their classes
# identically, so check that instead of assuming it.
blend_classes = np.asarray(tuned_rf.classes_)
if not np.array_equal(blend_classes, np.asarray(catboost_model.classes_)):
    raise ValueError("Random Forest and CatBoost disagree on class order; cannot average probabilities.")

# 2. Average the probabilities from the two models
blended_proba = (rf_proba + catboost_proba) / 2.0

# 3. Choose the class with the highest averaged probability.
#    argmax yields a COLUMN index; map it through classes_ to get the encoded label.
blended_predictions_encoded = blend_classes[np.argmax(blended_proba, axis=1)]
blended_predictions = label_encoder_target.inverse_transform(blended_predictions_encoded)

# 4. Create the final blended submission
submission_df_blended = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': blended_predictions
})

submission_file_blended = 'my_final_blended_submission.csv'
submission_df_blended.to_csv(submission_file_blended, index=False)

print("FINAL BLENDED submission file created!")
print(submission_df_blended.head())

# Download it
from google.colab import files
files.download(submission_file_blended)

FINAL BLENDED submission file created!
            ID  Target
0  ID_SbTdy_24  NORAIN
1  ID_SBKYz_24  NORAIN
2  ID_fAimg_24  NORAIN
3  ID_2wBqC_24  NORAIN
4  ID_NItox_24  NORAIN


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import TimeSeriesSplit

# `final_model` was never assigned anywhere in this notebook, so on a fresh kernel this
# cell failed with NameError. Bind it explicitly.
# NOTE(review): tuned_rf is the only model in this notebook fitted on X_train_ultimate;
# confirm it is the intended "best model".
final_model = tuned_rf


def apply_rare_class_thresholds(proba, thresh_heavy, thresh_medium, thresh_small):
    """Turn class probabilities into class codes using per-class thresholds.

    Columns of `proba` follow the target encoding used in this notebook
    (0=HEAVYRAIN, 1=MEDIUMRAIN, 2=NORAIN, 3=SMALLRAIN). Conditions are checked in
    priority order HEAVY -> SMALL -> MEDIUM; the first one met wins, and rows that
    meet none are predicted as NORAIN (2).

    Returns a 1-D integer array with one class code per row of `proba`.
    """
    proba = np.asarray(proba)
    return np.select(
        [
            proba[:, 0] >= thresh_heavy,
            proba[:, 3] >= thresh_small,
            proba[:, 1] >= thresh_medium,
        ],
        [0, 3, 1],
        default=2,
    )


# 1. Get the predicted probabilities from the model on the TRAINING data.
# NOTE(review): these are in-sample probabilities (the model was fitted on these rows),
# so the tuned thresholds and the reported Macro F1 are optimistic; prefer held-out or
# out-of-fold probabilities for tuning.
proba = final_model.predict_proba(X_train_ultimate) # [prob_class0, prob_class1, prob_class2, prob_class3]

# 2. Grid-search lower thresholds for the rare classes (HEAVYRAIN, MEDIUMRAIN, SMALLRAIN)
#    so they are easier to predict.
best_score = 0
best_thresholds = [0.25, 0.25, 0.25] # [HEAVY, MEDIUM, SMALL]; same layout as used below

for thresh_heavy in [0.1, 0.15, 0.2, 0.25]:   # Threshold for HEAVYRAIN (class 0)
    for thresh_medium in [0.2, 0.25, 0.3]:     # Threshold for MEDIUMRAIN (class 1)
        for thresh_small in [0.1, 0.15, 0.2]:  # Threshold for SMALLRAIN (class 3)

            new_predictions = apply_rare_class_thresholds(proba, thresh_heavy, thresh_medium, thresh_small)

            # Calculate the Macro F1 score with these thresholds
            score = f1_score(y_train_encoded, new_predictions, average='macro')

            # Keep the best combination found so far
            if score > best_score:
                best_score = score
                best_thresholds = [thresh_heavy, thresh_medium, thresh_small]
                print(f"New best score: {score:.4f} | thresholds: {best_thresholds}")

print(f"\nOptimization Complete. Best Macro F1: {best_score:.4f}")
print(f"Best thresholds [HEAVY, MEDIUM, SMALL]: {best_thresholds}")

# 3. Apply the optimized thresholds to the TEST set probabilities
test_proba = final_model.predict_proba(X_test_ultimate)
final_test_predictions = apply_rare_class_thresholds(test_proba, *best_thresholds)
test_predictions_optimized = label_encoder_target.inverse_transform(final_test_predictions)

# 4. Create the final, optimized submission
submission_optimized = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': test_predictions_optimized
})

submission_file_optimized = 'my_optimized_final_submission.csv'
submission_optimized.to_csv(submission_file_optimized, index=False)

print("\nOptimized submission file created!")
print(submission_optimized.head())

# Download it
from google.colab import files
files.download(submission_file_optimized)

New best score: 0.8852 | thresholds: [0.1, 0.2, 0.1]
New best score: 0.8902 | thresholds: [0.1, 0.2, 0.15]
New best score: 0.8924 | thresholds: [0.1, 0.2, 0.2]
New best score: 0.8926 | thresholds: [0.1, 0.25, 0.2]
New best score: 0.8927 | thresholds: [0.1, 0.3, 0.2]
New best score: 0.8931 | thresholds: [0.15, 0.2, 0.2]
New best score: 0.8933 | thresholds: [0.15, 0.25, 0.2]
New best score: 0.8934 | thresholds: [0.15, 0.3, 0.2]
New best score: 0.8935 | thresholds: [0.2, 0.25, 0.2]
New best score: 0.8937 | thresholds: [0.2, 0.3, 0.2]

Optimization Complete. Best Macro F1: 0.8937
Best thresholds [HEAVY, MEDIUM, SMALL]: [0.2, 0.3, 0.2]

Optimized submission file created!
            ID  Target
0  ID_SbTdy_24  NORAIN
1  ID_SBKYz_24  NORAIN
2  ID_fAimg_24  NORAIN
3  ID_2wBqC_24  NORAIN
4  ID_NItox_24  NORAIN


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Report the earliest and latest prediction_time in each split.
# `tail` reproduces the blank line that separates the two sections.
for heading, frame, tail in (
    ("Training Set Time Range:", train_df, "\n"),
    ("Test Set Time Range:", test_df, ""),
):
    timestamps = frame['prediction_time']
    print(heading)
    print(f"Start: {timestamps.min()}")
    print(f"End: {timestamps.max()}{tail}")

Training Set Time Range:
Start: 2025-05-30 11:09:33
End: 2025-07-20 19:23:03

Test Set Time Range:
Start: 2025-07-20 19:27:28
End: 2025-08-04 19:36:15
