<a href="https://colab.research.google.com/github/LadyJ101/dsa_project_folder/blob/main/Zindi3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Make Google Drive visible to this Colab runtime; the CSV paths used in the
# data-loading cell live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils import class_weight
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb

In [4]:
# Locations of the competition CSV files on the mounted Drive.
train_path = '/content/drive/MyDrive/Zindi proj/train.csv'
test_path = '/content/drive/MyDrive/Zindi proj/test.csv'

# Read both splits; the unpacking order matches the order of the paths.
train_df, test_df = (
    pd.read_csv(csv_location) for csv_location in (train_path, test_path)
)

In [5]:
# First five rows of the training split (rendered as the cell output).
train_df.head(n=5)

Unnamed: 0,ID,user_id,confidence,predicted_intensity,community,district,prediction_time,indicator,indicator_description,time_observed,Target,forecast_length
0,ID_KwcTp_12,11,0.3,0.0,Tumfa,atiwa_west,2025-05-30 11:09:33,,,,MEDIUMRAIN,12
1,ID_K9vWT_12,17,0.3,0.0,Kwabeng,atiwa_west,2025-05-30 11:09:35,,,,HEAVYRAIN,12
2,ID_AIQg3_12,19,0.3,0.0,Akropong,atiwa_west,2025-05-30 11:09:47,,,,MEDIUMRAIN,12
3,ID_px4yf_12,23,0.3,0.0,Asamama,atiwa_west,2025-05-30 11:16:33,,,,HEAVYRAIN,12
4,ID_QYYmK_12,23,0.3,0.0,Asamama,atiwa_west,2025-05-30 11:16:55,,,,HEAVYRAIN,12


In [6]:
# First five rows of the test split (rendered as the cell output).
test_df.head(n=5)

Unnamed: 0,ID,user_id,confidence,predicted_intensity,community,district,prediction_time,indicator,indicator_description,time_observed,forecast_length
0,ID_SbTdy_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:28,,,,24
1,ID_SBKYz_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:29,,,,24
2,ID_fAimg_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:30,,,,24
3,ID_2wBqC_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:31,,,,24
4,ID_NItox_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:32,,,,24


In [7]:
# --- Quick audit of the training split ---

# Columns that looked constant in the preview: list their distinct values.
for audited_column in ('confidence', 'predicted_intensity'):
    print(f"Unique values in '{audited_column}':", train_df[audited_column].unique())

# Per-column count of missing entries.
missing_per_column = train_df.isnull().sum()
print("\nNumber of missing values in each column:")
print(missing_per_column)

# Cardinality of the identifier / categorical columns.
print("\nNumber of unique users:", train_df['user_id'].nunique())
for plural_label, audited_column in (('communities', 'community'), ('districts', 'district')):
    print(f"Number of unique {plural_label}:", train_df[audited_column].nunique())

# How many rows fall into each target class.
target_counts = train_df['Target'].value_counts()
print("\nTarget Distribution:")
print(target_counts)

Unique values in 'confidence': [0.3 0.6 1. ]
Unique values in 'predicted_intensity': [0.   0.66 0.33 1.  ]

Number of missing values in each column:
ID                           0
user_id                      0
confidence                   0
predicted_intensity          0
community                    0
district                     0
prediction_time              0
indicator                10425
indicator_description    10582
time_observed            10856
Target                       0
forecast_length              0
dtype: int64

Number of unique users: 43
Number of unique communities: 38
Number of unique districts: 3

Target Distribution:
Target
NORAIN        9612
MEDIUMRAIN     761
HEAVYRAIN      315
SMALLRAIN      240
Name: count, dtype: int64


In [8]:
# --- Feature engineering applied to both the train and test splits ---

# Parse the timestamp strings so the .dt accessor can be used below.
train_df['prediction_time'] = pd.to_datetime(train_df['prediction_time'])
test_df['prediction_time'] = pd.to_datetime(test_df['prediction_time'])

# Drop the columns that are missing for most training rows.
# errors='ignore' is used on BOTH frames so that re-running this cell (when the
# columns are already gone) does not raise a KeyError.
cols_to_drop = ['indicator', 'indicator_description', 'time_observed']
train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')


def first_word_or_unknown(value):
    """Return the first whitespace-separated word of ``value``.

    Non-string values (e.g. NaN) and blank / whitespace-only strings yield
    'Unknown' instead of raising, since ``"".split()[0]`` would be an IndexError.
    """
    words = value.split() if isinstance(value, str) else []
    return words[0] if words else 'Unknown'


for frame in (train_df, test_df):
    # Calendar features derived from the prediction timestamp.
    frame['hour'] = frame['prediction_time'].dt.hour
    frame['day_of_week'] = frame['prediction_time'].dt.dayofweek  # Monday=0, Sunday=6
    frame['month'] = frame['prediction_time'].dt.month

    # Coarse location feature: the first word of the community name.
    # Casing is left untouched here; it is inspected in the printout below.
    frame['region'] = frame['community'].apply(first_word_or_unknown)

# What was created for 'region'
print("Train 'region' value counts:")
print(train_df['region'].value_counts())
print("\nTest 'region' value counts:")
print(test_df['region'].value_counts())

# Replace the raw user_id with how often each user appears in TRAIN.
user_frequency = train_df['user_id'].value_counts().to_dict()
train_df['user_freq'] = train_df['user_id'].map(user_frequency)
# Users that only appear in test have no training count and get 0; the cast
# keeps the column integer-typed like its train counterpart (fillna alone
# would leave it as float whenever an unseen user is present).
test_df['user_freq'] = test_df['user_id'].map(user_frequency).fillna(0).astype(int)

print("\nNew features added successfully. Ready for the next step.")

Train 'region' value counts:
region
Assin        1627
Akwaduuso    1431
FOSO         1179
Asamama      1139
Akropong     1077
odumasi       849
Tumfa         624
Kwabeng       521
Awenare       457
Abomosu       434
Mampamhwe     387
akwaduuso     250
ASSIN         219
Banso         148
mouso         123
Atonsu        111
Asunafo       110
Amonom         96
Apampatia      56
Foso           50
assin          34
asunafo         6
Name: count, dtype: int64

Test 'region' value counts:
region
Akwaduuso    774
odumasi      419
ASSIN        287
Asamama      199
Akropong     197
Tumfa        164
Assin        145
Mampamhwe    125
Awenare       76
Banso         76
Kwabeng       51
Asunafo       51
FOSO          46
Abomosu       41
Asonkore      20
jimiso        19
ODUMASI       12
Amonom        10
mouso          8
Atonsu         5
Domeabra       4
Dompim         3
Name: count, dtype: int64

New features added successfully. Ready for the next step.


In [9]:
# Region names differ only by casing in places (e.g. 'Assin' / 'ASSIN' / 'assin'),
# so fold everything to lowercase to merge those variants.
for frame in (train_df, test_df):
    frame['region'] = frame['region'].str.lower()

# Confirm the merge by listing the distinct regions per split.
for split_name, frame in (('TRAIN', train_df), ('TEST', test_df)):
    print(f"Unique regions in {split_name} set:", frame['region'].unique())

Unique regions in TRAIN set: ['tumfa' 'kwabeng' 'akropong' 'asamama' 'akwaduuso' 'banso' 'awenare'
 'mouso' 'abomosu' 'foso' 'amonom' 'asunafo' 'apampatia' 'assin' 'odumasi'
 'mampamhwe' 'atonsu']
Unique regions in TEST set: ['assin' 'akwaduuso' 'awenare' 'amonom' 'asunafo' 'odumasi' 'mampamhwe'
 'akropong' 'foso' 'kwabeng' 'asamama' 'jimiso' 'asonkore' 'dompim'
 'atonsu' 'tumfa' 'abomosu' 'mouso' 'banso' 'domeabra']


In [10]:
# Candidate model inputs (categoricals are still raw strings at this point).
features_to_use = [
    'confidence',
    'predicted_intensity',
    'district',
    'forecast_length',
    'hour',
    'day_of_week',
    'month',
    'region',
    'user_freq',
]

# Slice once, then show both a sample of rows and the column dtypes.
chosen_feature_frame = train_df[features_to_use]

print("Preview of our chosen features:")
print(chosen_feature_frame.head())

print("\nData types of our features:")
print(chosen_feature_frame.dtypes)

Preview of our chosen features:
   confidence  predicted_intensity    district  forecast_length  hour  \
0         0.3                  0.0  atiwa_west               12    11   
1         0.3                  0.0  atiwa_west               12    11   
2         0.3                  0.0  atiwa_west               12    11   
3         0.3                  0.0  atiwa_west               12    11   
4         0.3                  0.0  atiwa_west               12    11   

   day_of_week  month    region  user_freq  
0            4      5     tumfa         55  
1            4      5   kwabeng        153  
2            4      5  akropong        100  
3            4      5   asamama       1130  
4            4      5   asamama       1130  

Data types of our features:
confidence             float64
predicted_intensity    float64
district                object
forecast_length          int64
hour                     int32
day_of_week              int32
month                    int32
region       

In [11]:
# Integer-encode the string categoricals 'district' and 'region'.
# Encoders are fitted on TRAIN only and kept in this dict for later reuse.
label_encoders = {}

for column in ['district', 'region']:
    le = LabelEncoder()
    # Learn the category -> code mapping from the training data and apply it.
    train_df[column + '_encoded'] = le.fit_transform(train_df[column])

    # LabelEncoder assigns each category its position in `classes_`, so a plain
    # dict lookup reproduces `le.transform` without calling it once per row.
    code_by_category = {category: code for code, category in enumerate(le.classes_)}

    # Categories that never occurred in train have no code; mark them with -1.
    test_df[column + '_encoded'] = (
        test_df[column].map(code_by_category).fillna(-1).astype(int)
    )

    # Store the encoder for later if needed
    label_encoders[column] = le

# Sanity check: show the raw and encoded values side by side.
print("Original 'district' values:", train_df['district'].unique())
print("Encoded 'district' values:", train_df['district_encoded'].unique())
print("\nPreview of the new encoded columns:")
print(train_df[['district', 'district_encoded', 'region', 'region_encoded']].head())

Original 'district' values: ['atiwa_west' 'assin_fosu' 'obuasi_east']
Encoded 'district' values: [1 0 2]

Preview of the new encoded columns:
     district  district_encoded    region  region_encoded
0  atiwa_west                 1     tumfa              16
1  atiwa_west                 1   kwabeng              12
2  atiwa_west                 1  akropong               1
3  atiwa_west                 1   asamama               5
4  atiwa_west                 1   asamama               5


In [12]:
# Model inputs: numeric columns plus the integer-encoded categoricals.
final_features = [
    'confidence',
    'predicted_intensity',
    'forecast_length',
    'hour',
    'day_of_week',
    'month',
    'user_freq',
    'district_encoded',
    'region_encoded',
]

# Feature matrices for both splits, and the (still textual) training labels.
X_train, X_test = train_df[final_features], test_df[final_features]
y_train = train_df['Target']  # e.g. 'NORAIN', 'SMALLRAIN', ...

# Report shapes and a couple of rows as a sanity check.
print("Final Training Data Shape:", X_train.shape)
print("Final Test Data Shape:", X_test.shape)

print("\nFirst 2 rows of X_train:")
print(X_train.head(2))

print("\nFirst 2 rows of y_train:")
print(y_train.head(2))

Final Training Data Shape: (10928, 9)
Final Test Data Shape: (2732, 9)

First 2 rows of X_train:
   confidence  predicted_intensity  forecast_length  hour  day_of_week  month  \
0         0.3                  0.0               12    11            4      5   
1         0.3                  0.0               12    11            4      5   

   user_freq  district_encoded  region_encoded  
0         55                 1              16  
1        153                 1              12  

First 2 rows of y_train:
0    MEDIUMRAIN
1     HEAVYRAIN
Name: Target, dtype: object


In [13]:
# Turn the textual target into integer class codes.
label_encoder_target = LabelEncoder()
y_train_encoded = label_encoder_target.fit_transform(y_train)

# Show which integer each class name was assigned.
print("Target Class Mapping:")
target_class_names = label_encoder_target.classes_
for class_code in range(len(target_class_names)):
    print(f"  {target_class_names[class_code]} -> {class_code}")

# Compare the first few labels before and after encoding.
print("\nFirst 10 original y_train values:")
print(y_train.head(10).values)
print("First 10 encoded y_train_encoded values:")
print(y_train_encoded[:10])

Target Class Mapping:
  HEAVYRAIN -> 0
  MEDIUMRAIN -> 1
  NORAIN -> 2
  SMALLRAIN -> 3

First 10 original y_train values:
['MEDIUMRAIN' 'HEAVYRAIN' 'MEDIUMRAIN' 'HEAVYRAIN' 'HEAVYRAIN' 'HEAVYRAIN'
 'HEAVYRAIN' 'HEAVYRAIN' 'HEAVYRAIN' 'HEAVYRAIN']
First 10 encoded y_train_encoded values:
[1 0 1 0 0 0 0 0 0 0]


In [14]:
# Compute 'balanced' class weights so that rarer classes receive larger weights.
encoded_classes = np.unique(y_train_encoded)
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes=encoded_classes,
                                                 y=y_train_encoded)

# Key the weights by the actual encoded class label (not by list position) so
# the dict stays correct even if the labels were not a contiguous 0..n-1 range.
class_weight_dict = {
    int(class_code): weight for class_code, weight in zip(encoded_classes, class_weights)
}

# Print one line per class; names come from the fitted target encoder instead
# of being hard-coded, so the labels cannot drift from the real mapping.
print("Calculated Class Weights (to balance the data):")
for class_code in class_weight_dict:
    class_name = label_encoder_target.inverse_transform([class_code])[0]
    print(f"{class_name} ({class_code}):", class_weight_dict[class_code])

Calculated Class Weights (to balance the data):
HEAVYRAIN (0): 8.673015873015872
MEDIUMRAIN (1): 3.590013140604468
NORAIN (2): 0.2842280482729921
SMALLRAIN (3): 11.383333333333333


In [15]:
# Baseline: a class-weighted decision tree fitted on the full training set.
# (`fit` returns the estimator itself, so construction and fitting are chained.)
baseline_model = DecisionTreeClassifier(
    class_weight=class_weight_dict,
    random_state=42,
).fit(X_train, y_train_encoded)

# Score on the very data it was trained on -- only a first sanity check, not an
# estimate of performance on unseen data.
train_predictions = baseline_model.predict(X_train)
baseline_train_report = classification_report(
    y_train_encoded,
    train_predictions,
    target_names=label_encoder_target.classes_,
)

print("Baseline Model Performance on Training Data:")
print(baseline_train_report)

Baseline Model Performance on Training Data:
              precision    recall  f1-score   support

   HEAVYRAIN       0.67      1.00      0.80       315
  MEDIUMRAIN       0.85      0.91      0.88       761
      NORAIN       1.00      0.97      0.99      9612
   SMALLRAIN       0.86      1.00      0.92       240

    accuracy                           0.97     10928
   macro avg       0.84      0.97      0.90     10928
weighted avg       0.98      0.97      0.97     10928



In [16]:
# Hold out 20% of the training rows, stratified by class so each class keeps
# its proportion in both parts.
X_temp, X_val, y_temp, y_val = train_test_split(
    X_train,
    y_train_encoded,
    test_size=0.2,
    stratify=y_train_encoded,
    random_state=42,
)

# Fit a fresh tree on the 80% part only.
val_model = DecisionTreeClassifier(
    class_weight=class_weight_dict,
    random_state=42,
).fit(X_temp, y_temp)

# Evaluate on the 20% the tree has not seen.
val_predictions = val_model.predict(X_val)
validation_report = classification_report(
    y_val,
    val_predictions,
    target_names=label_encoder_target.classes_,
)

print("Model Performance on UNSEEN Validation Data:")
print(validation_report)

Model Performance on UNSEEN Validation Data:
              precision    recall  f1-score   support

   HEAVYRAIN       0.64      0.97      0.77        63
  MEDIUMRAIN       0.76      0.86      0.81       152
      NORAIN       0.99      0.96      0.98      1923
   SMALLRAIN       0.83      0.90      0.86        48

    accuracy                           0.96      2186
   macro avg       0.80      0.92      0.85      2186
weighted avg       0.96      0.96      0.96      2186



In [17]:
# --- Random Forest on a reduced feature set, then write the submission file ---

# Simpler feature set: the encoded region is deliberately left out.
simple_features = ['confidence', 'predicted_intensity', 'forecast_length', 'hour', 'day_of_week', 'month', 'user_freq', 'district_encoded']

X_train_simple = train_df[simple_features]
X_test_simple = test_df[simple_features]

# Class-weighted Random Forest, trained on all training rows.
# (RandomForestClassifier comes from the notebook's imports cell; it is not
# re-imported here.)
rf_model = RandomForestClassifier(random_state=42, class_weight=class_weight_dict, n_estimators=100)
rf_model.fit(X_train_simple, y_train_encoded)

# Predict the test split and map the integer codes back to class names.
test_predictions_encoded = rf_model.predict(X_test_simple)
test_predictions = label_encoder_target.inverse_transform(test_predictions_encoded)

# Assemble and save the submission (one row per test ID).
submission_df2 = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': test_predictions
})

submission_file_path_2 = 'my_improved_zindi_submission.csv'
submission_df2.to_csv(submission_file_path_2, index=False)

print("Improved submission file created successfully!")
print("\nPreview of your NEW submission file:")
print(submission_df2.head(10))

# Offer a browser download when running inside Colab. Only the failures we
# expect are handled, so Ctrl-C / kernel interrupts are no longer swallowed the
# way a bare `except:` would.
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the CSV is already saved in the working directory.
    print(f"\nPlease check your folder for the file: '{submission_file_path_2}'")
else:
    try:
        files.download(submission_file_path_2)
        print("\nDownloading the improved file now...")
    except Exception as download_error:
        # Best effort: the file itself was written above, so report and move on.
        print(f"\nAutomatic download failed: {download_error}")
        print(f"\nPlease check your folder for the file: '{submission_file_path_2}'")

Improved submission file created successfully!

Preview of your NEW submission file:
            ID  Target
0  ID_SbTdy_24  NORAIN
1  ID_SBKYz_24  NORAIN
2  ID_fAimg_24  NORAIN
3  ID_2wBqC_24  NORAIN
4  ID_NItox_24  NORAIN
5  ID_vUGbL_24  NORAIN
6  ID_4p4al_24  NORAIN
7  ID_iLnGS_24  NORAIN
8  ID_Kz9sM_24  NORAIN
9  ID_rPSVB_24  NORAIN


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Downloading the improved file now...


In [18]:
# Compare the time windows covered by the two splits.
train_times = train_df['prediction_time']
test_times = test_df['prediction_time']

print("Training Set Time Range:")
print(f"Start: {train_times.min()}")
print(f"End: {train_times.max()}\n")

print("Test Set Time Range:")
print(f"Start: {test_times.min()}")
print(f"End: {test_times.max()}")

Training Set Time Range:
Start: 2025-05-30 11:09:33
End: 2025-07-20 19:23:03

Test Set Time Range:
Start: 2025-07-20 19:27:28
End: 2025-08-04 19:36:15
