<a href="https://colab.research.google.com/github/NdumisoButhelezi/00-Login/blob/master/ZINDI_fILE_DISTRIBUTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import string
import random

# -----------------------------
# 1️⃣ Load data from Google Sheet
# -----------------------------
# Share link of the source spreadsheet, exactly as copied from Google Drive.
sheet_url = 'https://docs.google.com/spreadsheets/d/14m0YbtfwWO81bE-zHsFdfrvRMB_Rg_lh/edit?usp=drive_link&ouid=116988699228228254119&rtpof=true&sd=true'

# Build the CSV export endpoint from the document ID (the path segment after
# '/d/') instead of replacing one exact query string: a str.replace on the
# query silently does nothing when the share link differs, and read_csv would
# then be pointed at the HTML edit page.
_, _, _after_marker = sheet_url.partition('/d/')
sheet_id = _after_marker.split('/')[0].split('?')[0]
if not sheet_id:
    raise ValueError("Could not find a spreadsheet ID in sheet_url: " + sheet_url)
url_csv = 'https://docs.google.com/spreadsheets/d/' + sheet_id + '/export?format=csv'

df = pd.read_csv(url_csv)
print("Data loaded successfully!")
display(df.head())

# -----------------------------
# 2️⃣ Generate random Zindi-style IDs
# -----------------------------
def generate_id(n, length=6):
    """Return a random Zindi-style identifier such as ``ID_A1B2C3``.

    Args:
        n: Not used when building the ID. It is kept so that existing calls
            of the form ``generate_id(i)`` keep working.
        length: Number of random characters after the ``ID_`` prefix.
            Defaults to 6.

    Returns:
        A string made of ``"ID_"`` followed by ``length`` characters drawn
        from uppercase ASCII letters and digits.

    Note:
        Characters come from the module-level ``random`` generator, so the
        result is repeatable only if that generator has been seeded. Nothing
        here prevents two calls from returning the same ID; callers that need
        unique IDs must check for duplicates themselves.
    """
    alphabet = string.ascii_uppercase + string.digits
    return "ID_" + ''.join(random.choices(alphabet, k=length))

# Reference.csv and SampleSubmission.csv are keyed on ID, so every row needs a
# distinct value. generate_id draws characters at random and can repeat, so a
# candidate that has already been used is discarded and drawn again.
# Seeding first makes the IDs repeat from run to run, in line with the fixed
# random_state used for the train/test split.
random.seed(42)
seen_ids = set()
row_ids = []
while len(row_ids) < len(df):
    candidate = generate_id(len(row_ids))
    if candidate not in seen_ids:
        seen_ids.add(candidate)
        row_ids.append(candidate)
df['ID'] = row_ids

# -----------------------------
# 3️⃣ Split Train/Test (70/30)
# -----------------------------
# 70% of the rows go to train; random_state pins the sample so the same rows
# are picked on every run.
train = df.sample(frac=0.7, random_state=42)

# Positional convention: the column just before the appended 'ID' column is
# treated as the target.
# NOTE(review): with the columns shown in this notebook's output, this
# resolves to 'Created At' -- confirm that is the intended target (rather
# than, e.g., 'Attendees').
target_col = df.columns[-2]

# Everything not sampled into train is held out for testing.
held_out = df.drop(train.index)

# Keep the held-out labels (ID + target) aside, then publish the held-out
# rows without the target column.
test_target = held_out[['ID', target_col]].copy()
test = held_out.drop(columns=[target_col])

# -----------------------------
# 4️⃣ Create Reference and SampleSubmission files
# -----------------------------
# Reference: the held-out IDs together with their true target values.
reference = test_target.copy()

# SampleSubmission: same IDs and column layout, with the target replaced by a
# float placeholder of 0.0 (np.nan would be the alternative placeholder).
sample_submission = test_target.assign(**{target_col: 0.0})

# -----------------------------
# 5️⃣ Generate VariableDefinitions.csv automatically
# -----------------------------
# Keyword -> description rules, checked in order; the first rule that matches
# a column name wins.
_DESCRIPTION_RULES = [
    (('id',), "Unique ID for each row"),
    (('name',), "Name of the entity"),
    (('number', 'count'), "Numerical count"),
    (('type',), "Type or category"),
    (('date',), "Date information"),
    (('target',), "Value to predict (target)"),
    (('score',), "Score value"),
]


def _name_words(col):
    """Split a column label into a set of lowercase words.

    Non-alphanumeric characters and lower-to-upper case changes (as in
    ``userId``) act as word boundaries. The label is passed through ``str``
    first, so non-string labels are accepted.
    """
    spaced = []
    prev = ''
    for ch in str(col):
        if not ch.isalnum():
            spaced.append(' ')
        else:
            if ch.isupper() and prev.islower():
                spaced.append(' ')
            spaced.append(ch.lower())
        prev = ch
    return set(''.join(spaced).split())


def _describe_column(col, target=None):
    """Return the auto-generated description for one column label.

    The column equal to ``target`` is always described as the value to
    predict. Otherwise the label is matched word by word against
    ``_DESCRIPTION_RULES`` (a keyword also matches its simple plural, e.g.
    ``ids``), so a keyword buried inside a longer word ('id' in "Paid",
    'count' in "Country") no longer triggers a rule. Labels that match no
    rule are described as a generic feature.
    """
    if target is not None and col == target:
        return "Value to predict (target)"
    words = _name_words(col)
    for keywords, description in _DESCRIPTION_RULES:
        if any(kw in words or kw + 's' in words for kw in keywords):
            return description
    return "Feature variable"


# One description per column, in column order; target_col is labelled as the
# target so this file agrees with Reference.csv and SampleSubmission.csv.
variable_descriptions = [_describe_column(col, target_col) for col in df.columns]

var_defs = pd.DataFrame({
    "Variable Name": df.columns,
    "Description": variable_descriptions
})

# -----------------------------
# 6️⃣ Save all files to CSV
# -----------------------------
# (file name, frame) pairs, written in this order; index=False keeps the
# pandas row index out of every file.
competition_files = [
    ("Train.csv", train),
    ("Test.csv", test),
    ("Reference.csv", reference),
    ("SampleSubmission.csv", sample_submission),
    ("VariableDefinitions.csv", var_defs),
]
for file_name, frame in competition_files:
    frame.to_csv(file_name, index=False)

print("✅ All Zindi competition files generated successfully with auto descriptions!")


Data loaded successfully!


Unnamed: 0,Booking ID,Event Name,Club,Venue,Campus,Date,Start Time,End Time,Status,Attendees,Created At
0,AROTCJwNJeUsOMQV2EFT,Prayer meeting,First Love Church,Ritson Campus DC1010,Durban,2025-10-03,18:00,19:00,confirmed,40,2025-09-05 08:39
1,jdwBUiRDdh0f6QH1gGNH,Prayer meeting,First Love Church,DC1012,Durban,2025-09-29,18:00,19:00,confirmed,40,2025-09-05 08:37
2,xl0OfpOcp8BdpLwUZkYZ,Prayer meeting,First Love Church,DC1012,Durban,2025-09-22,18:00,19:00,confirmed,40,2025-09-05 08:37
3,E6MajSK4qBB3xGlePZrp,Prayer meeting,First Love Church,DC1012,Durban,2025-09-15,18:00,19:00,confirmed,50,2025-09-05 08:35
4,4pz9GWByJJdwhTY7AT60,Sunday Service,First Love Church,Ritson Campus DC1010,Durban,2025-10-05,11:00,14:00,confirmed,50,2025-09-05 08:33


✅ All Zindi competition files generated successfully with auto descriptions!
