# In [None]:  (Jupyter notebook cell marker left over from export)
import argparse
import pandas as pd
import numpy as np
import joblib
from matplotlib import pyplot as plot
#from sklearn.linear_model import train_test_split as tts
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import precision_recall_curve, average_precision_score
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV

# Load dataset
def load_data(file_path):
    """Load the training CSV and drop the columns that are not used for modelling.

    Args:
        file_path: Path or URL of the training CSV. When falsy (``None`` or
            an empty string) the hackathon training URL is used instead.

    Returns:
        pandas.DataFrame: The loaded data without the ``LeadID``,
        ``InFinanceProcessSystemApp``, ``FinanceApplied`` and
        ``FinanceApproved`` columns.

    Raises:
        KeyError: If any of the columns to drop is missing from the CSV.
    """
    default_url = "https://www.mxhackathon.co.za/docs/TrainData.csv"

    # Read from the location the caller supplied so a local copy or a
    # different URL can be used; fall back to the hosted file otherwise.
    dataframeTrain = pd.read_csv(file_path or default_url)

    # Quick numeric overview of the raw data, one row per column.
    print(dataframeTrain.describe().T)

    dropped_columns = [
        'LeadID',
        'InFinanceProcessSystemApp',
        'FinanceApplied',
        'FinanceApproved',
    ]
    return dataframeTrain.drop(columns=dropped_columns)

def analyze_missing(dataframeTrain):
    """Print a per-column report of missing values.

    Only columns with at least one missing entry are listed, ordered from the
    highest to the lowest percentage of missing rows. Nothing is returned.

    Args:
        dataframeTrain: The DataFrame to inspect; it is not modified.
    """
    total_rows = len(dataframeTrain)
    null_counts = dataframeTrain.isna().sum()

    report = pd.DataFrame({
        'MissingCount': null_counts,
        'MissingPercentage': (null_counts / total_rows) * 100,
    })

    # Keep only the columns that actually have gaps, worst offenders first.
    has_gaps = report['MissingCount'] > 0
    report = report.loc[has_gaps].sort_values(by='MissingPercentage', ascending=False)

    print("Missing Value Summary:")
    print(report)

def clean_missing_values(dataframeTrain):
    """Return a cleaned copy of the frame with missing values dropped or filled.

    Steps, in order:
      1. Drop columns in which fewer than half of the rows are populated.
      2. Drop rows that lack any of the key contact fields still present.
      3. Fill remaining gaps: ``'Unknown'`` for text columns, the column
         median for NumPy integer/float columns.

    Args:
        dataframeTrain: The raw DataFrame. It is not modified.

    Returns:
        pandas.DataFrame: A new, cleaned DataFrame.
    """
    # ``thresh`` is the minimum number of non-NA values a column needs to be
    # kept; ceil(n / 2) keeps every column that is at least 50% populated.
    min_non_null = (len(dataframeTrain) + 1) // 2
    cleaned = dataframeTrain.dropna(thresh=min_non_null, axis=1)

    # Drop rows where key fields are missing (only those that survived step 1).
    required_fields = ['OBSFullName', 'OBSEmail', 'Domain', 'CellPrefix', 'CellPhoneNoLength']
    existing_fields = [field for field in required_fields if field in cleaned.columns]
    if existing_fields:
        cleaned = cleaned.dropna(subset=existing_fields)

    # Work on an explicit copy so the fills below can never write through to
    # the caller's frame or trigger chained-assignment warnings.
    cleaned = cleaned.copy()

    for col in cleaned.columns:
        series = cleaned[col]
        if not series.isna().any():
            continue

        dtype = series.dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            # Covers both classic object columns and pandas string dtypes.
            cleaned[col] = series.fillna('Unknown')
        elif isinstance(dtype, np.dtype) and dtype.kind in 'iuf':
            # Any NumPy int/uint/float width, not just the 64-bit variants.
            cleaned[col] = series.fillna(series.median())

    return cleaned


def train_ensemble_model(dataframeTrain):
    """Train a three-model soft-voting ensemble and persist it to disk.

    The ensemble averages the positive-class probabilities of a calibrated
    random forest, a scaled logistic regression and a calibrated gradient
    boosting classifier. Metrics are computed on a 20% hold-out split.

    Args:
        dataframeTrain: Cleaned training frame containing the raw lead columns
            consumed by ``preprocess_data`` plus the ``VehicleSold`` target.
            The frame is modified in place by ``preprocess_data``.

    Returns:
        tuple: ``(cal_rf, model_lr, cal_gb, best_thresh)`` - the three fitted
        models and the hold-out threshold that maximises F1 for the averaged
        probability.

    Side effects:
        Prints PR-AUC, log loss and the chosen threshold, and writes
        ``mymodel.pkl`` with keys ``'rf'``, ``'lr'``, ``'cb'`` and
        ``'threshold'``.
    """
    # Feature engineering is shared with inference so both paths always build
    # the same columns in the same order.
    # NOTE(review): the label encoders fitted inside preprocess_data are not
    # stored in the pickle, so the integer codes seen here are not guaranteed
    # to match the codes produced when another frame is preprocessed.
    X = preprocess_data(dataframeTrain)
    y = dataframeTrain['VehicleSold']

    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)

    # CalibratedClassifierCV with an integer ``cv`` clones the base estimator
    # and fits one copy per fold, so the base estimators are passed unfitted;
    # fitting them beforehand would only repeat the training work.
    cal_rf = CalibratedClassifierCV(
        RandomForestClassifier(n_estimators=500, random_state=42, class_weight='balanced'),
        method='sigmoid',
        cv=5,
    )
    cal_rf.fit(X_train, y_train)

    # Logistic regression needs scaled inputs; the pipeline keeps the scaler
    # and the classifier together for inference.
    model_lr = Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(max_iter=1100, class_weight='balanced'))
    ])
    model_lr.fit(X_train, y_train)

    cal_gb = CalibratedClassifierCV(
        GradientBoostingClassifier(n_estimators=500, random_state=42),
        method='sigmoid',
        cv=5,
    )
    cal_gb.fit(X_train, y_train)

    # Combine predictions (unweighted average of positive-class probability).
    prob_rf = cal_rf.predict_proba(X_test)[:, 1]
    prob_lr = model_lr.predict_proba(X_test)[:, 1]
    prob_gb = cal_gb.predict_proba(X_test)[:, 1]
    prob_avg = (prob_rf + prob_lr + prob_gb) / 3

    # Evaluate PR-AUC
    pr_auc = average_precision_score(y_test, prob_avg)
    print(f"Combined PR-AUC: {pr_auc:.4f}")

    # Compute log loss
    logloss_value = log_loss(y_test, prob_avg)
    print(f"Log Loss: {logloss_value:.4f}")

    precision, recall, thresholds = precision_recall_curve(y_test, prob_avg)
    # precision/recall have one more entry than thresholds (the final
    # precision=1, recall=0 point has no threshold). Drop it so argmax is
    # always a valid index into ``thresholds``.
    f1 = 2 * (precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-10)
    best_thresh = thresholds[np.argmax(f1)]
    print(f"Best threshold (ensemble): {best_thresh:.2f}")

    # Save models and threshold ('cb' holds the calibrated gradient boosting
    # model; the key name is what the prediction code reads).
    joblib.dump({'rf': cal_rf, 'lr': model_lr, 'cb': cal_gb, 'threshold': best_thresh}, 'mymodel.pkl')

    return cal_rf, model_lr, cal_gb, best_thresh


def preprocess_data(dataframeTrain):
    """Build the model feature matrix from a raw leads frame.

    The frame is modified in place: ``CustomerID``, ``DayOfEnquiry`` and
    ``HourOfEnquiry`` are overwritten and the derived columns are appended.

    NOTE(review): every call fits fresh ``LabelEncoder`` instances on the frame
    it receives, so integer codes depend on the values present in that frame.

    Args:
        dataframeTrain: Frame holding the raw lead columns referenced below.

    Returns:
        pandas.DataFrame: The 15 feature columns, in the order the models use.
    """
    frame = dataframeTrain  # alias only; all writes land on the caller's frame

    def encode(column):
        # Integer-code one column with a freshly fitted label encoder.
        return LabelEncoder().fit_transform(frame[column])

    def group_size(column):
        # Number of rows sharing each row's value in ``column``.
        return frame.groupby(column)[column].transform('count')

    frame['CustomerID'] = encode('CustomerID')
    # Using the count rather than the id increases PR-AUC.
    frame['CustomerIDCount'] = group_size('CustomerID')

    # How many leads a dealer handles (popularity or reach of the dealer).
    frame['DealerEnquiryVolume'] = group_size('Dealer')

    for source, target in (
        ('DTLeadCreated', 'DTLeadCreatedEnc'),
        ('DTLeadAllocated', 'DTLeadAllocatedEnc'),
        ('Dealer', 'DealerEnc'),
        ('LeadSource', 'LeadSourceEnc'),
        ('LeadType', 'LeadTypeEnc'),
        ('Seek', 'SeekEnc'),
    ):
        frame[target] = encode(source)

    # Flag leads whose source name matches a high-intent platform
    # (example: Cars.co.za, Autotrader).
    is_high_intent = frame['LeadSource'].str.contains(
        'autotrader|cars|motus|kia|hyundai|autopedigree', case=False, na=False)
    frame['HighIntentSource'] = is_high_intent.astype(int)

    for source, target in (
        ('InterestModel', 'InterestModelEnc'),
        ('OBSFullName', 'OBSFullNameEnc'),
        ('OBSEmail', 'OBSEmailEnc'),
        ('CellPrefix', 'CellPrefixEncoded'),
        ('Domain', 'DomainCode'),
    ):
        frame[target] = encode(source)

    # Parse the enquiry date/time columns so the .dt accessors work.
    for column in ('DayOfEnquiry', 'HourOfEnquiry'):
        frame[column] = pd.to_datetime(frame[column])

    # Day of the month (1 to 31) and hour of the enquiry.
    frame['EnquiryDayOfMonth'] = frame['DayOfEnquiry'].dt.day
    frame['TimeHourOfEnquiry'] = frame['HourOfEnquiry'].dt.hour

    # Days elapsed since this customer's earliest enquiry in the frame.
    earliest = frame.groupby('CustomerID')['DayOfEnquiry'].transform('min')
    frame['DaysSinceFirstEnquiry'] = (frame['DayOfEnquiry'] - earliest).dt.days

    # Final feature set
    feature_columns = [
        'CustomerID', 'CustomerIDCount', 'DealerEnquiryVolume',
        'DTLeadCreatedEnc', 'DTLeadAllocatedEnc',
        'DealerEnc', 'DaysSinceFirstEnquiry', 'LeadSourceEnc', 'LeadTypeEnc', 'SeekEnc',
        'InterestModelEnc',
        'CellPrefixEncoded', 'TimeHourOfEnquiry',
        'EnquiryDayOfMonth', 'HighIntentSource',
    ]
    return frame[feature_columns]

def predict_from_csv(mymodel, test_csv_path):
    """Score a leads CSV with a single model and write ``LeadPredictions.csv``.

    Args:
        mymodel: Fitted estimator exposing ``predict_proba``.
        test_csv_path: Path or URL of the CSV to score. When falsy, the
            hackathon test URL is used instead.

    Side effects:
        Writes ``LeadPredictions.csv`` (columns ``LeadID`` and
        ``VehicleSoldProbability``, highest probability first) to the current
        directory and prints a confirmation message.
    """
    URL_TEST = "https://www.mxhackathon.co.za/docs/TestData.csv"

    df_test_raw = pd.read_csv(test_csv_path or URL_TEST)

    # preprocess_data overwrites columns in place, so hand it a copy and keep
    # the raw frame for the original LeadID values.
    X_test = preprocess_data(df_test_raw.copy())

    # Probability of the positive class (vehicle sold).
    probabilities = mymodel.predict_proba(X_test)[:, 1]

    # The column name has no trailing space, so the header matches the one
    # written by save_submission.
    results_df = pd.DataFrame({
        'LeadID': df_test_raw['LeadID'],  # <-- original values
        'VehicleSoldProbability': probabilities.round(4),
    })
    # Sorting the predictions
    results_df = results_df.sort_values(by='VehicleSoldProbability', ascending=False)

    # Save to CSV
    results_df.to_csv('LeadPredictions.csv', index=False)

    print("Saved original LeadID and VehicleSoldProbability to 'LeadPredictions.csv'")


def predict_ensemble(model_dict, test_csv_path):
    """Score a leads CSV with the saved ensemble.

    Args:
        model_dict: Mapping with fitted models under the keys ``'rf'``,
            ``'lr'`` and ``'cb'``, each exposing ``predict_proba``.
        test_csv_path: Path or URL of the CSV to score.

    Returns:
        numpy.ndarray: Unweighted average of the three models' positive-class
        probabilities, one value per row of the CSV.
    """
    X = preprocess_data(pd.read_csv(test_csv_path))

    prob_rf = model_dict['rf'].predict_proba(X)[:, 1]
    prob_lr = model_dict['lr'].predict_proba(X)[:, 1]
    prob_gb = model_dict['cb'].predict_proba(X)[:, 1]

    # Hand the averaged probabilities back to the caller; without the return
    # the computed result would be discarded.
    return (prob_rf + prob_lr + prob_gb) / 3

def save_submission(df: pd.DataFrame, prob_avg: pd.Series | np.ndarray):
    submission = pd.DataFrame({
        "LeadID": df["LeadID"],
        "VehicleSoldProbability": np.round(prob_avg, 4)
    }).sort_values(by="VehicleSoldProbability", ascending=False)

    submission.to_csv("LeadPredictions.csv", index=False)
    print("Saved submission to 'LeadPredictions.csv'")


# Main program
if __name__ == "__main__":
    # Data locations for the hackathon.
    URL_TRAIN = "https://www.mxhackathon.co.za/docs/TrainData.csv"
    URL_TEST = "https://www.mxhackathon.co.za/docs/TestData.csv"

    # Step 1: load the training data, report the gaps, then clean them.
    dfTrain = load_data(URL_TRAIN)  # Required: dfTrain
    analyze_missing(dfTrain)
    dfTrain = clean_missing_values(dfTrain)

    # Step 2: train the ensemble (this also writes mymodel.pkl).
    cal_rf, model_lr, cal_gb, threshold = train_ensemble_model(dfTrain)

    # Step 3: load the test data and build its feature matrix.
    dfTest = pd.read_csv(URL_TEST)  # Required: dfTest
    X_test = preprocess_data(dfTest)

    # Step 4: score with the persisted models, in the order rf, lr, cb, and
    # average the positive-class probabilities.
    model_dict = joblib.load("mymodel.pkl")
    prob_rf, prob_lr, prob_gb = (
        model_dict[key].predict_proba(X_test)[:, 1] for key in ('rf', 'lr', 'cb')
    )
    probabilities = (prob_rf + prob_lr + prob_gb) / 3

    # Step 5: write the submission file.
    save_submission(dfTest, probabilities)