In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
import os

# Suppress warnings
warnings.filterwarnings("ignore")

# Set LightGBM verbosity
os.environ['LIGHTGBM_DEBUG'] = '0'
os.environ['VERBOSE'] = '0'
os.environ['NUM_THREADS'] = '1'

# Load the CSV file directly
data = pd.read_csv('//content/DatasetML.csv')

# Fill any null cells with a placeholder value or a strategy
data = data.fillna(data.mean(numeric_only=True)).fillna('Unknown')

# Convert categorical features to numeric
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Features (X) and target (y), excluding non-numeric and target 'Drug' column
X = data.select_dtypes(include=[float, int]).drop(columns=['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Status'])
y = data['Drug']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

# Define the models
clf1 = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1, n_jobs=-1)
clf2 = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=20)
clf3 = LGBMClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=20, verbose=-1)

# Create the ensemble model
model = VotingClassifier(estimators=[('rf', clf1), ('xgb', clf2), ('lgbm', clf3)], voting='soft')
model.fit(X_train, y_train)

# Check the accuracy on the test set
accuracy = model.score(X_test, y_test)
print(f'Accuracy: {accuracy:.2f}')

# Inverse map the encoded drug names back to original names
drug_mapping = dict(zip(label_encoders['Drug'].transform(label_encoders['Drug'].classes_), label_encoders['Drug'].classes_))

# Example of new patient data, ensure the number of features matches X
new_patient_data = pd.DataFrame(
    [[9000, 326, 18199, 6.6, 244, 3.02, 199, 1819, 170.05, 91, 132, 12.1, 4]],
    columns=[
        'id', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT',
        'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage'
    ]
)

# Fill any null cells in the new patient data
new_patient_data = new_patient_data.fillna(new_patient_data.mean(numeric_only=True)).fillna('Unknown')

# Convert the new patient data using label encoders
for column in new_patient_data.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        new_patient_data[column] = label_encoders[column].transform(new_patient_data[column])

# Ensure new_patient_data columns match X_train
new_patient_data = new_patient_data[X.columns]

# Predict the best drug for the new patient
prediction = model.predict(new_patient_data)

# Map back to original drug name
predicted_drug = drug_mapping[prediction[0]]

# Print the result with patient id
patient_id = new_patient_data['id'][0]  # Extract the patient id from the data
print(f'Patient ID: {patient_id}, Predicted Drug: {predicted_drug}')


Accuracy: 0.71
Patient ID: 9000, Predicted Drug: D-penicillamine
