In [3]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import os


os.environ["LOKY_MAX_CPU_COUNT"] = "4"


file_path = r"D:\Programming\Projects\Data\Pet_Adoption\pet_adoption_data.xlsx"
df = pd.read_excel(file_path)


object_features = ['Pet_Type', 'Breed', 'Age_Classification', 'Color', 'Size', 'Vaccinated', 'Health_Condition', 'Health_Status', 'Previous_Owner']

pet_type = pd.get_dummies(data=df["Pet_Type"], prefix="Pet_Type")
breed = pd.get_dummies(data=df["Breed"], prefix="Breed")
age_classification = pd.get_dummies(data=df["Age_Classification"], prefix="Age_Classification")
color = pd.get_dummies(data=df["Color"], prefix="Color")
size = pd.get_dummies(data=df["Size"], prefix="Size")
vaccinated = pd.get_dummies(data=df["Vaccinated"], prefix="Vaccinated")
health_condition = pd.get_dummies(data=df["Health_Condition"], prefix="Health_Condition")
health_status = pd.get_dummies(data=df["Health_Status"], prefix="Health_Status")
previous_owner = pd.get_dummies(data=df["Previous_Owner"], prefix="Previous_Owner")

df = pd.concat([df.drop(columns=["Pet_Type", "Breed", "Age_Classification", "Color", "Size", "Vaccinated", "Health_Condition", "Health_Status", "Previous_Owner"]), pet_type, breed, age_classification, color, size, vaccinated, health_condition, health_status, previous_owner], axis=1)

features = ['Age_In_Months', 'Weight(Kg)', 'Time_In_Shelter(Days)', 'Adoption_Fee', "Pet_Type_Bird",	"Pet_Type_Cat", "Pet_Type_Dog",	"Pet_Type_Rabbit", "Breed_Golden Retriever", "Breed_Labrador", "Breed_Parakeet", "Breed_Persian", "Breed_Poodle", "Breed_Rabbit", "Breed_Siamese", "Age_Classification_Adult", "Age_Classification_Old", "Age_Classification_Young", "Color_Black", "Color_Brown", "Color_Gray","Color_Orange",	"Color_White", "Size_Large", "Size_Medium", "Size_Small", "Vaccinated_No", "Vaccinated_Yes", "Health_Condition_Healthy",	"Health_Condition_Medical Condition", "Health_Status_Mildly Healthy", "Health_Status_Not Healthy", "Health_Status_Very Healthy", 	"Previous_Owner_No", "Previous_Owner_Yes"]

X = df[features]
y = df["Adoption_Likelihood"]


encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)


synthetic = SMOTE(random_state=42)
X_synthetic, y_synthetic = synthetic.fit_resample(X, y_encoded)


X_train, X_test, y_train, y_test = train_test_split(X_synthetic, y_synthetic, test_size=0.2, random_state=42, stratify=y_synthetic)

x_train_data = pd.DataFrame(X_train)
x_test_data = pd.DataFrame(X_test)
y_test_data = pd.DataFrame(y_test)
y_train_data = pd.DataFrame(y_train)

x_train_data.to_csv("x_train_data.csv")
y_train_data.to_csv("y_train_data.csv")
x_test_data.to_csv("x_test_data.csv")
y_test_data.to_csv("y_test_data.csv")


In [10]:
test_data = pd.DataFrame(X_test)
test_data.describe

<bound method NDFrame.describe of       Age_In_Months  Weight(Kg)  Time_In_Shelter(Days)  Adoption_Fee  \
431             132   19.378869                     35           235   
557              86   16.575420                     66           314   
1342            133   20.429128                     57            27   
1322             47   16.297382                     84           239   
787              75   29.992795                     37           186   
...             ...         ...                    ...           ...   
645              86   24.739204                     41            41   
2261            136   27.847279                     59           114   
1633             26   26.721321                     10           138   
1037            175   24.804367                     17           416   
634              22   27.362814                     44           351   

      Pet_Type_Bird  Pet_Type_Cat  Pet_Type_Dog  Pet_Type_Rabbit  \
431           False          True

In [11]:
df.describe

<bound method NDFrame.describe of       Pet_ID  Age_In_Months  Weight(Kg)  Time_In_Shelter(Days)  Adoption_Fee  \
0        500            131    5.039768                     27           140   
1        501             73   16.086727                      8           235   
2        502            136    2.076286                     85           385   
3        503             97    3.339423                     61           217   
4        504            123   20.498100                     28            14   
...      ...            ...         ...                    ...           ...   
2002    2502             72   27.039045                     66            26   
2003    2503            124    4.726954                     59           150   
2004    2504            113    1.758592                     68           302   
2005    2505             12   20.961592                     59           478   
2006    2506            126   18.519788                     10           267   

     