# ML Final Project

**Imports**

In [8]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn
import scipy as scp

# Data Cleaning and Preprocessing

In [9]:
# Load the labelled training set; "train.csv" is resolved relative to the
# notebook's working directory.
raw_data = pd.read_csv("train.csv")
raw_data.head()  # last expression of the cell, so the preview is rendered as its output

Unnamed: 0,Id,Name,Intake Time,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Outcome Time,Date of Birth,Outcome Type
0,A706918,Belle,07/05/2015 12:59:00 PM,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver,07/05/2015 03:13:00 PM,07/05/2007,Return to Owner
1,A724273,Runster,04/14/2016 06:43:00 PM,2818 Palomino Trail in Austin (TX),Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White,04/21/2016 05:17:00 PM,04/17/2015,Return to Owner
2,A857105,Johnny Ringo,05/12/2022 12:23:00 AM,4404 Sarasota Drive in Austin (TX),Public Assist,Normal,Cat,Neutered Male,2 years,Domestic Shorthair,Orange Tabby,05/12/2022 02:35:00 PM,05/12/2020,Transfer
3,A743852,Odin,02/18/2017 12:46:00 PM,Austin (TX),Owner Surrender,Normal,Dog,Neutered Male,2 years,Labrador Retriever Mix,Chocolate,02/21/2017 05:44:00 PM,02/18/2015,Return to Owner
4,A635072,Beowulf,04/16/2019 09:53:00 AM,415 East Mary Street in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,Great Dane Mix,Black,04/18/2019 01:45:00 PM,06/03/2012,Return to Owner


## Section A

In [10]:
# String parser

def parse_age(age_str):
    """Convert an "Age upon Intake" string such as "8 years" into a Timedelta.

    Parameters
    ----------
    age_str : str or missing value
        Text of the form "<integer> <unit>", where the unit is day(s),
        week(s), month(s) or year(s). The unit is matched case-insensitively.

    Returns
    -------
    pandas.Timedelta or pandas.NaT
        The age expressed in days, using the approximations
        1 week = 7 days, 1 month = 30 days and 1 year = 365 days.
        ``pd.NaT`` is returned for missing values, for unknown units and for
        anything that does not look like "<integer> <unit>" (instead of
        raising, so one malformed cell cannot abort a whole ``.apply``).
    """
    if pd.isna(age_str):
        return pd.NaT

    parts = str(age_str).split()
    if len(parts) != 2:
        # Not "<number> <unit>": treat as unparseable rather than crash on unpacking.
        return pd.NaT

    count_text, unit = parts
    try:
        num = int(count_text)
    except ValueError:
        return pd.NaT

    # Fold the plural and any capitalisation into a single lookup key.
    unit = unit.lower()
    if unit.endswith("s"):
        unit = unit[:-1]

    days_per_unit = {
        "day": 1,
        "week": 7,      # approx for a week
        "month": 30,    # Approximate 1 month = 30 days
        "year": 365,    # Approximate 1 year = 365 days
    }
    if unit not in days_per_unit:
        return pd.NaT
    return pd.Timedelta(days=num * days_per_unit[unit])

# Convert to datetime objects; values that do not match the format become NaT
raw_data["Intake_DateTime"] = pd.to_datetime(raw_data["Intake Time"], format="%m/%d/%Y %I:%M:%S %p", errors='coerce')
# Note: despite its name this column holds Timedelta values (the age), not timestamps
raw_data["Age_DateTime"] = raw_data["Age upon Intake"].apply(parse_age)
raw_data["DOB_DateTime"] = pd.to_datetime(raw_data["Date of Birth"], format="%m/%d/%Y", errors='coerce')

# check and clean age_time: keep the unparseable rows aside for inspection, drop them from the working set
missing_row = raw_data[raw_data["Age_DateTime"].isna()]
raw_data = raw_data[raw_data["Age_DateTime"].notna()]

# Explicit copy: `raw_data` is now a boolean-filtered selection, and adding a column to an
# alias of it is the chained-assignment pattern pandas warns about (SettingWithCopyWarning).
data = raw_data.copy()
missing_intakes = data["Intake_DateTime"].isnull().sum()
missing_DOBS = data["DOB_DateTime"].isnull().sum()
data["Age_in_Days"] = data["Age_DateTime"].dt.days ## numeric value for training

## both counts print 0 in the recorded run, i.e. no unparseable intake times or birth dates
print(missing_intakes)
print(missing_DOBS)

0
0


## Section B (Artifact).

In [None]:
# Parse the outcome timestamp with the same 12-hour format used for the intake time;
# values that do not match become NaT.
data["Outcome_DateTime"] = pd.to_datetime(data["Outcome Time"], format="%m/%d/%Y %I:%M:%S %p", errors='coerce')
# Seconds elapsed between intake and outcome.
data["Time_spent"] = (data["Outcome_DateTime"] - data["Intake_DateTime"]).dt.total_seconds() ## This column is an artifact: preprocess() further down drops "Outcome Time" and never builds it
# Drop identifiers, free text, and the raw columns that now have parsed counterparts.
data = data.drop(columns=["Id", "Name", "Intake Time", "Found Location", "Age upon Intake", "Outcome Time", "Date of Birth"])
data.head()

## First set of cleaning done: these columns were replaced with calculable values or removed (name probably does not affect outcome)


Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color,Outcome Type,Intake_DateTime,Age_DateTime,DOB_DateTime,Age_in_Days,Outcome_DateTime,Time_spent
0,Stray,Normal,Dog,Spayed Female,English Springer Spaniel,White/Liver,Return to Owner,2015-07-05 12:59:00,2920 days,2007-07-05,2920,2015-07-05 15:13:00,8040.0
1,Stray,Normal,Dog,Intact Male,Basenji Mix,Sable/White,Return to Owner,2016-04-14 18:43:00,330 days,2015-04-17,330,2016-04-21 17:17:00,599640.0
2,Public Assist,Normal,Cat,Neutered Male,Domestic Shorthair,Orange Tabby,Transfer,2022-05-12 00:23:00,730 days,2020-05-12,730,2022-05-12 14:35:00,51120.0
3,Owner Surrender,Normal,Dog,Neutered Male,Labrador Retriever Mix,Chocolate,Return to Owner,2017-02-18 12:46:00,730 days,2015-02-18,730,2017-02-21 17:44:00,277080.0
4,Public Assist,Normal,Dog,Neutered Male,Great Dane Mix,Black,Return to Owner,2019-04-16 09:53:00,2190 days,2012-06-03,2190,2019-04-18 13:45:00,186720.0


## Section C.

In [8]:
# Cardinality check: how many distinct values does each categorical column have?
# This guides what can be dropped and how the remaining columns should be encoded.
# Printed one per line, in the order Breed, Color, Intake Type, Intake Condition.
print(
    *(
        data[column_name].nunique()
        for column_name in ("Breed", "Color", "Intake Type", "Intake Condition")
    ),
    sep="\n",
)


2440
568
6
19


## Section D.

In [9]:
# One-hot encode every categorical column in a single pass
# (idea: reduce the resulting wide matrix with PCA afterwards).
OHE_data = pd.get_dummies(
    data,
    columns=['Color', 'Breed', "Animal Type", "Intake Type", "Intake Condition", "Sex upon Intake"],
)
# Preview followed by the (rows, columns) shape, each on its own line.
print(OHE_data.head(), OHE_data.shape, sep="\n")

      Outcome Type     Intake_DateTime Age_DateTime DOB_DateTime  Age_in_Days  \
0  Return to Owner 2015-07-05 12:59:00    2920 days   2007-07-05         2920   
1  Return to Owner 2016-04-14 18:43:00     330 days   2015-04-17          330   
2         Transfer 2022-05-12 00:23:00     730 days   2020-05-12          730   
3  Return to Owner 2017-02-18 12:46:00     730 days   2015-02-18          730   
4  Return to Owner 2019-04-16 09:53:00    2190 days   2012-06-03         2190   

     Outcome_DateTime  Time_spent  Color_Agouti  Color_Agouti/Brown Tabby  \
0 2015-07-05 15:13:00      8040.0         False                     False   
1 2016-04-21 17:17:00    599640.0         False                     False   
2 2022-05-12 14:35:00     51120.0         False                     False   
3 2017-02-21 17:44:00    277080.0         False                     False   
4 2019-04-18 13:45:00    186720.0         False                     False   

   Color_Agouti/Cream  ...  Intake Condition_Parvo

## Section E.

In [10]:
## Make something useful from the intake and outcome timestamps.
## Columns are created in the same order as before (all Intake_* first, then all
## Outcome_*), so the resulting column layout is unchanged.
for _stage in ("Intake", "Outcome"):
    _stamp = OHE_data[f"{_stage}_DateTime"].dt
    OHE_data[f"{_stage}_Minute"] = _stamp.minute
    OHE_data[f"{_stage}_Hour"] = _stamp.hour
    OHE_data[f"{_stage}_Weekday"] = _stamp.weekday  # Monday=0
    OHE_data[f"{_stage}_Month"] = _stamp.month
    OHE_data[f"{_stage}_Year"] = _stamp.year
    OHE_data[f"Is_Weekend_{_stage}"] = OHE_data[f"{_stage}_Weekday"].isin([5, 6])

## The raw datetime / timedelta columns are redundant once the parts are extracted
unscaled_data = OHE_data.drop(columns=["Outcome_DateTime", "Intake_DateTime", "Age_DateTime", "DOB_DateTime"])

## Cyclic encoding bc mondays are near sunday: each periodic part is mapped onto
## the unit circle as a (sin, cos) pair using its period.
for _stage in ("Intake", "Outcome"):
    for _part, _period in (("Minute", 60), ("Hour", 24), ("Weekday", 7), ("Month", 12)):
        _angle = 2 * np.pi * OHE_data[f"{_stage}_{_part}"] / _period
        unscaled_data[f"{_stage}_{_part}_sin"] = np.sin(_angle)
        unscaled_data[f"{_stage}_{_part}_cos"] = np.cos(_angle)

## drop non-cyclical originals (the year columns and weekend flags are kept)
unscaled_data.drop(
    columns=[
        f"{stage}_{part}"
        for stage in ("Intake", "Outcome")
        for part in ("Minute", "Hour", "Weekday", "Month")
    ],
    inplace=True,
)
unscaled_data.shape

(111156, 3063)

## Section F.

In [11]:
## Numeric Scaling

from sklearn.preprocessing import StandardScaler

numeric_cols = unscaled_data.select_dtypes(include=['number']).columns
print(numeric_cols)

# Keep only rows with a non-negative age and a non-negative time in the shelter.
# NaN compares as False here, so rows missing either value are removed as well.
unscaled_data = unscaled_data[unscaled_data['Age_in_Days'] >= 0]
unscaled_data = unscaled_data[unscaled_data['Time_spent'] >= 0]

missing = unscaled_data.isnull().sum().sum()
print(unscaled_data.shape)

## scale it lol

scaler = StandardScaler()
# Only the unbounded numeric columns are standardised; the sin/cos features already lie in [-1, 1].
numeric_unbounded = ["Age_in_Days", "Time_spent", "Intake_Year", "Outcome_Year"]
# Copy instead of aliasing: with `scaled_data = unscaled_data` the assignment below also
# overwrote `unscaled_data`, and re-running the cell standardised the columns a second time.
scaled_data = unscaled_data.copy()
scaled_data[numeric_unbounded] = scaler.fit_transform(scaled_data[numeric_unbounded])
num_missing = scaled_data['Outcome Type'].isnull().sum()
print(num_missing)

Index(['Age_in_Days', 'Time_spent', 'Intake_Year', 'Outcome_Year',
       'Intake_Minute_sin', 'Intake_Minute_cos', 'Intake_Hour_sin',
       'Intake_Hour_cos', 'Intake_Weekday_sin', 'Intake_Weekday_cos',
       'Intake_Month_sin', 'Intake_Month_cos', 'Outcome_Minute_sin',
       'Outcome_Minute_cos', 'Outcome_Hour_sin', 'Outcome_Hour_cos',
       'Outcome_Weekday_sin', 'Outcome_Weekday_cos', 'Outcome_Month_sin',
       'Outcome_Month_cos'],
      dtype='object')
(107908, 3063)
0


## Section G.

In [12]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Target column on one side; on the other, every feature forced to numeric
# (unconvertible values become NaN) and stored as float32.
y = scaled_data["Outcome Type"]
X = (
    scaled_data
    .drop(columns=['Outcome Type'])
    .apply(pd.to_numeric, errors='coerce')
    .astype('float32')
)

# Put the target back next to the cleaned features so rows can be dropped jointly.
final_data = X.assign(**{'Outcome Type': y})

# Sanity report before any row is removed.
print("Non-numeric columns:", final_data.select_dtypes(exclude=['number']).columns)
print("Dtype counts:", final_data.dtypes.value_counts())
print("Total missing values:", final_data.isnull().sum().sum())
print("Missing in y:", final_data['Outcome Type'].isnull().sum())

# Remove any row with a missing feature or a missing label, then split again.
final_data = final_data.dropna()
X = final_data.drop(columns=['Outcome Type'])
y = final_data['Outcome Type']

# String / categorical labels are mapped to integer class ids.
if y.dtype == 'object' or y.dtype.name == 'category':
    le = LabelEncoder()
    y = le.fit_transform(y)

# X and y are now aligned and ready for training.
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X dtype counts:", X.dtypes.value_counts())


Non-numeric columns: Index(['Outcome Type'], dtype='object')
Dtype counts: float32    3062
object        1
Name: count, dtype: int64
Total missing values: 0
Missing in y: 0
X shape: (107908, 3062)
y shape: (107908,)
X dtype counts: float32    3062
Name: count, dtype: int64


# Model Training

## Section H.

In [None]:
# Grid search for best params
# NOTE: the whole search is wrapped in a string literal, so nothing below is executed;
# the cell only echoes the string. To run the search, remove the surrounding triple
# quotes (it needs MLPClassifier, GridSearchCV, X and y from the previous cell).
# The search covers 7 layer layouts x 2 alpha values with 5-fold CV. The final classifier
# later in this notebook uses hidden_layer_sizes=(256, 128, 64) and alpha=0.001, both of
# which appear in this grid.
"""
# Define parameter grid
param_grid = {
    'hidden_layer_sizes': [
        (64,), (128,), (256,),
        (128, 64), (256, 128), (256, 64), 
        (256, 128, 64)
    ],
    'alpha': [1e-4, 1e-3]
}


# Define base model
mlp = MLPClassifier(
    max_iter=300,
    early_stopping=True,
    random_state=42
)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=1,
    verbose=2
)

# Fit the grid search
grid_search.fit(X, y)

# Show the best results
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validated accuracy:", grid_search.best_score_)

# Optional: View all results sorted
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values('mean_test_score', ascending=False)
print(cv_results[['mean_test_score', 'params']])
"""


'\n# Define parameter grid\nparam_grid = {\n    \'hidden_layer_sizes\': [\n        (64,), (128,), (256,),\n        (128, 64), (256, 128)\n    ],\n    \'alpha\': [1e-4, 1e-3]\n}\n\n\n# Define base model\nmlp = MLPClassifier(\n    max_iter=300,\n    early_stopping=True,\n    random_state=42\n)\n\n# Set up GridSearchCV\ngrid_search = GridSearchCV(\n    estimator=mlp,\n    param_grid=param_grid,\n    scoring=\'accuracy\',\n    cv=5,\n    n_jobs=4,\n    verbose=2\n)\n\n# Fit the grid search\ngrid_search.fit(X, y)\n\n# Show the best results\nprint("Best parameters found:", grid_search.best_params_)\nprint("Best cross-validated accuracy:", grid_search.best_score_)\n\n# Optional: View all results sorted\ncv_results = pd.DataFrame(grid_search.cv_results_)\ncv_results = cv_results.sort_values(\'mean_test_score\', ascending=False)\nprint(cv_results[[\'mean_test_score\', \'params\']])\n'

## Full Preprocessing Function

In [23]:
def preprocess(raw_data, scaler=None):
    """Turn a raw shelter-intake frame into a float32 feature matrix.

    Steps: drop ``Name`` / ``Outcome Time`` (if present), split off
    ``Outcome Type`` (if present), parse the age and intake timestamp, one-hot
    encode the categorical columns, add cyclically encoded intake-time
    features, standardise the unbounded numeric columns and cast everything to
    float32. Diagnostic lines are printed along the way.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain ``Id``, ``Intake Time``, ``Found Location``,
        ``Age upon Intake``, ``Date of Birth`` and the six categorical columns
        named in the ``get_dummies`` call below. ``Name``, ``Outcome Time`` and
        ``Outcome Type`` are optional. The frame is not modified.
    scaler : sklearn.preprocessing.StandardScaler, optional
        Scaler for ``Age_in_Days`` and ``Intake_Year``. With the default
        ``None`` a fresh scaler is fitted on this frame (the original
        behaviour). If an unfitted scaler is passed it is fitted in place; if
        an already fitted one is passed it is only applied, which lets test
        data be standardised with the training statistics.

    Returns
    -------
    X : pandas.DataFrame
        float32 features. Rows whose parsed age is negative or missing are
        removed, as are rows with a missing label when labels are present.
        The one-hot columns depend on the categories present in ``raw_data``,
        so frames from different inputs must be column-aligned by the caller.
    anim_ids : pandas.Series
        ``Id`` values aligned with the rows of ``X``.
    y : numpy.ndarray, pandas.Series or None
        Integer-encoded labels when ``Outcome Type`` holds strings/categories,
        the untouched label Series otherwise, ``None`` when the column is absent.
    le : sklearn.preprocessing.LabelEncoder or None
        The fitted encoder when labels were encoded, else ``None``.

    Raises
    ------
    AssertionError
        If ``X`` still contains NaNs or non-numeric columns at the end.
    """
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import LabelEncoder

    # Work on a private copy: when none of the optional columns below exists,
    # no drop() rebinding happens and the column assignments further down
    # would otherwise write into the caller's frame.
    raw_data = raw_data.copy()

    anim_ids = raw_data["Id"].copy()
    le = None
    # errors="ignore" keeps these two columns optional.
    raw_data = raw_data.drop(columns=["Name", "Outcome Time"], errors="ignore")
    if 'Outcome Type' in raw_data.columns:
        y = raw_data['Outcome Type'].copy()
        raw_data = raw_data.drop(columns=["Outcome Type"])
    else:
        y = None

    # Module 1 — Age + Date parsing
    def parse_age(age_str):
        """Return "<int> <unit>" as a Timedelta in days, or NaT if unparseable."""
        if pd.isna(age_str):
            return pd.NaT
        parts = str(age_str).split()
        if len(parts) != 2:
            return pd.NaT
        try:
            num = int(parts[0])
        except ValueError:
            return pd.NaT
        unit = parts[1]
        if unit in ["day", "days"]:
            return pd.Timedelta(days=num)
        elif unit in ["week", "weeks"]:
            return pd.Timedelta(days=num * 7)
        elif unit in ["month", "months"]:
            return pd.Timedelta(days=num * 30)   # approximation
        elif unit in ["year", "years"]:
            return pd.Timedelta(days=num * 365)  # approximation
        else:
            return pd.NaT

    # The deprecated `infer_datetime_format=True` flag (source of the warning in
    # the recorded output) is dropped; the format is still inferred by pandas.
    raw_data["Intake_DateTime"] = pd.to_datetime(raw_data["Intake Time"], errors='coerce')
    raw_data["Age_DateTime"] = raw_data["Age upon Intake"].apply(parse_age)
    raw_data["DOB_DateTime"] = pd.to_datetime(raw_data["Date of Birth"], format="%m/%d/%Y", errors='coerce')

    # Clean age
    data = raw_data.copy()
    data["Age_in_Days"] = data["Age_DateTime"].dt.days

    # Module 2 — Drop irrelevant columns
    data = data.drop(columns=["Id", "Intake Time", "Found Location", "Age upon Intake", "Date of Birth"])

    # One-hot encode categorical variables
    OHE_data = pd.get_dummies(data, columns=['Color', 'Breed', "Animal Type", "Intake Type", "Intake Condition", "Sex upon Intake"])
    print(OHE_data.shape)

    # Module 3 — Feature Engineering from Intake Time
    OHE_data["Intake_Minute"] = OHE_data["Intake_DateTime"].dt.minute
    OHE_data['Intake_Hour'] = OHE_data['Intake_DateTime'].dt.hour
    OHE_data['Intake_Weekday'] = OHE_data['Intake_DateTime'].dt.weekday  # Monday=0
    OHE_data['Intake_Month'] = OHE_data['Intake_DateTime'].dt.month
    OHE_data['Intake_Year'] = OHE_data['Intake_DateTime'].dt.year
    OHE_data['Is_Weekend_Intake'] = OHE_data['Intake_Weekday'].isin([5, 6])

    # Drop datetimes
    unscaled_data = OHE_data.drop(columns=["Intake_DateTime", "Age_DateTime", "DOB_DateTime"])

    # Cyclical encoding: map each periodic part onto the unit circle
    unscaled_data['Intake_Minute_sin'] = np.sin(2 * np.pi * OHE_data['Intake_Minute'] / 60)
    unscaled_data['Intake_Minute_cos'] = np.cos(2 * np.pi * OHE_data['Intake_Minute'] / 60)
    unscaled_data['Intake_Hour_sin'] = np.sin(2 * np.pi * OHE_data['Intake_Hour'] / 24)
    unscaled_data['Intake_Hour_cos'] = np.cos(2 * np.pi * OHE_data['Intake_Hour'] / 24)
    unscaled_data['Intake_Weekday_sin'] = np.sin(2 * np.pi * OHE_data['Intake_Weekday'] / 7)
    unscaled_data['Intake_Weekday_cos'] = np.cos(2 * np.pi * OHE_data['Intake_Weekday'] / 7)
    unscaled_data['Intake_Month_sin'] = np.sin(2 * np.pi * OHE_data['Intake_Month'] / 12)
    unscaled_data['Intake_Month_cos'] = np.cos(2 * np.pi * OHE_data['Intake_Month'] / 12)

    unscaled_data.drop(columns=['Intake_Minute', 'Intake_Hour', 'Intake_Weekday', 'Intake_Month'], inplace=True)

    # Module 4 — Numeric Scaling
    numeric_unbounded = ["Age_in_Days", "Intake_Year"]
    # NaN >= 0 is False, so rows with an unparseable age are removed here too.
    # .copy() so the scaled values are written into an independent frame.
    unscaled_data = unscaled_data[unscaled_data['Age_in_Days'] >= 0].copy()

    if scaler is None:
        scaler = StandardScaler()
    if hasattr(scaler, "scale_"):
        # Already fitted (e.g. on the training set): reuse its statistics.
        scaled_values = scaler.transform(unscaled_data[numeric_unbounded])
    else:
        scaled_values = scaler.fit_transform(unscaled_data[numeric_unbounded])
    unscaled_data[numeric_unbounded] = scaled_values

    # Final conversion
    X = unscaled_data.apply(pd.to_numeric, errors='coerce').astype('float32')
    print("X shape before dropna:", X.shape)
    print("NaNs per column:\n", X.isnull().sum().sort_values(ascending=False).head(10))
    if y is not None:
        y = y.loc[X.index]
        # Drop unlabelled rows from BOTH X and y; dropping them from y alone
        # would leave the two with different lengths.
        labelled = y.notna()
        if not labelled.all():
            X = X.loc[labelled]
            y = y.loc[labelled]
        if y.dtype == 'object' or y.dtype.name == 'category':
            le = LabelEncoder()
            y = le.fit_transform(y)

    anim_ids = anim_ids.loc[X.index]
    print("Final shape of X:", X.shape)
    print("Total NaNs in X:", X.isnull().sum().sum())
    print("Columns with NaNs:")
    print(X.columns[X.isnull().any()])
    assert X.select_dtypes(exclude=['number']).empty, "Non-numeric columns still exist in X"
    assert not X.isnull().any().any(), "X still contains NaNs"
    if y is not None:
        print("Final y dtype:", y.dtype)
        print("Final y length:", len(y))
    print("X shape:", X.shape)
    print("Original test_data rows:", len(raw_data))
    print("Final X rows:", len(X))
    print("Final test_ids:", len(anim_ids))
    return X, anim_ids, y, le

# Main

## Section J.

In [24]:
from sklearn.model_selection import cross_val_score

# Load the raw training file and run it through the full preprocessing function.
training_data = pd.read_csv("train.csv")
training_y = training_data["Outcome Type"]
X_train, ids_train, y_train, le = preprocess(training_data)
# Remember the training column layout so other frames can be aligned to it.
training_columns = X_train.columns

# Three hidden layers with early stopping and a fixed seed.
final_clf = MLPClassifier(
    random_state=42,
    early_stopping=True,
    max_iter=300,
    alpha=0.001,
    hidden_layer_sizes=(256, 128, 64),
)
final_clf.fit(X_train, y_train)
print(X_train.shape)


  raw_data["Intake_DateTime"] = pd.to_datetime(


(111157, 3045)
X shape before dropna: (111152, 3052)
NaNs per column:
 Age_in_Days                                        0
Breed_Jack Russell Terrier/Standard Schnauzer      0
Breed_Jack Russell Terrier/Miniature Poodle        0
Breed_Jack Russell Terrier/Papillon                0
Breed_Jack Russell Terrier/Pbgv                    0
Breed_Jack Russell Terrier/Pembroke Welsh Corgi    0
Breed_Jack Russell Terrier/Pit Bull                0
Breed_Jack Russell Terrier/Pointer                 0
Breed_Jack Russell Terrier/Pug                     0
Breed_Jack Russell Terrier/Rat Terrier             0
dtype: int64
Final shape of X: (111152, 3052)
Total NaNs in X: 0
Columns with NaNs:
Index([], dtype='object')
Final y dtype: int32
Final y length: 111152
X shape: (111152, 3052)
Original test_data rows: 111157
Final X rows: 111152
Final test_ids: 111152
(111152, 3052)




In [16]:
# Plain-text preview of the raw training frame (first five rows).
print(training_data.head())

        Id          Name             Intake Time  \
0  A706918         Belle  07/05/2015 12:59:00 PM   
1  A724273       Runster  04/14/2016 06:43:00 PM   
2  A857105  Johnny Ringo  05/12/2022 12:23:00 AM   
3  A743852          Odin  02/18/2017 12:46:00 PM   
4  A635072       Beowulf  04/16/2019 09:53:00 AM   

                        Found Location      Intake Type Intake Condition  \
0     9409 Bluegrass Dr in Austin (TX)            Stray           Normal   
1   2818 Palomino Trail in Austin (TX)            Stray           Normal   
2   4404 Sarasota Drive in Austin (TX)    Public Assist           Normal   
3                          Austin (TX)  Owner Surrender           Normal   
4  415 East Mary Street in Austin (TX)    Public Assist           Normal   

  Animal Type Sex upon Intake Age upon Intake                     Breed  \
0         Dog   Spayed Female         8 years  English Springer Spaniel   
1         Dog     Intact Male       11 months               Basenji Mix   
2    

## Section K.

In [21]:
test_data = pd.read_csv("test.csv")
n_test_rows = len(test_data)
X_test, test_ids, _, _ = preprocess(test_data)

# preprocess() filters out rows whose parsed age is negative or missing. Such rows get no
# prediction, so report them instead of silently writing a shorter submission.
if len(X_test) != n_test_rows:
    print(f"WARNING: {n_test_rows - len(X_test)} of {n_test_rows} test rows were dropped "
          "during preprocessing and are missing from the submission")

# The one-hot columns depend on the categories present in each file: align to the training
# layout (categories unseen in test -> 0.0, categories unseen in training are discarded).
X_test = X_test.reindex(columns=training_columns, fill_value=0.0)

y_pred = final_clf.predict(X_test)
# Map integer class ids back to the original outcome names using the training-time encoder.
pred_labels = le.inverse_transform(y_pred)

submission = pd.DataFrame({
    "Id": test_ids,
    "Outcome Type": pred_labels
})
submission.to_csv("submissionb.csv", index=False)

  raw_data["Intake_DateTime"] = pd.to_datetime(
  raw_data["Intake_DateTime"] = pd.to_datetime(


(27791, 1684)
X shape before dropna: (27791, 1691)
NaNs per column:
 Age_in_Days                                       0
Breed_Jack Russell Terrier/Chihuahua Shorthair    0
Breed_Keeshond Mix                                0
Breed_Kangal/Great Dane                           0
Breed_Kangal Mix                                  0
Breed_Jindo/Shiba Inu                             0
Breed_Jindo/Pembroke Welsh Corgi                  0
Breed_Jindo Mix                                   0
Breed_Japanese Chin Mix                           0
Breed_Japanese Bobtail Mix                        0
dtype: int64
Final shape of X: (27791, 1691)
Total NaNs in X: 0
Columns with NaNs:
Index([], dtype='object')
X shape: (27791, 1691)
Original test_data rows: 27791
Final X rows: 27791
Final test_ids: 27791
