# Lib Import

In [6]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# EDA

## Data import

In [7]:
# Read the pet-adoption CSV (the path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../Data/pet_adoption_data.csv")
df.head(n=5)

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,501,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,502,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,503,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,504,Rabbit,Rabbit,123,Gray,Large,20.4981,0,0,28,14,1,0


In [8]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2007 entries, 0 to 2006
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PetID               2007 non-null   int64  
 1   PetType             2007 non-null   object 
 2   Breed               2007 non-null   object 
 3   AgeMonths           2007 non-null   int64  
 4   Color               2007 non-null   object 
 5   Size                2007 non-null   object 
 6   WeightKg            2007 non-null   float64
 7   Vaccinated          2007 non-null   int64  
 8   HealthCondition     2007 non-null   int64  
 9   TimeInShelterDays   2007 non-null   int64  
 10  AdoptionFee         2007 non-null   int64  
 11  PreviousOwner       2007 non-null   int64  
 12  AdoptionLikelihood  2007 non-null   int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 204.0+ KB


In [9]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,PetID,AgeMonths,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
count,2007.0,2007.0,2007.0,2007.0,2007.0,2007.0,2007.0,2007.0,2007.0
mean,1503.0,92.279522,15.705776,0.701046,0.196313,43.974091,249.142003,0.301943,0.328351
std,579.515315,52.148363,8.327749,0.457914,0.397307,25.740253,142.88704,0.459215,0.46973
min,500.0,1.0,1.018198,0.0,0.0,1.0,0.0,0.0,0.0
25%,1001.5,48.0,8.730396,0.0,0.0,21.0,127.0,0.0,0.0
50%,1503.0,94.0,15.925416,1.0,0.0,45.0,242.0,0.0,0.0
75%,2004.5,138.0,22.73718,1.0,0.0,66.0,375.0,1.0,1.0
max,2506.0,179.0,29.995628,1.0,1.0,89.0,499.0,1.0,1.0


# Pre-processing

## Standardizing

In [10]:
# Feature matrix: every column except the row identifier and the target label.
x = df.drop(columns=["PetID", "AdoptionLikelihood"])
x

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner
0,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0
1,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0
2,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0
3,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1
4,Rabbit,Rabbit,123,Gray,Large,20.498100,0,0,28,14,1
...,...,...,...,...,...,...,...,...,...,...,...
2002,Dog,Poodle,72,Orange,Small,27.039045,1,0,66,26,1
2003,Rabbit,Rabbit,124,Brown,Small,4.726954,1,1,59,150,0
2004,Rabbit,Rabbit,113,Orange,Small,1.758592,1,0,68,302,0
2005,Dog,Labrador,12,Gray,Large,20.961592,1,0,59,478,0


In [11]:
# Target vector: the binary adoption-likelihood label.
target = df.loc[:, 'AdoptionLikelihood']
target

0       0
1       0
2       0
3       0
4       0
       ..
2002    1
2003    0
2004    0
2005    0
2006    0
Name: AdoptionLikelihood, Length: 2007, dtype: int64

In [12]:
# Standardise the continuous features to zero mean / unit variance.
#
# The scaler is fitted on the untouched columns of `df`, not on `x`: this cell
# overwrites those columns in `x`, so fitting from `x` would refit the scaler on
# already-scaled values whenever the cell is re-run, and `scaler` would no
# longer describe the raw data.
#
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into the features. Consider fitting on the
# training rows only.
numerical_variables = ['AgeMonths','WeightKg','TimeInShelterDays','AdoptionFee']
scaler = StandardScaler()
x[numerical_variables] = scaler.fit_transform(df[numerical_variables])
x

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner
0,Bird,Parakeet,0.742691,Orange,Large,-1.281098,1,0,-0.659602,-0.764025,0
1,Rabbit,Rabbit,-0.369797,White,Large,0.045756,0,0,-1.397929,-0.098998,0
2,Dog,Golden Retriever,0.838595,Orange,Medium,-1.637043,0,0,1.594240,0.951044,0
3,Bird,Parakeet,0.090543,White,Small,-1.485328,0,0,0.661616,-0.225003,1
4,Rabbit,Rabbit,0.589244,Gray,Large,0.575608,0,0,-0.620743,-1.646060,1
...,...,...,...,...,...,...,...,...,...,...,...
2002,Dog,Poodle,-0.388978,Orange,Small,1.361243,1,0,0.855912,-1.562056,1
2003,Rabbit,Rabbit,0.608425,Brown,Small,-1.318671,1,1,0.583897,-0.694022,0
2004,Rabbit,Rabbit,0.397436,Orange,Small,-1.675202,1,0,0.933631,0.370021,0
2005,Dog,Labrador,-1.539828,Gray,Large,0.631278,1,0,0.583897,1.602070,0


## Label Encoding

In [14]:
# Replace each nominal text column with integer codes.
#
# Every fitted encoder is kept in `label_encoders` (keyed by column name) so the
# codes can be mapped back to labels or applied to new data; previously each
# encoder was overwritten by the next loop iteration. Encoders are fitted on the
# raw `df` columns so that re-running the cell does not refit them on the
# integer codes already stored in `x`.
#
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which distance-based and linear models will treat as meaningful. One-hot
# encoding may be more appropriate for kNN / logistic regression.
cat_variables = ['PetType', 'Breed', 'Color']
label_encoders = {}
for col in cat_variables:
    le = LabelEncoder()
    x[col] = le.fit_transform(df[col])
    label_encoders[col] = le
x

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner
0,0,2,0.742691,3,Large,-1.281098,1,0,-0.659602,-0.764025,0
1,3,5,-0.369797,4,Large,0.045756,0,0,-1.397929,-0.098998,0
2,2,0,0.838595,3,Medium,-1.637043,0,0,1.594240,0.951044,0
3,0,2,0.090543,4,Small,-1.485328,0,0,0.661616,-0.225003,1
4,3,5,0.589244,2,Large,0.575608,0,0,-0.620743,-1.646060,1
...,...,...,...,...,...,...,...,...,...,...,...
2002,2,4,-0.388978,3,Small,1.361243,1,0,0.855912,-1.562056,1
2003,3,5,0.608425,1,Small,-1.318671,1,1,0.583897,-0.694022,0
2004,3,5,0.397436,3,Small,-1.675202,1,0,0.933631,0.370021,0
2005,2,1,-1.539828,2,Large,0.631278,1,0,0.583897,1.602070,0


## Ordinal Encoding

In [15]:
# Encode Size with an explicit order: Small -> 0.0, Medium -> 1.0, Large -> 2.0.
#
# The encoder reads the raw labels from `df`, not from `x`: this cell overwrites
# x['Size'] with numbers, so encoding from `x` would fail with an
# unknown-category error on a second run.
encoder = OrdinalEncoder(categories=[['Small', 'Medium', 'Large']])
# fit_transform returns an (n_samples, 1) array; flatten it for column assignment.
x['Size'] = encoder.fit_transform(df[['Size']]).ravel()
x

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner
0,0,2,0.742691,3,2.0,-1.281098,1,0,-0.659602,-0.764025,0
1,3,5,-0.369797,4,2.0,0.045756,0,0,-1.397929,-0.098998,0
2,2,0,0.838595,3,1.0,-1.637043,0,0,1.594240,0.951044,0
3,0,2,0.090543,4,0.0,-1.485328,0,0,0.661616,-0.225003,1
4,3,5,0.589244,2,2.0,0.575608,0,0,-0.620743,-1.646060,1
...,...,...,...,...,...,...,...,...,...,...,...
2002,2,4,-0.388978,3,0.0,1.361243,1,0,0.855912,-1.562056,1
2003,3,5,0.608425,1,0.0,-1.318671,1,1,0.583897,-0.694022,0
2004,3,5,0.397436,3,0.0,-1.675202,1,0,0.933631,0.370021,0
2005,2,1,-1.539828,2,2.0,0.631278,1,0,0.583897,1.602070,0


## Set Bool variable type

In [16]:
# Cast the 0/1 indicator columns to a proper boolean dtype.
bool_variables = ['Vaccinated', 'HealthCondition', 'PreviousOwner']
x = x.astype(dict.fromkeys(bool_variables, bool))
x

Unnamed: 0,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner
0,0,2,0.742691,3,2.0,-1.281098,True,False,-0.659602,-0.764025,False
1,3,5,-0.369797,4,2.0,0.045756,False,False,-1.397929,-0.098998,False
2,2,0,0.838595,3,1.0,-1.637043,False,False,1.594240,0.951044,False
3,0,2,0.090543,4,0.0,-1.485328,False,False,0.661616,-0.225003,True
4,3,5,0.589244,2,2.0,0.575608,False,False,-0.620743,-1.646060,True
...,...,...,...,...,...,...,...,...,...,...,...
2002,2,4,-0.388978,3,0.0,1.361243,True,False,0.855912,-1.562056,True
2003,3,5,0.608425,1,0.0,-1.318671,True,True,0.583897,-0.694022,False
2004,3,5,0.397436,3,0.0,-1.675202,True,False,0.933631,0.370021,False
2005,2,1,-1.539828,2,2.0,0.631278,True,False,0.583897,1.602070,False


In [17]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    target,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Number of training rows.
x_train.shape[0]

1605

In [19]:
# Number of held-out test rows.
x_test.shape[0]

402

## Feature importances

In [22]:
# Fit a random forest on the training split and rank the features by their
# importance, most influential first.
model = RandomForestClassifier(random_state=42).fit(x_train, y_train)
feature_importances = model.feature_importances_
feature_importances_df = (
    pd.DataFrame({'Caractéristique': x.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
feature_importances_df

Unnamed: 0,Caractéristique,Importance
4,Size,0.231496
2,AgeMonths,0.170734
6,Vaccinated,0.115265
7,HealthCondition,0.090193
9,AdoptionFee,0.086056
5,WeightKg,0.084992
8,TimeInShelterDays,0.078387
1,Breed,0.072434
3,Color,0.034196
0,PetType,0.023983


# Model Creation

## kNN

### Finding best k

In [None]:
# Sweep k from 1 to 60 and record the accuracy of each kNN model on the test split.
# NOTE(review): k is being selected on the test set, so the accuracy reported
# for the chosen k is optimistically biased; cross-validation on the training
# split would avoid this.
k_range = range(1, 61)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    scores += [accuracy_score(y_test, y_pred)]

In [None]:
# Accuracy as a function of k, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(k_range, scores)
ax.set_xlabel('Value of K for kNN')
ax.set_ylabel('Testing Accuracy')

### Model with best k

In [None]:
# Refit kNN with the chosen k and report its accuracy on the test split.
# NOTE(review): 35 is hard-coded, presumably read off the accuracy-vs-k plot;
# confirm it still matches the sweep if the data or split changes.
best_k = 35
knn = KNeighborsClassifier(n_neighbors=best_k).fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy with k = {best_k} is {accuracy}")

## Logistic Regression

In [None]:
# Fit a logistic-regression baseline and report its accuracy on the held-out
# test split, the same split the kNN and random-forest cells are scored on.
# The cell previously predicted and scored on x_train, so it reported training
# accuracy, which is not comparable with the other models.
logr = LogisticRegression(solver='lbfgs', max_iter=1000)
logr.fit(x_train, y_train)
y_pred = logr.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy is {accuracy}")

## RandomForestClassifier

### Finding best n_estimators

In [None]:
# Sweep n_estimators over 135..144 and record the accuracy of each forest on the test split.
# NOTE(review): the hyper-parameter is being selected on the test set, so the
# accuracy reported for the chosen value is optimistically biased;
# cross-validation on the training split would avoid this.
n_range = range(135, 145)
scores = []
for n in n_range:
    classifier = RandomForestClassifier(n_estimators=n, random_state=13).fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    scores += [accuracy_score(y_test, y_pred)]

In [None]:
# Accuracy as a function of n_estimators, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(n_range, scores)
ax.set_xlabel('Value of n_estimators for RandomForestClassifier')
ax.set_ylabel('Testing Accuracy')

### Model with best n_estimators

In [None]:
# Refit the random forest with the chosen number of trees and report its
# accuracy on the test split.
classifier = RandomForestClassifier(n_estimators=140, random_state=13).fit(x_train, y_train)
y_pred = classifier.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy is {accuracy}")