## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset over here

In [2]:
data=pd.read_csv("dogs-ranking-dataset.csv")

In [3]:
data.head()

Unnamed: 0,Breed,type,score,popularity ranking,size,intelligence,congential ailments,score for kids,size.1,$LIFETIME COST,INTELLIGENCE RANK,INTELLIGENCE %,LONGEVITY(YEARS),NUMBER OF GENETIC AILMENTS,GENETIC AILMENTS,PURCHASE PRICE,FOOD COSTS PER YEAR,GROOMING FREQUNCY,SUITABILITY FOR CHILDREN
0,Border Terrier,terrier,3.61,61,1,Above average,none,4.99,small,"$22,638",30,70%,14.0,0,none,$833,$324,Once a week,1
1,Cairn Terrier,terrier,3.53,48,1,Above average,"'lion jaw', heart problems",4.91,small,"$21,992",35,61%,13.84,2,"'lion jaw', heart problems",$435,$324,Once a week,1
2,Siberian Husky,working,3.22,16,2,Average,none,4.72,medium,"$22,049",45,45%,12.58,0,none,$650,$466,Once in a few weeks,1
3,Welsh Springer Spaniel,sporting,3.34,81,2,Above average,hip problems,4.71,medium,"$20,224",31,69%,12.49,1,hip problems,$750,$324,Once a week,1
4,English Cocker Spaniel,sporting,3.33,51,2,Excellent,none,4.7,medium,"$18,993",18,82%,11.66,0,none,$800,$324,Once a week,1


In [4]:
data['LONGEVITY(YEARS)'].min()

6.29

In [5]:
data['LONGEVITY(YEARS)'].max()

16.5

## Taking care of duplicate observations if present over here

In [6]:
data.duplicated().sum()

0

## Taking care of missing values if present over here

In [7]:
data.isnull().sum()

Breed                         0
type                          0
score                         0
popularity ranking            0
size                          0
intelligence                  0
congential ailments           0
score for kids                0
size.1                        0
$LIFETIME COST                0
INTELLIGENCE RANK             0
INTELLIGENCE %                0
LONGEVITY(YEARS)              0
NUMBER OF GENETIC AILMENTS    0
GENETIC AILMENTS              0
PURCHASE PRICE                0
FOOD COSTS PER YEAR           0
GROOMING FREQUNCY             0
SUITABILITY FOR CHILDREN      0
dtype: int64

## Performing Little Bit of Data Cleaning over here

In [8]:
data['$LIFETIME COST']=data['$LIFETIME COST'].str.replace("$","")
data['$LIFETIME COST']=data['$LIFETIME COST'].str.replace(",","")

data['PURCHASE PRICE']=data['PURCHASE PRICE'].str.replace("$","")
data['PURCHASE PRICE']=data['PURCHASE PRICE'].str.replace(",","")

data['FOOD COSTS PER YEAR']=data['FOOD COSTS PER YEAR'].str.replace("$","")
data['FOOD COSTS PER YEAR']=data['FOOD COSTS PER YEAR'].str.replace(",","")

data['INTELLIGENCE %']=data['INTELLIGENCE %'].str.replace("%","")

In [9]:
data.head(1)

Unnamed: 0,Breed,type,score,popularity ranking,size,intelligence,congential ailments,score for kids,size.1,$LIFETIME COST,INTELLIGENCE RANK,INTELLIGENCE %,LONGEVITY(YEARS),NUMBER OF GENETIC AILMENTS,GENETIC AILMENTS,PURCHASE PRICE,FOOD COSTS PER YEAR,GROOMING FREQUNCY,SUITABILITY FOR CHILDREN
0,Border Terrier,terrier,3.61,61,1,Above average,none,4.99,small,22638,30,70,14.0,0,none,833,324,Once a week,1


## Filtering all the numerical features over here

In [10]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!='O']
for feature in numerical_features:
  print(feature)

score
popularity ranking
size
score for kids
INTELLIGENCE RANK
LONGEVITY(YEARS)
NUMBER OF GENETIC AILMENTS
SUITABILITY FOR CHILDREN


In [11]:
data[numerical_features]

Unnamed: 0,score,popularity ranking,size,score for kids,INTELLIGENCE RANK,LONGEVITY(YEARS),NUMBER OF GENETIC AILMENTS,SUITABILITY FOR CHILDREN
0,3.61,61,1,4.99,30,14.00,0,1
1,3.53,48,1,4.91,35,13.84,2,1
2,3.22,16,2,4.72,45,12.58,0,1
3,3.34,81,2,4.71,31,12.49,1,1
4,3.33,51,2,4.70,18,11.66,0,1
...,...,...,...,...,...,...,...,...
82,1.82,47,3,2.57,50,10.67,2,2
83,1.66,42,3,2.54,75,6.75,2,2
84,1.76,54,2,2.51,77,9.01,2,2
85,1.95,41,3,2.33,54,10.16,1,3


## Filtering all the categorical features over here

In [12]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

Breed
type
intelligence
congential ailments
size.1
$LIFETIME COST
INTELLIGENCE %
GENETIC AILMENTS
PURCHASE PRICE
FOOD COSTS PER YEAR
GROOMING FREQUNCY


In [13]:
data[cat_features]

Unnamed: 0,Breed,type,intelligence,congential ailments,size.1,$LIFETIME COST,INTELLIGENCE %,GENETIC AILMENTS,PURCHASE PRICE,FOOD COSTS PER YEAR,GROOMING FREQUNCY
0,Border Terrier,terrier,Above average,none,small,22638,70,none,833,324,Once a week
1,Cairn Terrier,terrier,Above average,"'lion jaw', heart problems",small,21992,61,"'lion jaw', heart problems",435,324,Once a week
2,Siberian Husky,working,Average,none,medium,22049,45,none,650,466,Once in a few weeks
3,Welsh Springer Spaniel,sporting,Above average,hip problems,medium,20224,69,hip problems,750,324,Once a week
4,English Cocker Spaniel,sporting,Excellent,none,medium,18993,82,none,800,324,Once a week
...,...,...,...,...,...,...,...,...,...,...,...
82,Alaskan Malamute,working,Average,"hip problems, dwarfism",large,21986,36,"hip problems, dwarfism",1210,710,Daily
83,Bloodhound,hound,Lowest,"fatal stomach bloat, skin problems",large,13824,7,"fatal stomach bloat, skin problems",608,710,Once a week
84,Chow Chow,non-sporting,Lowest,"eye, hip problems",medium,15898,5,"eye, hip problems",515,466,Daily
85,Akita,working,Average,hip problems,large,20994,31,hip problems,1202,710,Once a week


## Encoding all the categorical features into numerical features over here

In [14]:
for feature in cat_features:
  feature_mapping={category:index for index,category in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [15]:
data

Unnamed: 0,Breed,type,score,popularity ranking,size,intelligence,congential ailments,score for kids,size.1,$LIFETIME COST,INTELLIGENCE RANK,INTELLIGENCE %,LONGEVITY(YEARS),NUMBER OF GENETIC AILMENTS,GENETIC AILMENTS,PURCHASE PRICE,FOOD COSTS PER YEAR,GROOMING FREQUNCY,SUITABILITY FOR CHILDREN
0,0,0,3.61,61,1,0,0,4.99,0,0,30,0,14.00,0,0,0,0,0,1
1,1,0,3.53,48,1,0,1,4.91,0,1,35,1,13.84,2,1,1,0,0,1
2,2,1,3.22,16,2,1,0,4.72,1,2,45,2,12.58,0,0,2,1,1,1
3,3,2,3.34,81,2,0,2,4.71,1,3,31,3,12.49,1,2,3,0,0,1
4,4,2,3.33,51,2,2,0,4.70,1,4,18,4,11.66,0,0,4,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,82,1,1.82,47,3,1,44,2.57,2,82,50,63,10.67,2,47,72,5,2,2
83,83,6,1.66,42,3,5,45,2.54,2,83,75,64,6.75,2,48,73,5,0,2
84,84,3,1.76,54,2,5,37,2.51,1,84,77,65,9.01,2,40,74,1,2,2
85,85,1,1.95,41,3,1,2,2.33,2,85,54,27,10.16,1,2,75,5,0,3


In [16]:
data['LONGEVITY(Yrs)']=data['LONGEVITY(YEARS)']

In [17]:
data.drop("LONGEVITY(YEARS)",axis=1,inplace=True)

## Creating the features and labels over here

In [18]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing set to avoid the problem of overfitting over here

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training set over here  

In [34]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso

In [53]:
# from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor
regressor=Ridge()
regressor.fit(X_train,y_train)

In [54]:
y_pred=regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[13.18 12.58]
 [12.28 14.42]
 [13.69 12.63]
 [10.04 10.34]
 [ 9.76 10.  ]
 [12.96 12.87]
 [12.48 12.28]
 [10.05 10.  ]
 [10.55 10.02]
 [ 8.16  6.29]
 [12.5  12.54]
 [12.24 11.81]
 [12.96 13.51]
 [ 8.61  8.81]
 [13.41 12.53]
 [11.59 11.63]
 [11.67 11.  ]
 [11.01 10.1 ]]


## Evaluating the performance of the model on the testing dataset over here

In [55]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.8016905037245073