## Importing the essential libraries over here

In [24]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

## Listing the names of the datasets available in seaborn

In [3]:
# Print the names of the example datasets that seaborn reports as available.
print(sns.get_dataset_names())

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']


## Loading the "healthexp" dataset

In [4]:
# Load seaborn's "healthexp" example dataset into `data`.
data=sns.load_dataset("healthexp")

In [5]:
# Display the loaded frame (the rendered output is truncated to its first and last rows).
data

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,1970,Germany,252.311,70.6
1,1970,France,192.143,72.2
2,1970,Great Britain,123.993,71.9
3,1970,Japan,150.437,72.0
4,1970,USA,326.961,70.9
...,...,...,...,...
269,2020,Germany,6938.983,81.1
270,2020,France,5468.418,82.3
271,2020,Great Britain,5018.700,80.4
272,2020,Japan,4665.641,84.7


## Checking whether the dataset contains any missing values

In [6]:
# Count the missing values in each column.
data.isnull().sum()

Year               0
Country            0
Spending_USD       0
Life_Expectancy    0
dtype: int64

## Identifying the numerical features in the dataset

In [7]:
# Collect the numeric columns by asking pandas whether each column's dtype is
# numeric. A plain `dtype != "O"` comparison would also accept every other
# non-object dtype (datetimes, categoricals, pandas' dedicated string dtype).
# Note: pandas counts boolean columns as numeric here.
numerical_features = [
    feature for feature in data.columns
    if pd.api.types.is_numeric_dtype(data[feature])
]
for feature in numerical_features:
    print(feature)

Year
Spending_USD
Life_Expectancy


In [8]:
# Display only the columns listed in `numerical_features`.
data[numerical_features]

Unnamed: 0,Year,Spending_USD,Life_Expectancy
0,1970,252.311,70.6
1,1970,192.143,72.2
2,1970,123.993,71.9
3,1970,150.437,72.0
4,1970,326.961,70.9
...,...,...,...
269,2020,6938.983,81.1
270,2020,5468.418,82.3
271,2020,5018.700,80.4
272,2020,4665.641,84.7


## Identifying the categorical features in the dataset

In [9]:
# Collect the categorical / text columns. Testing for string-like or categorical
# dtypes (rather than `dtype == "O"` only) still matches object columns and also
# covers columns stored with pandas' dedicated string dtype or as categoricals.
cat_features = [
    feature for feature in data.columns
    if pd.api.types.is_string_dtype(data[feature].dtype)
    or isinstance(data[feature].dtype, pd.CategoricalDtype)
]
for feature in cat_features:
    print(feature)

Country


In [10]:
# Count how many rows each country contributes.
data['Country'].value_counts()

Country
Japan            51
USA              51
Germany          50
Canada           44
Great Britain    43
France           35
Name: count, dtype: int64

## Encoding the categorical features to numerical features over here

In [11]:
# Assign each country an integer code equal to its position in the list below,
# then replace the country names in the frame with those codes.
country_mapping = {
    country: code
    for code, country in enumerate(
        ["Japan", "USA", "Germany", "Canada", "Great Britain", "France"]
    )
}
data["Country"] = data["Country"].map(country_mapping)

In [14]:
# Map every year from 1970 through 2020 to its offset from 1970 (0 through 50),
# then replace the calendar years in the frame with those offsets.
year_mapping = {year: year - 1970 for year in range(1970, 2021)}
data["Year"] = data["Year"].map(year_mapping)


In [15]:
# Display the frame again to inspect its current contents.
data

Unnamed: 0,Year,Country,Spending_USD,Life_Expectancy
0,0,2,252.311,70.6
1,0,5,192.143,72.2
2,0,4,123.993,71.9
3,0,0,150.437,72.0
4,0,1,326.961,70.9
...,...,...,...,...
269,50,2,6938.983,81.1
270,50,5,5468.418,82.3
271,50,4,5018.700,80.4
272,50,0,4665.641,84.7


## Creating the features and labels over here

In [18]:
# Features: every column except the target, as a NumPy array.
X = data.drop(columns=["Spending_USD"]).to_numpy()
# Label: the Spending_USD column, as a NumPy array.
y = data["Spending_USD"].to_numpy()

## Splitting the dataset into a training set and a testing set so the model can be evaluated on data it has not seen

In [19]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible. `train_test_split` is imported together with the other
# libraries in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## Training the model over here

In [25]:
# Create an XGBoost regressor with its default settings and fit it on the
# training split.
regressor=XGBRegressor()
regressor.fit(X_train,y_train)

## Evaluating the Performance of the model on the testing dataset over here

In [28]:
# Predict on the test split, then print the actual and predicted values side by
# side (column 0: actual, column 1: predicted), using two decimals.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_test, y_pred)))


[[4336.25 4382.96]
 [4554.28 4623.11]
 [3880.84 3696.2 ]
 [3286.56 3290.07]
 [3429.95 3091.13]
 [3239.77 3196.41]
 [1699.77 1706.57]
 [8519.62 9355.3 ]
 [ 501.92  530.37]
 [2799.2  3010.55]
 [7879.25 7922.99]
 [5669.06 5704.68]
 [3169.19 3657.53]
 [2169.45 2225.67]
 [ 842.8  1103.52]
 [ 439.3   356.75]
 [ 163.85  198.64]
 [1847.79 1864.21]
 [1196.56 1263.74]
 [3291.91 3270.02]
 [ 433.96  449.24]
 [1420.27 1430.49]
 [4045.07 4100.6 ]
 [2758.07 2779.54]
 [1212.85 1198.95]
 [ 134.17  182.98]
 [ 782.61 1056.31]
 [8925.88 4641.65]
 [5726.54 3843.97]
 [ 688.05  740.53]
 [2431.3  2376.72]
 [1984.94 1972.87]
 [ 930.7   996.4 ]
 [5151.71 4657.14]
 [1847.77 1729.53]
 [2566.   2575.74]
 [4536.81 4572.4 ]
 [2195.39 2279.71]
 [1298.56 1404.94]
 [4189.71 4538.28]
 [2019.31 2083.63]
 [4951.68 4586.07]
 [3391.52 2950.35]
 [1930.89 1879.84]
 [ 647.35  634.86]
 [4428.75 4499.8 ]
 [3750.79 3779.25]
 [3486.62 3517.13]
 [4745.55 4673.2 ]
 [ 791.81  767.74]
 [5828.32 5540.91]
 [2200.47 2170.95]
 [ 363.61  3

## Checking the R² (coefficient of determination) score on the testing set

In [29]:
# R² of the predictions against the test-set targets. `r2_score` is already
# imported in the notebook's import cell, so it is not imported again here.
r2_score(y_test, y_pred)

0.8979526682042718