## Importing the essential libraries

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso

## Listing the names of all datasets available in the seaborn library

In [2]:
# Show which example datasets seaborn can load by name.
dataset_names = sns.get_dataset_names()
print(dataset_names)

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']


## Loading the dataset

In [3]:
# Load seaborn's "fmri" example dataset into a DataFrame.
data = sns.load_dataset(name="fmri")

In [4]:
# Display the freshly loaded frame (Jupyter renders the cell's last expression).
data

Unnamed: 0,subject,timepoint,event,region,signal
0,s13,18,stim,parietal,-0.017552
1,s5,14,stim,parietal,-0.080883
2,s12,18,stim,parietal,-0.081033
3,s11,18,stim,parietal,-0.046134
4,s10,18,stim,parietal,-0.037970
...,...,...,...,...,...
1059,s0,8,cue,frontal,0.018165
1060,s13,7,cue,frontal,-0.029130
1061,s12,7,cue,frontal,-0.004939
1062,s11,7,cue,frontal,-0.025367


## Checking for missing values

In [5]:
# Number of missing entries per column.
data.isna().sum()

subject      0
timepoint    0
event        0
region       0
signal       0
dtype: int64

In [6]:
# List every column that contains at least one missing value.
# A column with exactly one NaN must be reported too, so the test is "any missing",
# not "more than one missing".
missing_values = data.columns[data.isnull().any()].tolist()
for feature in missing_values:
    print(feature)

## Selecting the numerical features

In [7]:
# Select the numeric columns by dtype rather than by "anything that is not object":
# a `!= "O"` comparison would also label string-, category- or datetime-typed
# columns as numerical.
numerical_features = data.select_dtypes(include="number").columns.tolist()
for feature in numerical_features:
    print(feature)

timepoint
signal


In [8]:
# Show only the numerical columns.
data.loc[:, numerical_features]

Unnamed: 0,timepoint,signal
0,18,-0.017552
1,14,-0.080883
2,18,-0.081033
3,18,-0.046134
4,18,-0.037970
...,...,...
1059,8,0.018165
1060,7,-0.029130
1061,7,-0.004939
1062,7,-0.025367


## Selecting the categorical features

In [9]:
# Collect the text-like columns. Comparing the dtype against "O" only catches
# object columns, so columns stored with a pandas string dtype or as `category`
# would be missed; test for those dtype kinds explicitly as well.
cat_features = [
    feature
    for feature in data.columns
    if pd.api.types.is_object_dtype(data[feature])
    or pd.api.types.is_string_dtype(data[feature])
    or isinstance(data[feature].dtype, pd.CategoricalDtype)
]
for feature in cat_features:
    print(feature)

subject
event
region


In [10]:
# Show only the categorical columns.
data.loc[:, cat_features]

Unnamed: 0,subject,event,region
0,s13,stim,parietal
1,s5,stim,parietal
2,s12,stim,parietal
3,s11,stim,parietal
4,s10,stim,parietal
...,...,...,...
1059,s0,cue,frontal
1060,s13,cue,frontal
1061,s12,cue,frontal
1062,s11,cue,frontal


## Encoding the categorical features

In [11]:
# Count how many rows each subject label has.
subject_column = data['subject']
subject_column.value_counts()

subject
s13    76
s5     76
s12    76
s11    76
s10    76
s9     76
s8     76
s7     76
s6     76
s4     76
s3     76
s2     76
s1     76
s0     76
Name: count, dtype: int64

In [12]:
# Encode the subject labels "s0" ... "s13" as the integers 0 ... 13.
subject_mapping = {f"s{index}": index for index in range(14)}
subject_codes = data['subject'].map(subject_mapping)
# Series.map turns every value that is not a key of the mapping into NaN. Fail loudly
# instead of silently storing NaNs; this also catches re-running the cell after the
# column has already been encoded.
if subject_codes.isna().any():
    raise ValueError("column 'subject' contains values that are not keys of subject_mapping")
data['subject'] = subject_codes

In [13]:
# Count how many rows each event label has.
event_column = data['event']
event_column.value_counts()

event
stim    532
cue     532
Name: count, dtype: int64

In [14]:
# Encode the two event labels as 0 / 1.
event_mapping = {"stim": 0, "cue": 1}
event_codes = data['event'].map(event_mapping)
# Series.map turns every value that is not a key of the mapping into NaN. Fail loudly
# instead of silently storing NaNs; this also catches re-running the cell after the
# column has already been encoded.
if event_codes.isna().any():
    raise ValueError("column 'event' contains values that are not keys of event_mapping")
data['event'] = event_codes

In [15]:
# Count how many rows each region label has.
region_column = data['region']
region_column.value_counts()

region
parietal    532
frontal     532
Name: count, dtype: int64

In [16]:
# Encode the two region labels as 0 / 1.
region_mapping = {"parietal": 0, "frontal": 1}
region_codes = data['region'].map(region_mapping)
# Series.map turns every value that is not a key of the mapping into NaN. Fail loudly
# instead of silently storing NaNs; this also catches re-running the cell after the
# column has already been encoded.
if region_codes.isna().any():
    raise ValueError("column 'region' contains values that are not keys of region_mapping")
data['region'] = region_codes

In [17]:
# Display the frame again now that subject, event and region have been mapped to integers.
data

Unnamed: 0,subject,timepoint,event,region,signal
0,13,18,0,0,-0.017552
1,5,14,0,0,-0.080883
2,12,18,0,0,-0.081033
3,11,18,0,0,-0.046134
4,10,18,0,0,-0.037970
...,...,...,...,...,...
1059,0,8,1,1,0.018165
1060,13,7,1,1,-0.029130
1061,12,7,1,1,-0.004939
1062,11,7,1,1,-0.025367


## Creating the features and labels

In [18]:
# Features: every column except the last one. Label: the last column.
feature_frame = data.iloc[:, :-1]
label_series = data.iloc[:, -1]
X = feature_frame.values
y = label_series.values

## Splitting the dataset into training and testing sets

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Training the model on the training set

In [20]:
# Fit an XGBoost regressor with its default hyper-parameters on the training split.
regressor = XGBRegressor()
regressor.fit(X=X_train, y=y_train)

In [21]:
# Predict on the held-out rows and print the actual values (left column)
# next to the predictions (right column).
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
side_by_side = np.column_stack((y_test, y_pred))
print(side_by_side)

[[ 2.21e-01  2.40e-01]
 [-1.25e-02 -1.19e-02]
 [ 1.12e-02  4.48e-03]
 [-3.93e-02 -2.58e-02]
 [-2.18e-02 -6.13e-03]
 [-2.24e-02 -4.86e-02]
 [-3.57e-02 -2.40e-02]
 [ 1.00e-02 -8.37e-03]
 [-5.00e-02 -3.82e-02]
 [ 9.33e-02  1.51e-01]
 [-1.22e-02 -1.51e-03]
 [-1.19e-01 -1.29e-01]
 [-3.80e-02 -6.25e-02]
 [-4.70e-02 -3.99e-02]
 [-1.01e-02  4.70e-02]
 [-4.96e-02 -7.37e-02]
 [-2.57e-02  3.67e-02]
 [ 8.54e-02  7.30e-02]
 [ 3.41e-02  4.75e-02]
 [-5.38e-02  1.51e-02]
 [-7.48e-02 -6.50e-02]
 [ 1.23e-02  1.17e-02]
 [ 2.04e-02  1.53e-02]
 [ 8.48e-04 -1.19e-02]
 [ 1.03e-02  1.13e-02]
 [-3.17e-02 -2.41e-02]
 [ 2.71e-01  2.55e-01]
 [-1.44e-02  1.03e-02]
 [ 4.06e-02  8.93e-02]
 [-1.53e-01 -1.12e-01]
 [-7.10e-02 -3.54e-02]
 [ 1.20e-01  1.22e-01]
 [ 1.90e-02  4.95e-03]
 [-2.55e-01 -2.07e-01]
 [ 2.43e-02  2.13e-02]
 [ 9.22e-02  1.02e-01]
 [-5.68e-02 -6.85e-02]
 [-8.09e-02 -8.11e-02]
 [-3.64e-02 -4.82e-02]
 [ 2.67e-02  2.61e-02]
 [-2.75e-02 -1.53e-02]
 [ 2.34e-02  7.27e-03]
 [ 6.40e-02  8.54e-03]
 [-1.37e-02

## Calculating the absolute difference between the actual and predicted values

In [22]:
# Put the true and predicted test-set values side by side in one frame.
comparison_columns = dict(Actual=y_test, Predicted=y_pred)
actual_vs_predicted = pd.DataFrame(comparison_columns)
print(actual_vs_predicted)

       Actual  Predicted
0    0.221493   0.239852
1   -0.012496  -0.011914
2    0.011241   0.004483
3   -0.039327  -0.025820
4   -0.021824  -0.006127
..        ...        ...
208 -0.026435  -0.014181
209  0.085000   0.080965
210 -0.007092  -0.022349
211 -0.064510  -0.046883
212 -0.075484  -0.083722

[213 rows x 2 columns]


In [23]:
# Per-row absolute error: |Actual - Predicted|.
prediction_error = actual_vs_predicted['Actual'] - actual_vs_predicted['Predicted']
actual_vs_predicted['Absolute Difference'] = prediction_error.abs()

In [24]:
# Display the comparison frame, now including the 'Absolute Difference' column.
actual_vs_predicted

Unnamed: 0,Actual,Predicted,Absolute Difference
0,0.221493,0.239852,0.018359
1,-0.012496,-0.011914,0.000583
2,0.011241,0.004483,0.006759
3,-0.039327,-0.025820,0.013507
4,-0.021824,-0.006127,0.015697
...,...,...,...
208,-0.026435,-0.014181,0.012254
209,0.085000,0.080965,0.004034
210,-0.007092,-0.022349,0.015257
211,-0.064510,-0.046883,0.017627


## Evaluating the performance on the testing set using the R² (R-squared) score

In [25]:
from sklearn.metrics import r2_score

# R² of the predictions against the true test-set values.
r2_score(y_true=y_test, y_pred=y_pred)

0.8798547517575547