## Importing the essential libraries

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Show every column and do not wrap wide frames when displaying DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# Render floats with six decimals and missing values as "NaN".
# (A plain '{:.6f}'.format formatter was previously set first and then
# immediately overwritten by this one, so only this setting is kept.)
pd.set_option('display.float_format', lambda x: "{:.6f}".format(x) if not pd.isna(x) else "NaN")

## Listing the datasets available in seaborn

In [19]:
# Ask seaborn for the names of its example datasets and print the list.
available_datasets = sns.get_dataset_names()
print(available_datasets)

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']


In [20]:
# Load seaborn's "dots" example dataset; every later cell works on `data`.
data = sns.load_dataset(name="dots")

In [21]:
# Display the loaded frame to inspect its columns and a sample of its rows.
data

Unnamed: 0,align,choice,time,coherence,firing_rate
0,dots,T1,-80,0.000000,33.189967
1,dots,T1,-80,3.200000,31.691726
2,dots,T1,-80,6.400000,34.279840
3,dots,T1,-80,12.800000,32.631874
4,dots,T1,-80,25.600000,35.060487
...,...,...,...,...,...
843,sacc,T2,300,3.200000,33.281734
844,sacc,T2,300,6.400000,27.583979
845,sacc,T2,300,12.800000,28.511530
846,sacc,T2,300,25.600000,27.009804


## Checking for missing values

In [22]:
# Count the missing entries in each column (`isna` is the same check as `isnull`).
data.isna().sum()

align          0
choice         0
time           0
coherence      0
firing_rate    0
dtype: int64

In [24]:
# Collect every column that contains at least one missing value.
# `.any()` is used instead of a `> 1` count threshold, which would silently
# skip a column holding exactly one missing entry.
missing_values=[feature for feature in data.columns if data[feature].isnull().any()]
for feature in missing_values:
  print(feature)

## Selecting the numerical features

In [25]:
# Collect the columns whose dtype is numeric.
# An explicit numeric check is used rather than `dtype != "O"`, because the
# latter also matches non-object, non-numeric dtypes (pandas string dtype,
# category, datetime) and would list them as numerical.
numerical_features=[feature for feature in data.columns if pd.api.types.is_numeric_dtype(data[feature])]
for feature in numerical_features:
  print(feature)

time
coherence
firing_rate


In [26]:
# Display only the numerical columns selected above.
data.loc[:, numerical_features]

Unnamed: 0,time,coherence,firing_rate
0,-80,0.000000,33.189967
1,-80,3.200000,31.691726
2,-80,6.400000,34.279840
3,-80,12.800000,32.631874
4,-80,25.600000,35.060487
...,...,...,...
843,300,3.200000,33.281734
844,300,6.400000,27.583979
845,300,12.800000,28.511530
846,300,25.600000,27.009804


## Selecting the categorical features

In [28]:
# Collect the columns whose dtype is not numeric; these are treated as categorical.
# Testing for "not numeric" rather than `dtype == "O"` also catches label
# columns stored with the pandas string dtype or as `category`, which the
# object-dtype comparison would miss.
cat_features=[feature for feature in data.columns if not pd.api.types.is_numeric_dtype(data[feature])]
for feature in cat_features:
  print(feature)

align
choice


In [29]:
# Display only the categorical columns selected above.
data.loc[:, cat_features]

Unnamed: 0,align,choice
0,dots,T1
1,dots,T1
2,dots,T1
3,dots,T1
4,dots,T1
...,...,...
843,sacc,T2
844,sacc,T2
845,sacc,T2
846,sacc,T2


## Encoding the categorical features

In [30]:
# Count how often each distinct value occurs in the `align` column.
align_counts = data["align"].value_counts()
align_counts

align
sacc    454
dots    394
Name: count, dtype: int64

In [31]:
# Encode the two `align` labels as integers.
align_mapping={"sacc":0,"dots":1}
# Only encode while the column still holds the original labels. Without this
# guard, re-running the cell would look up the already-encoded 0/1 values in
# the mapping and replace the whole column with NaN.
if not pd.api.types.is_numeric_dtype(data['align']):
  data['align']=data['align'].map(align_mapping)

In [32]:
# Count how often each distinct value occurs in the `choice` column.
choice_counts = data["choice"].value_counts()
choice_counts

choice
T2    430
T1    418
Name: count, dtype: int64

In [33]:
# Encode the two `choice` labels as integers.
choice_mapping={"T1":0,"T2":1}
# Only encode while the column still holds the original labels. Without this
# guard, re-running the cell would look up the already-encoded 0/1 values in
# the mapping and replace the whole column with NaN.
if not pd.api.types.is_numeric_dtype(data['choice']):
  data['choice']=data['choice'].map(choice_mapping)

In [34]:
# Display the frame again to confirm `align` and `choice` now hold integer codes.
data

Unnamed: 0,align,choice,time,coherence,firing_rate
0,1,0,-80,0.000000,33.189967
1,1,0,-80,3.200000,31.691726
2,1,0,-80,6.400000,34.279840
3,1,0,-80,12.800000,32.631874
4,1,0,-80,25.600000,35.060487
...,...,...,...,...,...
843,0,1,300,3.200000,33.281734
844,0,1,300,6.400000,27.583979
845,0,1,300,12.800000,28.511530
846,0,1,300,25.600000,27.009804


## Creating the features and labels

In [35]:
# Every column except the last one is a feature; the last column is the target.
feature_frame, target_series = data.iloc[:, :-1], data.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

## Splitting the dataset into training and test sets so the model can be evaluated on data it has not seen

In [40]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the model on the training dataset

In [43]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso

In [41]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so that repeated runs build the same trees and report the
# same score; the train/test split is seeded with random_state=0 as well.
regressor=RandomForestRegressor(random_state=0)
# Fit on the training portion only; the test portion is kept for evaluation.
regressor.fit(X_train,y_train)

In [44]:
# Predict the target for the held-out rows.
y_pred = regressor.predict(X_test)
# Print arrays with two decimals (this changes numpy's global print options).
np.set_printoptions(precision=2)
# Column 0 holds the actual values, column 1 the predictions, one row per sample.
actual_vs_predicted = np.column_stack((y_test, y_pred))
print(actual_vs_predicted)

[[34.71 33.85]
 [41.56 44.18]
 [34.72 35.81]
 [45.23 46.67]
 [61.36 59.32]
 [31.55 32.98]
 [51.97 50.56]
 [31.27 30.78]
 [67.65 67.82]
 [27.2  27.59]
 [42.76 42.64]
 [66.24 67.41]
 [45.36 46.76]
 [53.89 55.99]
 [52.81 49.35]
 [33.88 35.53]
 [36.19 35.52]
 [41.91 40.26]
 [41.84 41.56]
 [62.69 61.18]
 [24.64 24.15]
 [34.84 35.01]
 [49.89 48.96]
 [33.46 33.03]
 [47.9  48.58]
 [33.48 34.64]
 [46.48 45.57]
 [29.95 28.72]
 [51.27 54.2 ]
 [40.44 40.82]
 [30.75 31.94]
 [50.01 50.69]
 [11.47 10.52]
 [43.29 41.94]
 [57.27 55.2 ]
 [27.58 30.51]
 [30.57 31.27]
 [38.24 37.49]
 [38.21 37.84]
 [40.41 39.59]
 [36.03 37.84]
 [31.36 32.45]
 [38.32 37.28]
 [34.41 35.22]
 [45.43 44.68]
 [48.98 47.3 ]
 [29.17 29.68]
 [35.7  36.36]
 [27.54 28.56]
 [41.26 39.11]
 [55.63 55.5 ]
 [38.13 37.27]
 [36.53 36.19]
 [30.7  31.75]
 [34.96 35.39]
 [47.37 47.46]
 [56.6  56.21]
 [43.07 43.37]
 [40.3  39.45]
 [29.64 29.17]
 [44.47 44.46]
 [33.33 33.01]
 [28.86 28.31]
 [36.37 35.34]
 [33.28 31.99]
 [62.8  60.03]
 [25.9  23

In [45]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.9876923628761693