## Importing the essential libraries over here

In [30]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


## Checking the datasets available in the seaborn library

In [31]:
# Print the names of the example datasets that seaborn knows how to load.
print(sns.get_dataset_names())

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']


## Importing the dataset over here

In [32]:
# Load seaborn's "glue" example dataset into `data`.
data=sns.load_dataset("glue")

In [33]:
# Preview the first two rows of the dataset.
data.head(2)

Unnamed: 0,Model,Year,Encoder,Task,Score
0,ERNIE,2019,Transformer,CoLA,75.5
1,T5,2019,Transformer,CoLA,71.6


## Checking for missing values

In [34]:
# Count the missing values in each column.
data.isnull().sum()

Model      0
Year       0
Encoder    0
Task       0
Score      0
dtype: int64

In [35]:
# Collect every column that contains at least one missing value and print its name.
# The test is "any null" rather than "more than one null", so a column with
# exactly one missing entry is reported too.
missing_values = [feature for feature in data.columns if data[feature].isnull().any()]
for feature in missing_values:
  print(feature)

## Filtering all the numerical features over here

In [36]:
# A column counts as numerical when pandas reports it as neither object-typed
# nor string-typed. Using the pandas type predicates instead of comparing the
# dtype to "O" keeps text columns out of this list even when they are stored
# with pandas' dedicated string dtype rather than as generic objects.
numerical_features = [
    feature
    for feature in data.columns
    if not (
        pd.api.types.is_object_dtype(data[feature])
        or pd.api.types.is_string_dtype(data[feature])
    )
]
for feature in numerical_features:
  print(feature)

Year
Score


In [37]:
# Preview the first two rows of the numerical columns only.
data[numerical_features].head(2)

Unnamed: 0,Year,Score
0,2019,75.5
1,2019,71.6


## Filtering all the categorical features over here

In [38]:
# A column counts as categorical when pandas reports it as object-typed or
# string-typed. Using the pandas type predicates instead of comparing the
# dtype to "O" still finds text columns when they are stored with pandas'
# dedicated string dtype rather than as generic objects.
cat_features = [
    feature
    for feature in data.columns
    if pd.api.types.is_object_dtype(data[feature])
    or pd.api.types.is_string_dtype(data[feature])
]
for feature in cat_features:
  print(feature)

Model
Encoder
Task


In [39]:
# Preview the first two rows of the categorical columns only.
data[cat_features].head(2)

Unnamed: 0,Model,Encoder,Task
0,ERNIE,Transformer,CoLA
1,T5,Transformer,CoLA


## Encoding the categorical features into numerical representations over here

In [40]:
# Count how many rows belong to each distinct value of 'Model'.
data['Model'].value_counts()

Model
ERNIE          8
T5             8
RoBERTa        8
BERT           8
BiLSTM+ELMo    8
BiLSTM+CoVe    8
BiLSTM+Attn    8
BiLSTM         8
Name: count, dtype: int64

In [41]:
# Integer code for every model name; codes are assigned in the order the
# names are listed here (0 for the first name, 7 for the last).
model_mapping = {
    name: code
    for code, name in enumerate([
        "ERNIE",
        "T5",
        "RoBERTa",
        "BERT",
        "BiLSTM+ELMo",
        "BiLSTM+CoVe",
        "BiLSTM+Attn",
        "BiLSTM",
    ])
}

In [42]:
# Replace the model names with their integer codes.
# `.map` yields NaN for any value that is not a key of the mapping. That also
# happens when this cell is executed a second time, because the column then
# already holds integers. Check before assigning so the column is never
# silently overwritten with NaN.
model_codes = data['Model'].map(model_mapping)
if (model_codes.isnull() & data['Model'].notnull()).any():
    raise ValueError(
        "'Model' contains values that are not keys of model_mapping; "
        "reload the dataset before re-running this cell"
    )
data['Model'] = model_codes

In [43]:
# Count how many rows belong to each distinct value of 'Encoder'.
data['Encoder'].value_counts()

Encoder
Transformer    32
LSTM           32
Name: count, dtype: int64

In [44]:
# Integer code for each encoder family, assigned in listing order.
encoder_mapping = {
    name: code
    for code, name in enumerate(["Transformer", "LSTM"])
}

In [45]:
# Replace the encoder names with their integer codes.
# `.map` yields NaN for any value that is not a key of the mapping. That also
# happens when this cell is executed a second time, because the column then
# already holds integers. Check before assigning so the column is never
# silently overwritten with NaN.
encoder_codes = data['Encoder'].map(encoder_mapping)
if (encoder_codes.isnull() & data['Encoder'].notnull()).any():
    raise ValueError(
        "'Encoder' contains values that are not keys of encoder_mapping; "
        "reload the dataset before re-running this cell"
    )
data['Encoder'] = encoder_codes

In [46]:
# Count how many rows belong to each distinct value of 'Task'.
data['Task'].value_counts()


Task
CoLA     8
SST-2    8
MRPC     8
STS-B    8
QQP      8
MNLI     8
QNLI     8
RTE      8
Name: count, dtype: int64

In [47]:
# Integer code for every task name; codes are assigned in the order the
# names are listed here (0 for the first name, 7 for the last).
task_mapping = {
    name: code
    for code, name in enumerate([
        "CoLA",
        "SST-2",
        "MRPC",
        "STS-B",
        "QQP",
        "MNLI",
        "QNLI",
        "RTE",
    ])
}

In [48]:
# Replace the task names with their integer codes.
# `.map` yields NaN for any value that is not a key of the mapping. That also
# happens when this cell is executed a second time, because the column then
# already holds integers. Check before assigning so the column is never
# silently overwritten with NaN.
task_codes = data['Task'].map(task_mapping)
if (task_codes.isnull() & data['Task'].notnull()).any():
    raise ValueError(
        "'Task' contains values that are not keys of task_mapping; "
        "reload the dataset before re-running this cell"
    )
data['Task'] = task_codes

In [49]:
# List the column names of the frame.
data.columns

Index(['Model', 'Year', 'Encoder', 'Task', 'Score'], dtype='object')

In [50]:
# Count how many rows belong to each distinct value of 'Year'.
data['Year'].value_counts()

Year
2019    24
2017    24
2018    16
Name: count, dtype: int64

In [51]:
# Integer code for each year, assigned in listing order.
# NOTE(review): the codes follow the listing order 2019, 2017, 2018, so they
# are not chronological (2017 -> 1, 2018 -> 2, 2019 -> 0). Confirm that
# discarding the natural ordering of the years is intended.
year_mapping = {
    year: code
    for code, year in enumerate((2019, 2017, 2018))
}

In [52]:
# Replace the years with their integer codes.
# `.map` yields NaN for any value that is not a key of the mapping. That also
# happens when this cell is executed a second time, because the column then
# holds the codes rather than the years. Check before assigning so the column
# is never silently overwritten with NaN.
year_codes = data['Year'].map(year_mapping)
if (year_codes.isnull() & data['Year'].notnull()).any():
    raise ValueError(
        "'Year' contains values that are not keys of year_mapping; "
        "reload the dataset before re-running this cell"
    )
data['Year'] = year_codes

In [53]:
# Display the frame after all categorical columns have been encoded.
data

Unnamed: 0,Model,Year,Encoder,Task,Score
0,0,0,0,0,75.5
1,1,0,0,0,71.6
2,2,0,0,0,67.8
3,3,2,0,0,60.5
4,4,2,1,0,32.1
...,...,...,...,...,...
59,3,2,0,7,70.1
60,4,2,1,7,57.4
61,5,1,1,7,52.7
62,6,1,1,7,58.4


## Creating the features and labels over here

In [54]:
# Features: every column except the last one. Label: the last column ('Score').
# Both are converted to NumPy arrays for the estimator.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

## Splitting the dataset into training set and testing set over here

In [55]:
# Hold out 20% of the rows for testing; random_state is fixed so the split is
# the same on every run. `train_test_split` is imported with the other
# libraries in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## Training the model over here

In [56]:
# Create an XGBRegressor with no arguments (library defaults) and fit it on
# the training split.
regressor=XGBRegressor()
regressor.fit(X_train,y_train)

## Evaluating the model on the test set: comparing actual vs. predicted values of the dependent variable

In [57]:
# Predict the score for every held-out row, then print actual and predicted
# values side by side (left column: actual, right column: predicted),
# rounded to two decimals in the printed output.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_test, y_pred)))

[[65.4  65.79]
 [64.4  72.23]
 [86.7  86.87]
 [52.7  56.45]
 [74.3  78.82]
 [75.1  79.07]
 [70.3  72.77]
 [92.3  91.08]
 [92.2  93.09]
 [58.4  57.84]
 [83.9  82.05]
 [67.8  60.51]
 [94.9  93.28]]


## Checking the R² (R-squared) score

In [58]:
# Coefficient of determination (R^2) of the predictions on the held-out rows.
# `r2_score` is already imported in the notebook's import cell, so it is not
# imported again here.
r2_score(y_test, y_pred)

0.92042496476556