## Importing the essential libraries over here

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
print(sns.get_dataset_names())

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']


## Importing the dataset over here

In [5]:
data=sns.load_dataset("anagrams")

In [6]:
data.head()

Unnamed: 0,subidr,attnr,num1,num2,num3
0,1,divided,2,4.0,7
1,2,divided,3,4.0,5
2,3,divided,3,5.0,6
3,4,divided,5,7.0,5
4,5,divided,4,5.0,8


## Taking care of duplicate observations over here

In [7]:
data.duplicated().sum()

0

## Taking care of missing values over here


In [8]:
data.isnull().sum()

subidr    0
attnr     0
num1      0
num2      0
num3      0
dtype: int64

In [10]:
numerical_features=[feature for feature in data.columns if data[feature].dtype!="O"]
for feature in numerical_features:
  print(feature)

subidr
num1
num2
num3


In [11]:
data[numerical_features]

Unnamed: 0,subidr,num1,num2,num3
0,1,2,4.0,7
1,2,3,4.0,5
2,3,3,5.0,6
3,4,5,7.0,5
4,5,4,5.0,8
5,6,5,5.0,6
6,7,5,4.5,6
7,8,5,7.0,8
8,9,2,3.0,7
9,10,6,5.0,6


In [12]:
cat_features=[feature for feature in data.columns if data[feature].dtype=="O"]
for feature in cat_features:
  print(feature)

attnr


In [13]:
data[cat_features]

Unnamed: 0,attnr
0,divided
1,divided
2,divided
3,divided
4,divided
5,divided
6,divided
7,divided
8,divided
9,divided


## Encoding the categorical features over here

In [15]:
for feature in cat_features:
  feature_mapping={value:index for index,value in enumerate(data[feature].unique())}
  data[feature]=data[feature].map(feature_mapping)

In [16]:
data

Unnamed: 0,subidr,attnr,num1,num2,num3
0,1,0,2,4.0,7
1,2,0,3,4.0,5
2,3,0,3,5.0,6
3,4,0,5,7.0,5
4,5,0,4,5.0,8
5,6,0,5,5.0,6
6,7,0,5,4.5,6
7,8,0,5,7.0,8
8,9,0,2,3.0,7
9,10,0,6,5.0,6


In [17]:
data['attnr'].value_counts()

attnr
0    10
1    10
Name: count, dtype: int64

## Creating the features and labels over here

In [18]:
data['ATTR']=data['attnr']
data.drop('attnr',axis=1,inplace=True)

In [19]:
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

## Splitting the dataset into training set and testing set over here to avoid the problem of overfitting over here

In [20]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

## Training the model on the training dataset over here

In [21]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=0)
classifier.fit(X_train,y_train)

In [22]:
y_pred=classifier.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[[1 1]
 [0 0]
 [1 1]
 [0 0]]


In [23]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

1.0