<div style="text-align:center">
    <img src="../files/monolearn-logo.png" height="150px">
    <h1>ML course</h1>
    <h3>Session 10: Decision Tree, Random Forest</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

### Decision Tree Classification

* Node: Each object in a tree. Nodes contain subsets of data, and excluding leaf nodes, a question splits the subset.
* Parent node: The question that makes a data split.
* Child node: Resulting node. It also can be a parent for its children.
* Leaf node: Final node with no further questions. It holds only the subset of the data that matches the answers to all preceding questions.
* Branch: A unique path of questions and answers that leads to a leaf node.
* Root: The top node. 

#### Decision Tree (Example)

<img src = "../files/10/DecisionTreeExercise.jpg" width=50%>

#### Information Gain & Gini Index

* Information gain is the reduction in entropy or surprise by transforming a dataset and is often used in training decision trees. Information gain is calculated by comparing the entropy of the dataset before and after a transformation.

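* Gini index or Gini impurity measures the probability that a randomly chosen element would be wrongly classified if it were labelled at random according to the class distribution of the node. A Gini index of 0 means every element belongs to a single class (a pure node); for a two-class problem, a Gini index of 0.5 denotes elements equally distributed between the classes.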

#### Import libraries

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
# Read the ads dataset; every column except the last is used as a feature and
# the last column is the target.
df = pd.read_csv("Social_Network_Ads.csv")

features, target = df.iloc[:, :-1], df.iloc[:, -1]
X = features.to_numpy()
y = target.to_numpy()

In [None]:
df

In [None]:
y

#### Train and Test 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

#### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion tree. fit() returns the estimator itself, so construction
# and training are chained.
classifier = DecisionTreeClassifier(criterion="entropy", random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

#### Visualize the tree

In [None]:
from sklearn.tree import export_text

# Render the fitted tree's split rules as indented plain text and print them.
text_representation = export_text(decision_tree=classifier)
print(text_representation)

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree on a large canvas, with nodes colour-filled.
fig = plt.figure(figsize=(25, 20))
_ = plot_tree(decision_tree=classifier, filled=True)

#### Predicting a new result

In [None]:
# Scale the raw sample with the same scaler used for training before asking
# the classifier for a prediction.
print(classifier.predict(sc.transform(np.array([[30, 87000]]))))

#### Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Print the confusion matrix; the accuracy is left as the cell's final
# expression so the notebook displays it.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

#### Visualising the Training set results

In [None]:
from matplotlib.colors import ListedColormap

# Work in original feature units: undo the scaling for the plotted points and
# re-apply it only when asking the classifier for grid predictions.
X_set, y_set = sc.inverse_transform(X_train), y_train
cmap = ListedColormap(('red', 'green'))

# Grid over the padded feature ranges. The step is chosen per axis: the second
# axis is padded by 1000, so a 0.25 step there would produce an enormous grid
# (hundreds of millions of points if the values span tens of thousands --
# TODO confirm range). A step of 250 keeps the surface sharp at plot scale.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 10, stop=X_set[:, 0].max() + 10, step=0.25),
    np.arange(start=X_set[:, 1].min() - 1000, stop=X_set[:, 1].max() + 1000, step=250),
)
grid_points = np.column_stack((X1.ravel(), X2.ravel()))
grid_pred = classifier.predict(sc.transform(grid_points)).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha=0.75, cmap=cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# One scatter call per class. `color=` is used because a single RGBA tuple
# passed through `c=` is ambiguous to matplotlib and triggers a warning.
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color=cmap(i), label=j)
plt.title('Decision Tree Classification (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

#### Visualising the Test set results

In [None]:
from matplotlib.colors import ListedColormap

# Work in original feature units: undo the scaling for the plotted points and
# re-apply it only when asking the classifier for grid predictions.
X_set, y_set = sc.inverse_transform(X_test), y_test
cmap = ListedColormap(('red', 'green'))

# Grid over the padded feature ranges. The step is chosen per axis: the second
# axis is padded by 1000, so a 0.25 step there would produce an enormous grid
# (hundreds of millions of points if the values span tens of thousands --
# TODO confirm range). A step of 250 keeps the surface sharp at plot scale.
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 10, stop=X_set[:, 0].max() + 10, step=0.25),
    np.arange(start=X_set[:, 1].min() - 1000, stop=X_set[:, 1].max() + 1000, step=250),
)
grid_points = np.column_stack((X1.ravel(), X2.ravel()))
grid_pred = classifier.predict(sc.transform(grid_points)).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha=0.75, cmap=cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# One scatter call per class. `color=` is used because a single RGBA tuple
# passed through `c=` is ambiguous to matplotlib and triggers a warning.
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], color=cmap(i), label=j)
plt.title('Decision Tree Classification (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

<hr/>

### Decision Tree Regression

#### Import libraries

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
df = pd.read_csv("Position_Salaries.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Column 1 is the single feature, kept two-dimensional (one column) as the
# estimators expect; column 2 is the target.
X = df.iloc[:, [1]].to_numpy()
y = df.iloc[:, 2].to_numpy()

In [None]:
X

#### Train and Test 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Train on the training split, then predict for a single feature value of 6.5.
regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
y_pred = regressor.predict(np.array([[6.5]]))

In [None]:
y_pred

#### Visualising the Decision Tree Regression results (higher resolution)

In [None]:
# Dense grid of feature values so the piecewise-constant prediction shows up
# as steps. X is a 2-D array, so the builtin min()/max() would hand
# one-element arrays to np.arange (a deprecated array-to-scalar conversion);
# the array reductions return plain scalars.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

<hr/>

### Random Forest Classification

<img src = "../files/10/1_VHDtVaDPNepRglIAv72BFg.jpg" width=40%>

* The random forest is a classification algorithm consisting of many decision trees. It uses bagging and feature randomness when building each individual tree to try to create an uncorrelated forest of trees whose prediction by committee is more accurate than that of any individual tree.

<img src = "../files/10/rfc_vs_dt1.webp" width=70%>

* Bagging (Bootstrap Aggregation) — Decision trees are very sensitive to the data they are trained on — small changes to the training set can result in significantly different tree structures. Random forest takes advantage of this by allowing each individual tree to randomly sample from the dataset with replacement, resulting in different trees. This process is known as bagging.

<img src = "../files/10/featured_image.png" width=75%>

#### Import libraries

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
df = pd.read_csv("train.csv")

#### EDA

In [None]:
df.head()

In [None]:
df.info()

#### Encoding

In [None]:
df

In [None]:
# Label encoding: map each categorical text value to an integer code.
# Series.map leaves values that are not in the mapping (including missing
# values) as NaN; no imputation happens in this cell.

df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
df['Married'] = df['Married'].map({'Yes': 1, 'No': 0})
df['Education'] = df['Education'].map({'Graduate': 1, 'Not Graduate': 0})
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the chained in-place form can act on a temporary object
# and leave df unchanged (pandas warns about this pattern).
df['Dependents'] = df['Dependents'].replace('3+', 3)
df['Self_Employed'] = df['Self_Employed'].map({'Yes': 1, 'No': 0})
df['Property_Area'] = df['Property_Area'].map({'Semiurban': 1, 'Urban': 2, 'Rural': 3})
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

In [None]:
# Encoding categorical data
# Encoding the Independent Variable

# from sklearn.preprocessing import LabelEncoder

# labelencoder_X = LabelEncoder()
# for i in range(0, 5):
#     X_train[:,i] = labelencoder_X.fit_transform(X_train[:,i])

# X_train[:,10] = labelencoder_X.fit_transform(X_train[:,10])
# # Encoding the Dependent Variable
# labelencoder_y = LabelEncoder()
# y_train = labelencoder_y.fit_transform(y_train)

#### Missing Value

In [None]:
df.isnull().sum()

In [None]:
df['Gender'].value_counts()

In [None]:
# Columns that contain missing values and the statistic used to fill each one.
# A single dict keyed by np.nan cannot express this: repeated keys collapse
# to the last entry, so every column would receive the same fill value.
rev_null = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History', 'LoanAmount', 'Loan_Amount_Term']
mode_columns = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
mean_columns = ['LoanAmount', 'Loan_Amount_Term']

# Series.mode() returns a Series (there may be ties); take its first entry to
# get a scalar fill value.
fill_values = {column: df[column].mode().iloc[0] for column in mode_columns}
fill_values.update({column: df[column].mean() for column in mean_columns})

# fillna with a dict applies each value only to the column named by its key.
df[rev_null] = df[rev_null].fillna(fill_values)

In [None]:
df.isnull().sum()

#### Train and Test 

In [None]:
df

In [None]:
# Features are every column except the identifier and the target.
feature_frame = df.drop(columns=['Loan_ID', 'Loan_Status'])
X = feature_frame.to_numpy()
y = df['Loan_Status'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

#### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Entropy-criterion forest. Predictions are made for both splits so that
# training and testing accuracy can be compared afterwards.
classifier = RandomForestClassifier(criterion="entropy", random_state=42).fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_pred_train = classifier.predict(X_train)

#### Evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Compute both scores first, then report them side by side.
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training set accuracy: ", train_accuracy)
print("Testing set accuracy: ", test_accuracy)

In [None]:
classifier.feature_importances_

In [None]:
# Pair each importance score with its column name (the same columns, in the
# same order, that were used to build X) and sort ascending so the most
# important feature ends up at the top of the horizontal bar chart.
feature_names = df.drop(columns=['Loan_ID', 'Loan_Status']).columns
feature_importance = pd.DataFrame({'rfc': classifier.feature_importances_}, index=feature_names)
feature_importance.sort_values(by='rfc', ascending=True, inplace=True)

index = np.arange(len(feature_importance))
fig, ax = plt.subplots(figsize=(18, 8))
rfc_feature = ax.barh(index, feature_importance['rfc'], 0.4, color='purple', label='Random Forest')
# barh centres each bar on its y position, so the tick labels belong at
# `index` itself; an offset would shift every label away from its bar.
ax.set(yticks=index, yticklabels=feature_importance.index)

ax.legend()
plt.show()

> Random Forest is suitable for situations when we have a large dataset, and interpretability is not a major concern.

<hr/>

### Random Forest Regression

#### Import libraries

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
df = pd.read_csv("Position_Salaries.csv")

#### EDA

In [None]:
df.head()

In [None]:
df.info()

#### Train and Test 

In [None]:
df

In [None]:
# Column 1 is the single feature, kept two-dimensional (one column) as the
# estimators expect; column 2 is the target.
X = df.iloc[:, [1]].to_numpy()
y = df.iloc[:, 2].to_numpy()

For 10 trees

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Forest of 10 trees fitted on the full dataset, then queried at 6.5.
regressor = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)
y_pred = regressor.predict(np.array([[6.5]]))

In [None]:
y_pred

#### Visualize results

In [None]:
# Dense grid of feature values for a high-resolution prediction curve.
# X is a 2-D array, so the builtin min()/max() would hand one-element arrays
# to np.arange (a deprecated array-to-scalar conversion); the array
# reductions return plain scalars.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)

plt.scatter(X, y, color='red')  # observed points
plt.plot(X_grid, regressor.predict(X_grid), color='blue')  # model prediction

plt.title("Truth or Bluff(Random Forest - Smooth)")
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

For 100 trees

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Same model with 100 trees; fit() returns the estimator, so the cell still
# ends by displaying the fitted regressor.
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(X=X, y=y)

#### Visualize results

In [None]:
#higher resolution graph
# Dense grid of feature values for a high-resolution prediction curve.
# X is a 2-D array, so the builtin min()/max() would hand one-element arrays
# to np.arange (a deprecated array-to-scalar conversion); the array
# reductions return plain scalars.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)

plt.scatter(X, y, color='red')  # observed points
plt.plot(X_grid, regressor.predict(X_grid), color='blue')  # model prediction
plt.title("Truth or Bluff(Random Forest - Smooth)")
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

#### Predict on new input

In [None]:
# Query the current forest at a feature value of 6.5 and display the result.
y_pred = regressor.predict(np.array([[6.5]]))
y_pred

For 300 trees

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Same model with 300 trees; fit() returns the estimator, so the cell still
# ends by displaying the fitted regressor.
regressor = RandomForestRegressor(n_estimators=300, random_state=0)
regressor.fit(X=X, y=y)

#### Visualize results

In [None]:
#higher resolution graph
# Dense grid of feature values for a high-resolution prediction curve.
# X is a 2-D array, so the builtin min()/max() would hand one-element arrays
# to np.arange (a deprecated array-to-scalar conversion); the array
# reductions return plain scalars.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)

plt.scatter(X, y, color='red')  # observed points
plt.plot(X_grid, regressor.predict(X_grid), color='blue')  # model prediction

plt.title("Truth or Bluff(Random Forest - Smooth)")
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

#### Predict on new input

In [None]:
# Query the current forest at a feature value of 6.5 and display the result.
y_pred = regressor.predict(np.array([[6.5]]))
y_pred

> As you have observed, the 10-tree model predicted a salary of 167,000 for position level 6.5, the 100-tree model predicted 158,300, and the 300-tree model predicted 160,333.33. In general, the more trees in the forest, the more stable and accurate the averaged prediction tends to be.