# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint

## Learning Objective

At the end of the experiment, you will be able to:

* understand how to implement Bagging , Boosting and Voting classifier

In [26]:
#@title Experiment Explanation Video
from IPython.display import HTML

HTML("""<video width="800" height="400" controls>
  <source src="https://cdn.iiith.talentsprint.com/aiml/Experiment_related_data/Walkthrough/Ensemble_Methods_Walkthrough.mp4" type="video/mp4">
</video>
""")

## **Dataset**
Our dataset includes 14,999 observations, with each row representing one single employee.


Fields in the dataset include the following 10 variables for each line:
- Employee satisfaction level
- Last evaluation score
- Number of projects
- Average monthly hours
- Time spent at the company
- Whether they have had a work accident
- Whether they have had a promotion in the last 5 years
- Department
- Salary
- Whether the employee has left



### Setup Steps

In [11]:
#@title Please enter your registration id to start: { run: "auto", display-mode: "form" }
Id = "2100121" #@param {type:"string"}


In [12]:
#@title Please enter your password (normally your phone number) to continue: { run: "auto", display-mode: "form" }
password = "5142192291" #@param {type:"string"}


In [13]:
#@title Run this cell to complete the setup for this Notebook
from IPython import get_ipython

ipython = get_ipython()
  
notebook= "U3W16_35_Ensemble_Methods_B" #name of the notebook
def setup():
    ipython.magic("sx wget https://cdn.talentsprint.com/aiml/Experiment_related_data/HR_comma_sep.csv")
    from IPython.display import HTML, display
    display(HTML('<script src="https://dashboard.talentsprint.com/aiml/record_ip.html?traineeId={0}&recordId={1}"></script>'.format(getId(),submission_id)))
    print ("Setup completed successfully")
    return

def submit_notebook():
    ipython.magic("notebook -e "+ notebook + ".ipynb")
    
    import requests, json, base64, datetime

    url = "https://dashboard.talentsprint.com/xp/app/save_notebook_attempts"
    if not submission_id:
      data = {"id" : getId(), "notebook" : notebook, "mobile" : getPassword()}
      r = requests.post(url, data = data)
      r = json.loads(r.text)

      if r["status"] == "Success":
          return r["record_id"]
      elif "err" in r:        
        print(r["err"])
        return None        
      else:
        print ("Something is wrong, the notebook will not be submitted for grading")
        return None
    
    elif getAnswer() and getComplexity() and getAdditional() and getConcepts() and getWalkthrough() and getComments() and getMentorSupport():
      f = open(notebook + ".ipynb", "rb")
      file_hash = base64.b64encode(f.read())

      data = {"complexity" : Complexity, "additional" :Additional, 
              "concepts" : Concepts, "record_id" : submission_id, 
              "answer" : Answer, "id" : Id, "file_hash" : file_hash,
              "notebook" : notebook, "feedback_walkthrough":Walkthrough ,
              "feedback_experiments_input" : Comments,
              "feedback_mentor_support": Mentor_support}

      r = requests.post(url, data = data)
      r = json.loads(r.text)
      if "err" in r:        
        print(r["err"])
        return None   
      else:
        print("Your submission is successful.")
        print("Ref Id:", submission_id)
        print("Date of submission: ", r["date"])
        print("Time of submission: ", r["time"])
        print("View your submissions: https://aiml.iiith.talentsprint.com/notebook_submissions")
        #print("For any queries/discrepancies, please connect with mentors through the chat icon in LMS dashboard.")
        return submission_id
    else: submission_id
    

def getAdditional():
  try:
    if not Additional: 
      raise NameError
    else:
      return Additional  
  except NameError:
    print ("Please answer Additional Question")
    return None

def getComplexity():
  try:
    if not Complexity:
      raise NameError
    else:
      return Complexity
  except NameError:
    print ("Please answer Complexity Question")
    return None
  
def getConcepts():
  try:
    if not Concepts:
      raise NameError
    else:
      return Concepts
  except NameError:
    print ("Please answer Concepts Question")
    return None
  
  
def getWalkthrough():
  try:
    if not Walkthrough:
      raise NameError
    else:
      return Walkthrough
  except NameError:
    print ("Please answer Walkthrough Question")
    return None
  
def getComments():
  try:
    if not Comments:
      raise NameError
    else:
      return Comments
  except NameError:
    print ("Please answer Comments Question")
    return None
  

def getMentorSupport():
  try:
    if not Mentor_support:
      raise NameError
    else:
      return Mentor_support
  except NameError:
    print ("Please answer Mentor support Question")
    return None

def getAnswer():
  try:
    if not Answer:
      raise NameError 
    else: 
      return Answer
  except NameError:
    print ("Please answer Question")
    return None
  

def getId():
  try: 
    return Id if Id else None
  except NameError:
    return None

def getPassword():
  try:
    return password if password else None
  except NameError:
    return None

submission_id = None
### Setup 
if getPassword() and getId():
  submission_id = submit_notebook()
  if submission_id:
    setup() 
else:
  print ("Please complete Id and Password cells before running setup")



Setup completed successfully


### Importing required packages

In [14]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier

### Loading the data

In [15]:
df = pd.read_csv("/content/HR_comma_sep.csv")
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [16]:
# Checking for any missing values
display(df.isnull().any())

satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
left                     False
promotion_last_5years    False
sales                    False
salary                   False
dtype: bool

In [17]:
# Dataset contains 14999 rows and 10 columns, each row has the details of an employee. 2 variables are categorical, remaining columns are of int and float
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [18]:
# YOUR CODE HERE: To drop the 'sales' column from the dataframe
df['salary'] = df['salary'].map({'low':1, 'medium':2, 'high':3})
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,1
1,0.8,0.86,5,262,6,0,1,0,sales,2
2,0.11,0.88,7,272,4,0,1,0,sales,2
3,0.72,0.87,5,223,5,0,1,0,sales,1
4,0.37,0.52,2,159,3,0,1,0,sales,1


In [19]:
# Checking for any missing values
display(df.isnull().any())

satisfaction_level       False
last_evaluation          False
number_project           False
average_montly_hours     False
time_spend_company       False
Work_accident            False
left                     False
promotion_last_5years    False
sales                    False
salary                   False
dtype: bool

In [20]:
# Dataset contains 14999 rows and 10 columns, each row has the details of an employee. 2 variables are categorical, remaining columns are of int and float
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  int64  
dtypes: float64(2), int64(7), object(1)
memory usage: 1.1+ MB


In [21]:
df = df.drop(["sales"], axis=1)
df['salary'] = df['salary'].map({'low':1, 'medium':2, 'high':3})
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary
0,0.38,0.53,2,157,3,0,1,0,
1,0.8,0.86,5,262,6,0,1,0,
2,0.11,0.88,7,272,4,0,1,0,
3,0.72,0.87,5,223,5,0,1,0,
4,0.37,0.52,2,159,3,0,1,0,


In [22]:
Y = df["left"]
X = df.drop(columns="left", axis=1)

In [23]:
# Train and test split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=1)

### Applying Decision Tree Classifier

In [24]:
clf = DecisionTreeClassifier(max_depth=2) # YOUR CODE HERE: To create DecisionTree Classifier object  with 'max_depth' as parameter

# YOUR CODE HERE: To train the model 
clf.fit(X_train, y_train) 

# YOUR CODE HERE: To get predictions on the test set 
pred = clf.predict(X_test)

# Comparing actual response values (y_test)
# with predicted response values (y_pred) 
acc_DT = accuracy_score(y_test,pred)

ValueError: ignored

In [None]:
# Store the accuracy results for each model in a dataframe for final comparison
results = pd.DataFrame({'classifier':['Decision Tree'], 'accuracy': acc_DT},index={'1'})
results = results[['classifier', 'accuracy']]
results

### Applying Random Forest Classifier

In [None]:
model = # YOUR CODE HERE: To create RandomForest Classifier object  with 100 estimators

# Train the model 
model.fit(X_train, y_train)

# Making predictions on the testing set 
y_pred = model.predict(X_test)

# Comparing actual response values (y_test)
# with predicted response values (y_pred) 
acc_RF = accuracy_score(y_test, y_pred)

In [None]:
# Store the accuracy results for each model in a dataframe for final comparison
results_rf = pd.DataFrame({'classifier':['Random Forest'], 'accuracy': [acc_RF]},index={'2'})
results = pd.concat([results, results_rf])
results = results[['classifier', 'accuracy']]
results

### Applying Bagging Classifier

Bagging is an abbreviation for "bootstrap aggregating".which takes M subsamples (with replacement) from the initial dataset and trains the predictive model on those subsamples. The final model is obtained by averaging the "bootstrapped" models and usually yields better results.

In [None]:
bg = # YOUR CODE HERE: To create BaggingClassifier object with base estimator as DecisionTree classifier and 20 estimators

# Train the model 
bg.fit(X_train, y_train)

# Making predictions on the testing set 
y_pred = bg.predict(X_test)

# Comparing actual response values (y_test)
# with predicted response values (y_pred) 
acc_bg = accuracy_score(y_test, y_pred)

In [None]:
Results_bg = pd.DataFrame({'classifier':['Bagging'], 'accuracy': [acc_bg]},index={'3'})
results = pd.concat([results, Results_bg])
results = results[['classifier', 'accuracy']]
results

## Boosting
### Applying AdaBoost Classifier

Boosting refers to a family of algorithms that are able to convert weak learners to strong learners. The main principle of boosting is to fit a sequence of weak learners− models that are only slightly better than random guessing, such as small decision trees− to weighted versions of the data. More weight is given to examples that were misclassified by earlier rounds. The predictions are then combined through a weighted majority vote (classification) or a weighted sum (regression) to produce the final prediction.

In [None]:
adb = # YOUR CODE HERE: To create AdaBoostClassifier object with base estimator as DecisionTree classifier and 10 estimators

# Train the model 
adb.fit(X_train,y_train)

# Making predictions on the testing set 
y_pred = adb.predict(X_test)

# Comparing actual response values (y_test)
# with predicted response values (y_pred) 
acc_adb = accuracy_score(y_test, y_pred)

In [None]:
Results_adb = pd.DataFrame({'classifier':['Adaboost'], 'accuracy': [acc_adb]},index={'4'})
results = pd.concat([results, Results_adb])
results = results[['classifier', 'accuracy']]
results

### Applying Voting Classifier

The idea behind the voting classifier implementation is to combine conceptually different machine learning classifiers and use a majority vote or the average predicted probabilitie to predict the class labels. 

In [None]:
vc = VotingClassifier(estimators=[('dt',DecisionTreeClassifier()), ('rf', RandomForestClassifier())], voting='hard')

vc.fit(X_train,y_train)

# Making predictions on the testing set 
y_pred = vc.predict(X_test)


# Comparing actual response values (y_test)
# with predicted response values (y_pred) 
acc_Ensemble = accuracy_score(y_test, y_pred)

In [None]:
results_ensemble = pd.DataFrame({'classifier':['Ensemble'], 'accuracy': [acc_Ensemble]},index={'5'})
results = pd.concat([results, results_ensemble])
results = results[['classifier', 'accuracy']]
results

### Please answer the questions below to complete the experiment:

In [1]:
#@title  State True or False: In boosting, combining strong learners sequentially makes optimal predictive model { run: "auto", form-width: "500px", display-mode: "form" }
Answer = "FALSE" #@param ["","TRUE","FALSE"]


In [2]:
#@title How was the experiment? { run: "auto", form-width: "500px", display-mode: "form" }
Complexity = "Good, But Not Challenging for me" #@param ["","Too Simple, I am wasting time", "Good, But Not Challenging for me", "Good and Challenging for me", "Was Tough, but I did it", "Too Difficult for me"]


In [4]:
#@title If it was too easy, what more would you have liked to be added? If it was very difficult, what would you have liked to have been removed? { run: "auto", display-mode: "form" }
Additional = "nn" #@param {type:"string"}


In [5]:
#@title Can you identify the concepts from the lecture which this experiment covered? { run: "auto", vertical-output: true, display-mode: "form" }
Concepts = "Yes" #@param ["","Yes", "No"]


In [6]:
#@title  Experiment walkthrough video? { run: "auto", vertical-output: true, display-mode: "form" }
Walkthrough = "Very Useful" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [7]:
#@title  Text and image description/explanation and code comments within the experiment: { run: "auto", vertical-output: true, display-mode: "form" }
Comments = "Very Useful" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [8]:
#@title Mentor Support: { run: "auto", vertical-output: true, display-mode: "form" }
Mentor_support = "Very Useful" #@param ["","Very Useful", "Somewhat Useful", "Not Useful", "Didn't use"]


In [25]:
#@title Run this cell to submit your notebook for grading { vertical-output: true }
try:
  if submission_id:
      return_id = submit_notebook()
      if return_id : submission_id = return_id
  else:
      print("Please complete the setup first.")
except NameError:
  print ("Please complete the setup first.")

Your submission is successful.
Ref Id: 12927
Date of submission:  02 Jan 2021
Time of submission:  16:01:44
View your submissions: https://aiml.iiith.talentsprint.com/notebook_submissions
