In [3]:
import sklearn 
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import datetime
import time
import tarfile
import boto3
import pandas as pd

# AWS / SageMaker handles shared by the rest of the notebook.
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Name of the (already created) S3 bucket that will receive the train/test files.
bucket = 'sagemaker-tutorial-s3-bucket-test'
print(f"Using bucket {bucket}")

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Load the cleaned OULAD table from the notebook's working directory.
df = pd.read_csv("OULAD_cleaned_data.csv")

In [None]:
# Importing sagemaker_datawrangler enables the Data Wrangler widget, which adds
# automatic visualisations and data-quality fix suggestions to DataFrame output.
import sagemaker_datawrangler

# The widget is shown whenever a pandas DataFrame is displayed
# (df, display(df), df.sample(), ...).
display(df)

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Count how many rows fall into each `highest_education` category and print
# the five totals, one per line.
education = df["highest_education"]

HE_num = len(df[education == "HE Qualification"])
LowerA = len(df[education == "Lower Than A Level"])
LevelA = len(df[education == "A Level or Equivalent"])
PG_num = len(df[education == "Post Graduate Qualification"])
NoForm = len(df[education == "No Formal quals"])

for category_count in (HE_num, LowerA, LevelA, PG_num, NoForm):
    print(category_count)

In [None]:
# Replace each categorical text column with an integer code.
# NOTE: `.map` turns any value missing from its dictionary into NaN, so this
# cell is meant to run exactly once on the freshly loaded frame.

# highest_education, ordered from no formal qualification (0) to postgraduate (4)
change = {
    "No Formal quals": 0,
    "Lower Than A Level": 1,
    "A Level or Equivalent": 2,
    "HE Qualification": 3,
    "Post Graduate Qualification": 4,
}

# column name -> {original value: integer code}
column_encodings = {
    "highest_education": change,
    "code_module": {"AAA": 1, "BBB": 2, "CCC": 3, "DDD": 4, "EEE": 5, "FFF": 6, "GGG": 7},
    "code_presentation": {"2013B": 1, "2013J": 2, "2014B": 3, "2014J": 4},
    "gender": {"M": 1, "F": 0},
    "age_band": {"0-35": 1, "35-55": 2, "55<=": 3},
    "disability": {"N": 0, "Y": 1},
}

for column_name, value_codes in column_encodings.items():
    df[column_name] = df[column_name].map(value_codes)



In [None]:
# Build a new frame without the `region` and `imd_band` columns.
newdf = df.drop(["region", "imd_band"], axis="columns")

In [None]:
# Encode the target column as integers (Withdrawn=0 ... Distinction=3).
final_result_codes = {"Withdrawn": 0, "Fail": 1, "Pass": 2, "Distinction": 3}
newdf["final_result"] = newdf["final_result"].map(final_result_codes)

# Drop `module_presentation` and continue under the name `df`.
df = newdf.drop("module_presentation", axis="columns")

In [None]:
# Remove the `adjusted_mark` and `mark` columns.
df = df.drop(["adjusted_mark", "mark"], axis="columns")

In [None]:
# Remove the `Unnamed: 0` column.
df = df.drop("Unnamed: 0", axis="columns")

In [None]:
# Inspect the frame after encoding and column removal.
display(df)

In [None]:
# Split the data 70% / 30% into train and test sets.
# The frame is shuffled once with a fixed seed so the split is reproducible,
# then cut positionally. Plain `.iloc` slicing replaces `np.split`, which on a
# DataFrame goes through the deprecated `DataFrame.swapaxes`; the resulting
# partition is the same.
shuffled = df.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(df))
train_data = shuffled.iloc[:split_at]
test_data = shuffled.iloc[split_at:]
print(train_data.shape, test_data.shape)

In [None]:
# Write the train and test sets to CSV for upload.
# index=False is essential: otherwise the shuffled row index is written as an
# unnamed first column, which the training script would read back as a feature
# called "Unnamed: 0".
train_data.to_csv('oulad_train.csv', index=False)
test_data.to_csv('oulad_test.csv', index=False)

In [None]:
# Upload both CSV files to S3 under a common key prefix; SageMaker reads its
# training data from there. The values returned by upload_data are kept in
# `trainpath` / `testpath`.
sk_prefix = "sagemaker/personalizedcontent/OULADcontainer"
upload_target = {"bucket": bucket, "key_prefix": sk_prefix}

trainpath = sess.upload_data(path="oulad_train.csv", **upload_target)
testpath = sess.upload_data(path="oulad_test.csv", **upload_target)

In [None]:
%%writefile oulad_script.py
# create training script (top line has to be the first line in a cell)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd

# inference functions ---------------

# def input_fn(request_body, request_content_type):
#     print(request_body)
#     print(request_content_type)
#     if request_content_type == "text/csv":
#         request_body = request_body.strip()
#         try:
#             df = pd.read_csv(StringIO(request_body), header=None)
#             return df
        
#         except Exception as e:
#             print(e)
#     else:
#         return """Please use Content-Type = 'text/csv' and, send the request!!""" 
 
    
def model_fn(model_dir):
    """Load and return the estimator saved as ``model.joblib`` in ``model_dir``."""
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)

# def predict_fn(input_data, model):
#     if type(input_data) != str:
#         prediction = model.predict(input_data)
#         print(prediction)
#         return prediction
#     else:
#         return input_data
        
    
if __name__ == "__main__":
    # Training entry point: parse arguments, read the train/test CSV files,
    # fit a random forest, persist it and print test-set metrics.

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client arrive as command-line arguments.
    for hyperparameter, fallback in (("--n_estimators", 100), ("--random_state", 0)):
        parser.add_argument(hyperparameter, type=int, default=fallback)

    # Data, model and output locations; directories default to the SM_*
    # environment variables, file names to the CSVs written by the notebook.
    for flag, fallback in (
        ("--model-dir", os.environ.get("SM_MODEL_DIR")),
        ("--train", os.environ.get("SM_CHANNEL_TRAIN")),
        ("--test", os.environ.get("SM_CHANNEL_TEST")),
        ("--train-file", "oulad_train.csv"),
        ("--test-file", "oulad_test.csv"),
    ):
        parser.add_argument(flag, type=str, default=fallback)

    # Unknown arguments are tolerated and ignored.
    args = parser.parse_known_args()[0]

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # Every column of the training file (the label included), in file order.
    features = list(train_df.columns)
    label = "final_result"

    print("Building training and testing datasets")
    print()

    # Separate the label column from the predictors for both splits.
    X_train, y_train = train_df.drop(columns=[label]), train_df[label]
    X_test, y_test = test_df.drop(columns=[label]), test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ", label)
    print()

    print("Data Shape: ")
    print()
    for banner, predictors, target in (
        ("---- SHAPE OF TRAINING DATA (70%) ----", X_train, y_train),
        ("---- SHAPE OF TESTING DATA (30%) ----", X_test, y_test),
    ):
        print(banner)
        print(predictors.shape)
        print(target.shape)
        print()

    print("Training RandomForest Model.....")
    print()
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    print()

    # Persist the fitted estimator under the file name model_fn expects.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate on the held-out file.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

    # Tree Visualisation
# Export the first three decision trees of the freshly trained forest as DOT text.
#
# This has to stay behind the __main__ guard: `model` and `X_train` only exist
# after the training block above has run, so executing this at import time
# (e.g. when something imports the module just to reach model_fn) would raise
# NameError.
if __name__ == "__main__":
    try:
        # Optional dependencies: skip the visualisation instead of failing a
        # run whose model has already been trained and saved.
        from sklearn.tree import export_graphviz
        import graphviz
    except ImportError as import_error:
        print("[WARN] Skipping tree visualisation:", import_error)
    else:
        # Slicing (rather than range(3)) tolerates forests with fewer than three trees.
        for tree in model.estimators_[:3]:
            dot_data = export_graphviz(
                tree,
                feature_names=list(X_train.columns),  # plain list of column names
                filled=True,
                max_depth=2,
                impurity=False,
                proportion=True,
            )
            graph = graphviz.Source(dot_data)
            print("******graph******")
            print(graph)

In [None]:
! python oulad_script.py --n_estimators 100 \
                   --random_state 0 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \

In [None]:
pip install graphviz

In [None]:
# Tree Visualisation
#
# Training ran in a separate process (`python oulad_script.py`), so neither the
# fitted model nor the training frame exist in this kernel. Reload the estimator
# that the local run saved to ./model.joblib instead of relying on leftover
# kernel variables.
import joblib
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

rf = joblib.load("model.joblib")  # file produced by this notebook's own local run

# Prefer the names recorded on the fitted estimator; otherwise rebuild them
# from the header of the training CSV minus the label column.
feature_names = getattr(rf, "feature_names_in_", None)
if feature_names is None:
    feature_names = pd.read_csv("oulad_train.csv", nrows=0).columns.drop("final_result")
feature_names = list(feature_names)

# Export the first three decision trees from the forest
# (slicing tolerates forests with fewer than three trees).
for tree in rf.estimators_[:3]:
    dot_data = export_graphviz(
        tree,
        feature_names=feature_names,
        filled=True,
        max_depth=2,
        impurity=False,
        proportion=True,
    )
    graph = graphviz.Source(dot_data)
    display(graph)