## Icebreaker: Using a decision tree to predict the type of computer a student owns.

## Importing libraries 

In [23]:
from sklearn import tree
import numpy as np
import pandas as pd 

from IPython.display import display, Image

## Importing dataset

In [24]:

# Load the survey responses from a CSV file given by a relative path.
in_file = 'students.csv'
students = pd.read_csv(in_file)

# Report how many rows were read, then preview the first ten of them.
print("Number of students: {}".format(students.shape[0]))
display(students.head(10))


Number of students: 21


Unnamed: 0,Timestamp,name,why_nanodegree,something_interesting,price,configurations,gaming,security,designing,type_of_computer
0,2/11/2017 10:29:43,ganesh,want to obtain hands on experience,,2,2,2,1,1,mac
1,2/11/2017 10:30:14,Brook,Learn to predictive analytics,Ethiopian,1,2,2,2,2,mac
2,2/11/2017 10:30:35,vamsi,very exciting area,,2,2,2,2,1,pc
3,2/11/2017 10:30:59,Jon,Trying to make the jump into SW engineering,,1,2,2,1,2,mac
4,2/11/2017 10:32:06,Aiman,To find a new job,He enjoys playing soccer,1,1,2,1,2,mac
5,2/11/2017 10:32:52,Liang,Nano degree is unique program the others,Machine learning is becoming more and more pow...,1,2,2,2,2,mac
6,2/11/2017 10:32:53,Goldy,Want to learn machine learning application in ...,I have 9 fishes and like to spend time with th...,1,2,2,1,2,mac
7,2/11/2017 10:32:57,Samip,Interest in finance. Catching up with the new ...,pursuing CFA (Chartered Financial Analyst) and...,2,2,2,2,2,mac
8,2/11/2017 10:33:09,Karishma,Explore Machine Learning,Like to learn new things,2,2,2,2,2,mac
9,2/11/2017 10:33:53,SHAILESH,COLLABORATE,"hiking, music",2,1,2,2,2,pc


## Separating test dataset

In [25]:
# Students held out for testing; every other row is used for training.
test_student_names = ['vamsi', 'Brook']

# Build the membership mask once so the two subsets are exact complements of
# each other and cannot drift apart when the held-out names are edited.
is_test_student = students.name.isin(test_student_names)
train_data = students[~is_test_student]
test_data = students[is_test_student]

print("Training dataset size:{}".format(len(train_data)))
display(train_data.head())

print("Testing dataset size:{}".format(len(test_data)))
display(test_data.head())

Training dataset size:19


Unnamed: 0,Timestamp,name,why_nanodegree,something_interesting,price,configurations,gaming,security,designing,type_of_computer
0,2/11/2017 10:29:43,ganesh,want to obtain hands on experience,,2,2,2,1,1,mac
3,2/11/2017 10:30:59,Jon,Trying to make the jump into SW engineering,,1,2,2,1,2,mac
4,2/11/2017 10:32:06,Aiman,To find a new job,He enjoys playing soccer,1,1,2,1,2,mac
5,2/11/2017 10:32:52,Liang,Nano degree is unique program the others,Machine learning is becoming more and more pow...,1,2,2,2,2,mac
6,2/11/2017 10:32:53,Goldy,Want to learn machine learning application in ...,I have 9 fishes and like to spend time with th...,1,2,2,1,2,mac


Testing dataset size:2


Unnamed: 0,Timestamp,name,why_nanodegree,something_interesting,price,configurations,gaming,security,designing,type_of_computer
1,2/11/2017 10:30:14,Brook,Learn to predictive analytics,Ethiopian,1,2,2,2,2,mac
2,2/11/2017 10:30:35,vamsi,very exciting area,,2,2,2,2,1,pc


## Removing columns that are not used as predictors

In [26]:

# Columns removed from both frames before modelling; listed once so the
# training and test frames are always reduced in the same way.
non_feature_columns = ['Timestamp', 'name', 'why_nanodegree', 'something_interesting']

# errors='ignore' keeps this cell re-runnable: train_data/test_data are
# reassigned here, so on a second run the columns are already gone and a
# plain drop() would raise KeyError.
train_data = train_data.drop(non_feature_columns, axis=1, errors='ignore')
test_data = test_data.drop(non_feature_columns, axis=1, errors='ignore')

display(train_data.head(5))


Unnamed: 0,price,configurations,gaming,security,designing,type_of_computer
0,2,2,2,1,1,mac
3,1,2,2,1,2,mac
4,1,1,2,1,2,mac
5,1,2,2,2,2,mac
6,1,2,2,1,2,mac


## Separating the target variable - training data

In [27]:
# Split the training frame into the label Series and the remaining feature columns.
train_predictors = train_data.drop(['type_of_computer'], axis=1)
train_target = train_data.loc[:, 'type_of_computer']

# Show the number of labels and a short preview of them.
print("Target dataset size:{}".format(train_target.shape[0]))
display(train_target.head(5))

# Show the number of feature rows and the first row only.
print("\nPredictor dataset size:{}".format(train_predictors.shape[0]))
display(train_predictors.head(1))

Target dataset size:19


0    mac
3    mac
4    mac
5    mac
6    mac
Name: type_of_computer, dtype: object


Predictor dataset size:19


Unnamed: 0,price,configurations,gaming,security,designing
0,2,2,2,1,1


## Separating the target variable - test data

In [28]:
# Split the held-out frame into the label Series and the remaining feature columns.
test_predictors = test_data.drop(['type_of_computer'], axis=1)
test_target = test_data.loc[:, 'type_of_computer']

# Show the number of labels and a short preview of them.
print("Target dataset size:{}".format(test_target.shape[0]))
display(test_target.head(5))

# Show the number of feature rows and a preview of them.
print("\nPredictor dataset size:{}".format(test_predictors.shape[0]))
display(test_predictors.head())

Target dataset size:2


1    mac
2     pc
Name: type_of_computer, dtype: object


Predictor dataset size:2


Unnamed: 0,price,configurations,gaming,security,designing
1,1,2,2,2,2
2,2,2,2,2,1


## Building the Decision Tree Classifier 

In [29]:
# Fit a decision tree on the training predictors and labels.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split; without a seed, tied splits can produce a different tree (and
# different predictions) each time the notebook is run.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(train_predictors, train_target)


## Retrieving test data

In [30]:
# Keep the held-out predictors wrapped in a single-element list.
student_test_data_final = [test_predictors]

# Report the number of held-out rows and preview up to three of them.
print("\nStudent test data:{}".format(test_predictors.shape[0]))
display(test_predictors.head(3))

# Echo the plain-text representation of the wrapped list.
print(student_test_data_final)


Student test data:2


Unnamed: 0,price,configurations,gaming,security,designing
1,1,2,2,2,2
2,2,2,2,2,1


[   price  configurations  gaming   security  designing
1      1               2        2         2          2
2      2               2        2         2          1]


## Making predictions 

In [32]:
# Hand-written example feature rows (five values each). They are not used by
# the loops in this cell, which predict on test_predictors instead.
student_test_data = [[2, 2, 2, 2, 2], # Student 1
                     [1, 1, 1, 1, 1]] # Student 2

# Predict the computer type for every held-out student.
# print() is called as a function so the cell runs on Python 3 as well as
# Python 2 and matches the print(...) style used in the earlier cells.
predicted_types = clf.predict(test_predictors)

for i, computer in enumerate(predicted_types):
    print("Predicted type of student {}'s computer: {}".format(i + 1, computer))

print(" ")

# List the true labels in the same order for a side-by-side comparison.
for i, actual in enumerate(test_target):
    print("Actual type of student {}'s computer: {}".format(i + 1, actual))

Predicted type of student 1's computer: mac
Predicted type of student 2's computer: mac
 
Actual type of student 1's computer: mac
Actual type of student 2's computer: pc
