# Step 0: Import the necessary libraries

In [13]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
%matplotlib inline 

# Step 1: Import Data

In [2]:
# Load the raw data set from the notebook's working directory.
# Every later cell needs `file`, so a missing CSV is fatal: re-raise with a
# clear message instead of printing and carrying on (which would either fail
# with a NameError on `file` below or silently reuse a stale `file` left in
# the kernel from an earlier run).
try:
    file = pd.read_csv("student-por.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "File Not Found: 'student-por.csv' must be in the notebook's working directory"
    ) from err

# Show the available column names, then preview the first five rows.
print(file.columns)
file.head(5)

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


### Columns:
- 1. **school**
- 2. **sex**
- 3. **age**
- 4. **address**
- 5. **famsize**
- 6. **Pstatus**
- 7. **Medu**
- 8. **Fedu**
- 9. **Mjob**
- 10. **Fjob**
- 11. **reason**
- 12. **guardian**
- 13. **traveltime**
- 14. **studytime**
- 15. **failures**
- 16. **schoolsup**
- 17. **famsup**
- 18. **paid**
- 19. **activities**
- 20. **nursery**
- 21. **higher**
- 22. **internet**
- 23. **romantic**
- 24. **famrel**
- 25. **freetime**
- 26. **goout**
- 27. **Dalc**
- 28. **Walc**
- 29. **health**
- 30. **absences**
- 31. **G1**
- 32. **G2**
- 33. **G3**
 
### Important (features used for the model):


- 'sex'
- 'age'
- 'address'
- 'famsize'
- 'Pstatus'
- 'Medu'
- 'Fedu'
- 'Mjob'
- 'Fjob'
- 'guardian'
- 'studytime'
- 'failures'
- 'G1', 'G2'

### Target:
- 'G3' (final grade; binarized in Step 2 into `succeeded`: 1 if G3 >= 10, otherwise 0)

# Step 2: Clean the Data

In [3]:
# Keep only the columns listed under "Important" plus the G3 target by
# dropping everything else from the raw frame (the raw `file` is left intact).
n_file = file.drop(
    columns=[
        'school', 'reason', 'traveltime', 'schoolsup', 'famsup', 'paid',
        'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel',
        'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
    ]
)
n_file.head()

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,guardian,studytime,failures,G1,G2,G3
0,F,18,U,GT3,A,4,4,at_home,teacher,mother,2,0,0,11,11
1,F,17,U,GT3,T,1,1,at_home,other,father,2,0,9,11,11
2,F,15,U,LE3,T,1,1,at_home,other,mother,2,0,12,13,12
3,F,15,U,GT3,T,4,2,health,services,mother,3,0,14,14,14
4,F,16,U,GT3,T,3,3,other,other,father,2,0,11,13,13


In [4]:
# Binary label derived from the final grade: 0 when G3 is below 10, 1 otherwise.
# A vectorised comparison replaces the per-row Python loop; `~(G3 < 10)` keeps
# the original rule exactly (anything that is not below 10 is labelled 1), and
# the explicit int64 cast matches the dtype the list of Python ints produced.
succeeded = (~(n_file['G3'] < 10)).astype('int64')

# Using 'succeeded' as the column name and equating it to the computed labels
n_file = n_file.assign(succeeded=succeeded)
n_file.head()

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,guardian,studytime,failures,G1,G2,G3,succeeded
0,F,18,U,GT3,A,4,4,at_home,teacher,mother,2,0,0,11,11,1
1,F,17,U,GT3,T,1,1,at_home,other,father,2,0,9,11,11,1
2,F,15,U,LE3,T,1,1,at_home,other,mother,2,0,12,13,12,1
3,F,15,U,GT3,T,4,2,health,services,mother,3,0,14,14,14,1
4,F,16,U,GT3,T,3,3,other,other,father,2,0,11,13,13,1


In [5]:
# Replace each text-valued column with the integer codes from pd.factorize.
# Codes follow the order in which values first appear in the column, so they
# are arbitrary identifiers rather than a meaningful ordering.
for column in ('sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'guardian'):
    codes, _uniques = pd.factorize(n_file[column])
    n_file[column] = codes

n_file.head()

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,guardian,studytime,failures,G1,G2,G3,succeeded
0,0,18,0,0,0,4,4,0,0,0,2,0,0,11,11,1
1,0,17,0,0,1,1,1,0,1,1,2,0,9,11,11,1
2,0,15,0,1,1,1,1,0,1,0,2,0,12,13,12,1
3,0,15,0,0,1,4,2,1,2,0,3,0,14,14,14,1
4,0,16,0,0,1,3,3,2,1,1,2,0,11,13,13,1


# Step 3: Scale the Features and Split the Data into Training/Testing Sets

In [6]:
# Feature matrix: the 14 predictor columns as a NumPy array.
X = n_file[
    [
        'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
        'Mjob', 'Fjob', 'guardian', 'studytime', 'failures', 'G1', 'G2',
    ]
].to_numpy()
# Target: the double brackets select a one-column frame, so y is
# two-dimensional with shape (n_samples, 1).
y = n_file[["succeeded"]].to_numpy()
# Standardise each feature column with StandardScaler.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
# done later in the notebook, so test-set statistics feed into the scaling.
# Consider fitting it on the training split only.
X = preprocessing.StandardScaler().fit_transform(X)
X[:5]

array([[-0.83337685,  1.03169516, -0.66018233, -0.64817536, -2.66692707,
         1.31021563,  1.54071544, -1.55645254, -1.64373691, -0.60574253,
         0.08365295, -0.37430512, -4.15547029, -0.19581961],
       [-0.83337685,  0.21013668, -0.66018233, -0.64817536,  0.37496338,
        -1.33603938, -1.18883229, -1.55645254, -0.53940435,  1.06713791,
         0.08365295, -0.37430512, -0.87456978, -0.19581961],
       [-0.83337685, -1.43298028, -0.66018233,  1.54279238,  0.37496338,
        -1.33603938, -1.18883229, -1.55645254, -0.53940435, -0.60574253,
         0.08365295, -0.37430512,  0.21906372,  0.49113675],
       [-0.83337685, -1.43298028, -0.66018233, -0.64817536,  0.37496338,
         1.31021563, -0.27898305, -0.75475595,  0.56492821, -0.60574253,
         1.29011438, -0.37430512,  0.94815272,  0.83461493],
       [-0.83337685, -0.6114218 , -0.66018233, -0.64817536,  0.37496338,
         0.42813063,  0.63086619,  0.04694063, -0.53940435,  1.06713791,
         0.08365295, -0.37

In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_trainset, X_testset, y_trainset, y_testset = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)
# Report the shapes of the feature and target arrays for each split.
print('Train set:', *(part.shape for part in (X_trainset, y_trainset)))
print('Test set:', *(part.shape for part in (X_testset, y_testset)))

Train set: (519, 14) (519, 1)
Test set: (130, 14) (130, 1)


# Step 4: Create a Model

In [8]:
# Decision tree using the entropy criterion, limited to a depth of 4.
# random_state is pinned because scikit-learn randomly permutes the features
# at every split (even with the default "best" splitter), so ties between
# equally good splits could otherwise produce a different tree on each run.
drugTree = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=4)
drugTree  # displays the estimator and its parameters

# Step 5: Train the Model

In [9]:
# Fit the tree on the training split; fit() returns the estimator, which the
# notebook shows as this cell's output.
drugTree.fit(X=X_trainset, y=y_trainset)

# Step 6: Make Predictions

In [10]:
# Predict the `succeeded` label for every row of the held-out test split.
predTree = drugTree.predict(X=X_testset)

# Step 7: Evaluate and Improve

In [11]:
# Fraction of test rows whose predicted label matches the true label.
print(
    "DecisionTrees's Accuracy: ",
    metrics.accuracy_score(y_true=y_testset, y_pred=predTree),
)

DecisionTrees's Accuracy:  0.9384615384615385
