In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
student_data = pd.read_csv("student_data.csv")
student_data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,passed
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,no,no,4,3,4,1,1,3,6,no
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,no,5,3,3,1,1,3,4,no
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,no,4,3,2,2,3,3,10,yes
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,3,2,2,1,1,5,2,yes
4,GP,F,16,U,GT3,T,3,3,other,other,...,no,no,4,3,2,1,2,5,4,yes


In [4]:
student_data.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'passed'],
      dtype='object')

In [5]:
# TODO: Calculate number of students
n_students = student_data.shape[0]

# TODO: Calculate number of features
n_features = student_data.shape[1]

# TODO: Calculate passing students
n_passed = student_data[student_data['passed']=='yes'].shape[0]

# TODO: Calculate failing students
n_failed = student_data[student_data['passed']=='no'].shape[0]

# TODO: Calculate graduation rate
grad_rate = n_passed/n_students*100

# Print the results
print("Total number of students: {}".format(n_students))
print("Number of features: {}".format(n_features))
print("Number of students who passed: {}".format(n_passed))
print("Number of students who failed: {}".format(n_failed))
print("Graduation rate of the class: {:.2f}%".format(grad_rate))

Total number of students: 395
Number of features: 31
Number of students who passed: 265
Number of students who failed: 130
Graduation rate of the class: 67.09%


In [6]:
# Extract feature columns
feature_cols = list(student_data.columns[:-1])

# Extract target column 'passed'
target_col = student_data.columns[-1] 

# Show the list of columns
print("Feature columns:\n{}".format(feature_cols))
print("\nTarget column: {}".format(target_col))

# Separate the data into feature data and target data (X_all and y_all, respectively)
X = student_data[feature_cols]
y = student_data[target_col]

Feature columns:
['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']

Target column: passed


In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [8]:
def preprocess_features(X):
    ''' Preprocesses the student data and converts non-numeric binary variables into
        binary (0/1) variables. Converts categorical variables into dummy variables. '''
    
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)

    # Investigate each feature column for the data
    for col, col_data in X.items():

        # If data type is non-numeric, replace all binary values with 1/0
        if col_data.dtype == object and len(col_data.unique()) == 2:
            print("*****binary*****")
            print("col name: ", col, end="----")
            col_data_unique = col_data.unique()
            print("unique values: ", col_data_unique)
            col_data = col_data.replace(col_data_unique, [1, 0])

        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object and len(col_data.unique()) != 2:
            print("*****categorical*****")
            print("col name: ", col, end="----")
            col_data_unique = col_data.unique()
            print("unique values: ", col_data_unique)
            # Example: 'school' => 'school_GP' and 'school_MS'
            col_data = pd.get_dummies(col_data, prefix = col)  
        
        # Collect the revised columns
        output = output.join(col_data)
    
    return output

X_preprocessed = preprocess_features(X)

*****binary*****
col name:  school----unique values:  ['GP' 'MS']
*****binary*****
col name:  sex----unique values:  ['F' 'M']
*****binary*****
col name:  address----unique values:  ['U' 'R']
*****binary*****
col name:  famsize----unique values:  ['GT3' 'LE3']
*****binary*****
col name:  Pstatus----unique values:  ['A' 'T']
*****categorical*****
col name:  Mjob----unique values:  ['at_home' 'health' 'other' 'services' 'teacher']
*****categorical*****
col name:  Fjob----unique values:  ['teacher' 'other' 'services' 'health' 'at_home']
*****categorical*****
col name:  reason----unique values:  ['course' 'other' 'home' 'reputation']
*****categorical*****
col name:  guardian----unique values:  ['mother' 'father' 'other']
*****binary*****
col name:  schoolsup----unique values:  ['yes' 'no']
*****binary*****
col name:  famsup----unique values:  ['no' 'yes']
*****binary*****
col name:  paid----unique values:  ['no' 'yes']
*****binary*****
col name:  activities----unique values:  ['no' 'yes']


In [9]:
# TODO: Import any additional functionality you may need here
from sklearn.model_selection import train_test_split
# TODO: split the dataset into the number of training and testing
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed,y,test_size=0.25,random_state=42)

## model

### gaussian

In [10]:
g_m = GaussianNB()
g_m.fit(X_train,y_train)
print(classification_report(y_test,g_m.predict(X_test)))

              precision    recall  f1-score   support

          no       0.62      0.36      0.46        36
         yes       0.71      0.87      0.78        63

    accuracy                           0.69        99
   macro avg       0.66      0.62      0.62        99
weighted avg       0.67      0.69      0.66        99



### KNN

In [11]:
knn_m = KNeighborsClassifier()
knn_m.fit(X_train,y_train)
print(classification_report(y_test,knn_m.predict(X_test)))

              precision    recall  f1-score   support

          no       0.59      0.28      0.38        36
         yes       0.68      0.89      0.77        63

    accuracy                           0.67        99
   macro avg       0.64      0.58      0.57        99
weighted avg       0.65      0.67      0.63        99



### SGDC

In [12]:
sgd_m = SGDClassifier()
sgd_m.fit(X_train,y_train)
print(classification_report(y_test,sgd_m.predict(X_test)))

              precision    recall  f1-score   support

          no       0.61      0.39      0.47        36
         yes       0.71      0.86      0.78        63

    accuracy                           0.69        99
   macro avg       0.66      0.62      0.63        99
weighted avg       0.67      0.69      0.67        99



### SVM

In [13]:
svm_m = SVC()
svm_m.fit(X_train,y_train)
print(classification_report(y_test,svm_m.predict(X_test)))

              precision    recall  f1-score   support

          no       0.00      0.00      0.00        36
         yes       0.64      1.00      0.78        63

    accuracy                           0.64        99
   macro avg       0.32      0.50      0.39        99
weighted avg       0.40      0.64      0.49        99



### logistic

In [14]:
l_m = LogisticRegression()
l_m.fit(X_train,y_train)
print(classification_report(y_test,l_m.predict(X_test)))

              precision    recall  f1-score   support

          no       0.68      0.36      0.47        36
         yes       0.71      0.90      0.80        63

    accuracy                           0.71        99
   macro avg       0.70      0.63      0.63        99
weighted avg       0.70      0.71      0.68        99

