In [1]:
#Classify the target as pass or fail

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib 

In [2]:
df = pd.read_csv('student-por.csv')
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,...,5,4,2,1,2,5,4,10,11,10
645,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
646,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
647,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


In [3]:
df_dropped = df[['sex', 'age', 'Medu', 'reason', 'traveltime', 'studytime', 'freetime', 'higher',
       'failures', 'internet', 'G3']]
df_dropped

Unnamed: 0,sex,age,Medu,reason,traveltime,studytime,freetime,higher,failures,internet,G3
0,F,18,4,course,2,2,3,yes,0,no,11
1,F,17,1,course,1,2,3,yes,0,yes,11
2,F,15,1,other,1,2,3,yes,0,yes,12
3,F,15,4,home,1,3,2,yes,0,yes,14
4,F,16,3,home,1,2,3,yes,0,no,13
...,...,...,...,...,...,...,...,...,...,...,...
644,F,19,2,course,1,3,4,yes,1,yes,10
645,F,18,3,course,1,2,3,yes,0,yes,16
646,F,18,1,course,2,2,1,yes,0,no,9
647,M,17,3,course,2,1,4,yes,0,yes,10


In [4]:
df_dropped['G3_binary'] = pd.cut(x = df_dropped['G3'], bins=[df_dropped['G3'].min()-1, df_dropped['G3'].mean(), df_dropped['G3'].max()+1], labels=['fail', 'pass'])
df_dropped

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dropped['G3_binary'] = pd.cut(x = df_dropped['G3'], bins=[df_dropped['G3'].min()-1, df_dropped['G3'].mean(), df_dropped['G3'].max()+1], labels=['fail', 'pass'])


Unnamed: 0,sex,age,Medu,reason,traveltime,studytime,freetime,higher,failures,internet,G3,G3_binary
0,F,18,4,course,2,2,3,yes,0,no,11,fail
1,F,17,1,course,1,2,3,yes,0,yes,11,fail
2,F,15,1,other,1,2,3,yes,0,yes,12,pass
3,F,15,4,home,1,3,2,yes,0,yes,14,pass
4,F,16,3,home,1,2,3,yes,0,no,13,pass
...,...,...,...,...,...,...,...,...,...,...,...,...
644,F,19,2,course,1,3,4,yes,1,yes,10,fail
645,F,18,3,course,1,2,3,yes,0,yes,16,pass
646,F,18,1,course,2,2,1,yes,0,no,9,fail
647,M,17,3,course,2,1,4,yes,0,yes,10,fail


In [6]:
#label encoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

df_dropped['G3_binary_label'] = label_encoder.fit_transform(df_dropped['G3_binary'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dropped['G3_binary_label'] = label_encoder.fit_transform(df_dropped['G3_binary'])


In [7]:
from sklearn.model_selection import train_test_split

#Split the data into train and test
train_data, test_data = train_test_split(df_dropped, test_size=0.1, random_state=42)

#Ready X and Ys
X_train = train_data[['sex', 'age', 'Medu', 'reason', 'traveltime', 'studytime', 'freetime', 'higher',
       'failures', 'internet']]
y_train = train_data['G3_binary_label']

X_test = test_data[['sex', 'age', 'Medu', 'reason', 'traveltime', 'studytime', 'freetime', 'higher',
       'failures', 'internet']]
y_test = test_data['G3_binary_label']

In [8]:
#Boosting
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier

#Pre-processing pipeline

# Scale numerical values: 
num_transformer = Pipeline([('standard_scaler', StandardScaler())])

# Encode categorical values
cat_transformer = OneHotEncoder()

# Parallelize "num_transformer" and "cat_transfomer"
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['age','Medu', 'traveltime', 'studytime', 'freetime', 'failures']),
    ('cat_transformer', cat_transformer, ['sex', 'reason', 'higher', 'internet']),
])

model = GradientBoostingClassifier()

pipeline4 = Pipeline([
    ('preprocessor', preprocessor),
    ('boosting', model),
])

pipeline4

In [9]:
pipeline4.fit(X_train, y_train)
y_predict4 = pipeline4.predict(X_test)

#precision
from sklearn.metrics import precision_score

precision_score(y_test, y_predict4)

0.7608695652173914

In [10]:
from joblib import dump

dump(pipeline4, 'pipeline4.joblib')

['pipeline4.joblib']

In [11]:
testing_pipeline = joblib.load('pipeline4.joblib')
testing_pipeline

In [12]:
testing_pipeline.classes_ #this way it has been fitted 

array([0, 1])