In [1]:
#surpress warnings
import warnings
warnings.filterwarnings("ignore")


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [3]:
pd.set_option('display.max_columns', 160)
df = pd.read_excel('IOF_2014-15.xls',sheet_name='IOF_2014-15')
df = df[(df['age'] >= 6) & (df['age'] <= 18)]

In [4]:
cols_req = ['district', 'food_pc_nom', 'nf_pc_nom', 'mean_cal', 'relhead', 'gender', 'age', 'slept_away_3', 'marital_status', 'read_write', 'frequented_school', 'education_level_attended', 'grade_or_year', 'studies_now', 'education_level_attending', 'gradeoryear_attending', 'school_problem_a', 'school_problem_b', 'school_problem_c', 'school_problem_d', 'school_problem_e', 'school_problem_f', 'year_started_school', 'notstudying_reason', 'frequented_school_12', 'deficiency_a', 'deficiency_b', 'deficiency_c', 'deficiency_d', 'deficiency_e', 'deficiency_f', 'deficiency_h', 'water_source', 'time_facitlity_d', 'sanitation_type', 'roof_type', 'wall_type', 'occupation', 'worker_type', 'sector' ]
new_df = df[cols_req]

In [5]:
cat_df = df[cols_req]

In [6]:
#drop rows in new_df where slep_away_3 is null, ONLY LOST 1 ROW

new_df.dropna(subset=['slept_away_3'], inplace=True)
cat_df.dropna(subset=['slept_away_3'], inplace=True)
new_df.shape

(4733, 40)

In [7]:
#split in train and test

X_train, X_test, y_train, y_test = train_test_split(new_df.drop('studies_now', axis=1), new_df['studies_now'], test_size=0.2, random_state=0)


In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_train = pd.Series(y_train)

y_test = le.transform(y_test)
y_test = pd.Series(y_test)
cat_df['studies_now'] = le.transform(cat_df['studies_now'])

In [9]:
#drop notstudying_reason and frequented_school_12 from X_train 
X_train.drop(['notstudying_reason', 'frequented_school_12','mean_cal'], axis=1, inplace=True)
cat_df.drop(['notstudying_reason', 'frequented_school_12','mean_cal'], axis=1, inplace=True)
X_train['food_pc_nom'] = X_train['food_pc_nom'] + 0.0001
cat_df['food_pc_nom'] = cat_df['food_pc_nom'] + 0.0001
X_test.drop(['notstudying_reason', 'frequented_school_12','mean_cal'], axis=1, inplace=True)
X_test['food_pc_nom'] = X_test['food_pc_nom'] + 0.0001

In [10]:
numerical_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()

In [11]:
X_train[categorical_cols].isnull().mean()*100

district                      0.000000
relhead                       0.000000
gender                        0.000000
slept_away_3                  0.000000
marital_status               52.430005
read_write                    0.026413
frequented_school             0.000000
education_level_attended      0.000000
grade_or_year                17.987322
education_level_attending    29.978870
school_problem_a             42.762810
school_problem_b             42.789223
school_problem_c             42.762810
school_problem_d             42.762810
school_problem_e             42.762810
school_problem_f             42.762810
deficiency_a                  0.000000
deficiency_b                  0.000000
deficiency_c                  0.000000
deficiency_d                  0.000000
deficiency_e                  0.000000
deficiency_f                  0.000000
deficiency_h                  0.000000
water_source                  0.000000
sanitation_type              27.179081
roof_type                

In [12]:
# for i in categorical_cols:
#     print(i)
#     print(X_train[i].unique())
#     print("----------------------")

In [13]:
#dropping occupation, worker_type and sector
X_train.drop(['occupation', 'worker_type', 'sector','marital_status'], axis=1, inplace=True)
X_test.drop(['occupation', 'worker_type', 'sector','marital_status'], axis=1, inplace=True)

cat_df.drop(['occupation', 'worker_type', 'sector','marital_status'], axis=1, inplace=True)

In [14]:
# X_train['year_started_school'].dtype

In [15]:

# gender- nominal
# district - nominal
# relhead - ordinal/nominal
# slept away - nominal
# read_write - ordinal
# frequented school- ordinal
# education level attended - ordinal
# grade or year - ordinal
# educational level attending - ordinal
# school problems - ordinal (no value high)
# deficiencies - ordinal
# water_source- ordinal
# sanitation type - ordinal
# roof type - ordinal
# wall type - ordinal
# occupation - ordinal
# worker type - ordinal
# sector- nominal/ordinal
# marital status - nominal
# nominal columns will be one hot encoded
ohe_cols = ['gender', 'district', 'relhead', 'slept_away_3' ]
# ordinal columns will be numerically encoded
nume_cols = ['read_write', 'frequented_school', 'education_level_attended', 'grade_or_year', 'education_level_attending', 'school_problem_a', 'school_problem_b', 'school_problem_c', 'school_problem_d', 'school_problem_e', 'school_problem_f', 'deficiency_a', 'deficiency_b', 'deficiency_c', 'deficiency_d', 'deficiency_e', 'deficiency_f', 'deficiency_h', 'water_source', 'sanitation_type', 'roof_type', 'wall_type']


In [16]:
# X_train[nume_cols].isnull().mean()*100

In [17]:
# # impute school problems, year started school, sanitation type, grade or year, read_write and education level attending with knn imputer
# from sklearn.impute import KNNImputer
# imputer = KNNImputer(n_neighbors=2,add_indicator=True)
# knn_impute_cols = ['school_problem_a', 'school_problem_b', 'school_problem_c', 'school_problem_d', 'school_problem_e', 'school_problem_f', 'sanitation_type', 'grade_or_year', 'read_write','education_level_attending']


In [18]:
# # print columns that are in knn_impute_cols and also in nume_cols
# a = set(knn_impute_cols).intersection(nume_cols)

In [19]:
# for i in a:
#     print(i, X_train[i].isnull().mean()*100)

In [20]:
X_train_try = X_train.copy()
X_test_try = X_test.copy()
cat_df_try = cat_df.copy()

In [21]:
# for i in nume_cols:
#     print(i)

In [22]:
# #numerically encoding missing value columns in nume_cols
"""
education_level_attending
[nan 'Lower primary' 'Upper primary' 'Lower secondary' 'Upper secondary'
 'Technical (medium)' 'Adult education' 'Technical (basic)'
 'University education' 'Technical (elementar)' 'Teacher training']
"""

"""
education_level_attended
['No education' 'Lower primary' 'Upper primary' 'Upper secondary'
 'Lower secondary' 'Technical (medium)' 'Does not know' 'Adult education'
 'Technical (basic)' 'University education' 'Technical (elementar)'
 'Teacher training']
"""

# from sklearn.preprocessing import OrdinalEncoder
oe_read_write = OrdinalEncoder(categories=[["Don't know",'No','Yes']],handle_unknown='use_encoded_value',unknown_value=-1)

oe_school_problems = OrdinalEncoder(categories=[['No','Yes']],handle_unknown='use_encoded_value',unknown_value=-1)

oe_education_level_attended = OrdinalEncoder(categories=[[
    'No education',
    'Does not know',
    'Adult education',
    'Technical (elementar)',
    'Lower primary',
    'Upper primary',
    'Lower secondary',
    'Upper secondary',
    'Technical (medium)',
    'Teacher training',
    'University education'
]],handle_unknown='use_encoded_value',unknown_value=-1)

oe_sanitation_type = OrdinalEncoder(categories=[['Unimproved latrine','Improved traditional latrine','Improved latrine','Toilet with septic tank']],handle_unknown='use_encoded_value',unknown_value=-1)

oe_grade_or_year = OrdinalEncoder(categories=[["Don't know",'0','1','2','3','4','5','6','7','8','9','10','11','12']],handle_unknown='use_encoded_value',unknown_value=-1)

oe_deficiencies = OrdinalEncoder(categories=[['No','Yes']],handle_unknown='use_encoded_value',unknown_value=-1)

oe_water_source = OrdinalEncoder(categories=[[
  # Tier 1: Unsafe for drinking without treatment
  "Unprotected well",
  "River, lake, lagoon",
  "Water from unprotected springs",

  # Tier 2: Potentially unsafe but improvable with basic treatment
  "Well with manual pump",
  "Water fountain",
  "Rainwater in Africa",  # Note: Rainwater quality can vary greatly depending on location and collection methods. Consider treating before drinking.

  # Tier 3: Generally safe but may require basic filtration
  "Piped in the neighbor's dwelling",
  "Piped in the yard",

  # Tier 4: Safe for drinking
  "Piped inside the dwelling",
  "Water cistern (or mobile tank/truck)",
  "Bottled water",
  "Water from protected springs",  # High-quality protected springs can be considered safe.
  "Borehole",

]
],handle_unknown='use_encoded_value',unknown_value=-1)

oe_roof_type = OrdinalEncoder(categories=[['Other','Grass/stem/wood',  'Zinc sheets', 'Cement fibler sheets', 'Concrete', 'Roof tile']
],handle_unknown='use_encoded_value',unknown_value=-1)

oe_wall_type = OrdinalEncoder(categories=[['Other' ,'Sticks (paus maticados)','Bamboo/reed/palms', 'Adobe blocks', 'Brick blocks (blocos de tijolo)', 'Wood/zinc', 'Cement blocks']
],handle_unknown='use_encoded_value',unknown_value=-1)

oe_frequented_school = OrdinalEncoder(categories=[['No','Yes']],handle_unknown='use_encoded_value',unknown_value=-1)

oe_education_level_attending = OrdinalEncoder(categories=[[
    'Technical (elementar)',
    'Technical (basic)',
    'Lower primary',
    'Upper primary',
    'Lower secondary',
    'Upper secondary',
    'Technical (medium)',
    'Teacher training',
    'University education',
    'Adult education'
]],handle_unknown='use_encoded_value',unknown_value=-1)

In [23]:
X_train['read_write'] = oe_read_write.fit_transform(X_train['read_write'].values.reshape(-1,1))
X_test['read_write'] = oe_read_write.transform(X_test['read_write'].values.reshape(-1,1))
cat_df['read_write'] = oe_read_write.transform(cat_df['read_write'].values.reshape(-1,1))

In [24]:
school_problem_cols = ['school_problem_a', 'school_problem_b', 'school_problem_c', 'school_problem_d', 'school_problem_e', 'school_problem_f']
for i in school_problem_cols:
    X_train[i] = oe_school_problems.fit_transform(X_train[i].values.reshape(-1,1))
    X_test[i] = oe_school_problems.transform(X_test[i].values.reshape(-1,1))
    cat_df[i] = oe_school_problems.transform(cat_df[i].values.reshape(-1,1))

In [25]:
X_train['education_level_attending'] = oe_education_level_attending.fit_transform(X_train['education_level_attending'].values.reshape(-1,1))  
X_test['education_level_attending'] = oe_education_level_attending.transform(X_test['education_level_attending'].values.reshape(-1,1)) 
cat_df['education_level_attending'] = oe_education_level_attending.transform(cat_df['education_level_attending'].values.reshape(-1,1)) 

In [26]:
X_train['sanitation_type'] = oe_sanitation_type.fit_transform(X_train['sanitation_type'].values.reshape(-1,1))
X_test['sanitation_type'] = oe_sanitation_type.transform(X_test['sanitation_type'].values.reshape(-1,1))
cat_df['sanitation_type'] = oe_sanitation_type.transform(cat_df['sanitation_type'].values.reshape(-1,1))

In [27]:
X_train['grade_or_year'] = oe_grade_or_year.fit_transform(X_train['grade_or_year'].values.reshape(-1,1))
X_test['grade_or_year'] = oe_grade_or_year.transform(X_test['grade_or_year'].values.reshape(-1,1))
cat_df['grade_or_year'] = oe_grade_or_year.transform(cat_df['grade_or_year'].values.reshape(-1,1))

In [28]:
X_train['education_level_attended'] = oe_education_level_attended.fit_transform(X_train['education_level_attended'].values.reshape(-1,1))
X_test['education_level_attended'] = oe_education_level_attended.transform(X_test['education_level_attended'].values.reshape(-1,1))
cat_df['education_level_attended'] = oe_education_level_attended.transform(cat_df['education_level_attended'].values.reshape(-1,1))

In [29]:
X_train['wall_type'] = oe_wall_type.fit_transform(X_train['wall_type'].values.reshape(-1,1))
X_test['wall_type'] = oe_wall_type.transform(X_test['wall_type'].values.reshape(-1,1))
cat_df['wall_type'] = oe_wall_type.transform(cat_df['wall_type'].values.reshape(-1,1))

X_train['roof_type'] = oe_roof_type.fit_transform(X_train['roof_type'].values.reshape(-1,1))
X_test['roof_type'] = oe_roof_type.transform(X_test['roof_type'].values.reshape(-1,1))
cat_df['roof_type'] = oe_roof_type.transform(cat_df['roof_type'].values.reshape(-1,1))

X_train['water_source'] = oe_water_source.fit_transform(X_train['water_source'].values.reshape(-1,1))
X_test['water_source'] = oe_water_source.transform(X_test['water_source'].values.reshape(-1,1))
cat_df['water_source'] = oe_water_source.transform(cat_df['water_source'].values.reshape(-1,1))

deficiency_cols = ['deficiency_a', 'deficiency_b', 'deficiency_c', 'deficiency_d', 'deficiency_e', 'deficiency_f', 'deficiency_h']
for i in deficiency_cols:
    X_train[i] = oe_deficiencies.fit_transform(X_train[i].values.reshape(-1,1))
    X_test[i] = oe_deficiencies.transform(X_test[i].values.reshape(-1,1))
    cat_df[i] = oe_deficiencies.transform(cat_df[i].values.reshape(-1,1))

X_train['frequented_school'] = oe_frequented_school.fit_transform(X_train['frequented_school'].values.reshape(-1,1))
X_test['frequented_school'] = oe_frequented_school.transform(X_test['frequented_school'].values.reshape(-1,1))  
cat_df['frequented_school'] = oe_frequented_school.transform(cat_df['frequented_school'].values.reshape(-1,1))



In [30]:

# null_counts_per_row = X_train_try.isnull().sum(axis=1)

# # Filter rows with more than 4 null values
# rows_with_more_than_4_nulls = null_counts_per_row[null_counts_per_row > 4]

# # Get the row numbers
# row_numbers = rows_with_more_than_4_nulls.index

# row_numbers

In [31]:
# X_train[nume_cols]

In [32]:
# #display X_train_try rows that are in row_numbers
# X_train_try[nume_cols]

In [33]:
# X_train.isnull().mean()*100

In [34]:
# ohe_cols

In [35]:
X_train_new = X_train[nume_cols+ohe_cols].copy()
X_test_new = X_test[nume_cols+ohe_cols].copy()
cat_df_new = cat_df[nume_cols+ohe_cols].copy()

In [36]:
# X_train_new

In [37]:
# # generate value_counts for ohe_cols
# for i in ohe_cols:
#     print(i)
#     print(X_train[i].value_counts())
#     print("----------------------")

In [38]:
# X_train_new.shape

In [39]:
# threshold_district = 20
# threshold_relhead = 31
# counts_district_train = X_train['district'].value_counts()
# counts_district_test = X_test['district'].value_counts()
# counts_relhead_train = X_train['relhead'].value_counts()
# counts_relhead_test = X_test['relhead'].value_counts()
# repl_district_train = counts_district_train[counts_district_train <= threshold_district].index
# repl_district_test = counts_district_test[counts_district_test <= threshold_district].index
# repl_relhead_train = counts_relhead_train[counts_relhead_train <= threshold_relhead].index
# repl_relhead_test = counts_relhead_test[counts_relhead_test <= threshold_relhead].index

# X1_train=pd.get_dummies(X_train_new['district'].replace(repl_district_train, 'uncommon_district'))
# X1_test=pd.get_dummies(X_test_new['district'].replace(repl_district_test, 'uncommon_district'))
# X2_train= pd.get_dummies(X_train_new['relhead'].replace(repl_relhead_train, 'uncommon_relhead'))
# X2_test= pd.get_dummies(X_test_new['relhead'].replace(repl_relhead_test, 'uncommon_relhead'))



In [40]:
# X_test_new

In [41]:
#sklearn one hot encoder
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False,drop='first')
ohe.fit(X_train_new[ohe_cols])

In [42]:
X_train_new_ohe_cols = ohe.transform(X_train_new[ohe_cols])
X_test_new_ohe_cols = ohe.transform(X_test_new[ohe_cols])
cat_df_new_ohe_cols = ohe.transform(cat_df_new[ohe_cols])

In [43]:
X_train_new_t=np.hstack((X_train_new,X_train_new_ohe_cols))
X_train_new_t = pd.DataFrame(X_train_new_t, columns = X_train_new.columns.tolist()+ohe.get_feature_names_out().tolist())

X_test_new_t=np.hstack((X_test_new,X_test_new_ohe_cols))
X_test_new_t = pd.DataFrame(X_test_new_t, columns = X_test_new.columns.tolist()+ohe.get_feature_names_out().tolist())

cat_df_new_t=np.hstack((cat_df_new,cat_df_new_ohe_cols))
cat_df_new_t = pd.DataFrame(cat_df_new_t, columns = cat_df_new.columns.tolist()+ohe.get_feature_names_out().tolist())

In [44]:
X_train_new_t.drop(ohe_cols, axis=1, inplace=True)
X_test_new_t.drop(ohe_cols, axis=1, inplace=True)
cat_df_new_t.drop(ohe_cols, axis=1, inplace=True)

In [45]:
X_train_new_t.shape , X_test_new_t.shape , y_train.shape , y_test.shape, cat_df_new_t.shape

((3786, 174), (947, 174), (3786,), (947,), (4733, 174))

In [46]:
#drop frequented_schools,education_level_Attended and grade or year from X_train_new_t and X_test_new_t
X_train_new_t.drop(['frequented_school','education_level_attended','grade_or_year'], axis=1, inplace=True)
X_test_new_t.drop(['frequented_school','education_level_attended','grade_or_year'], axis=1, inplace=True)

cat_df_with_fselagoy = cat_df_new_t.copy()
cat_df_new_t.drop(['frequented_school','education_level_attended','grade_or_year'], axis=1, inplace=True)



In [47]:
#convert cat_df_new_t to csv
cat_df_new_t.to_csv('cat_df.csv', index=False)
cat_df_with_fselagoy.to_csv('cat_Df_fselagoy.csv', index=False)


In [48]:
lr = LogisticRegression(random_state=0)

In [49]:
lr.fit(X_train_new_t, y_train)

In [50]:
lr_pred = lr.predict(X_test_new_t)


In [51]:
y_test.value_counts()   

1    658
2    172
0    117
Name: count, dtype: int64

In [52]:
# generate accuracy scores and classifier report
print(accuracy_score(y_test, lr_pred))
print(classification_report(y_test, lr_pred))

0.9429778247096093
              precision    recall  f1-score   support

           0       0.91      0.60      0.72       117
           1       1.00      1.00      1.00       658
           2       0.78      0.96      0.86       172

    accuracy                           0.94       947
   macro avg       0.90      0.85      0.86       947
weighted avg       0.95      0.94      0.94       947



In [53]:
#cross validate
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lr, X_test_new_t, y_test, cv=5)
print(scores.mean())

0.9345530492898912


In [54]:
X_train_new_t['deficiency_h'].value_counts()

deficiency_h
0.0    3786
Name: count, dtype: int64

In [55]:
# convert X_train_new_t columns to a dictionary

X_train_new_t_cols = X_train_new_t.columns.tolist()
X_train_new_t_cols_dict = {}
for i in X_train_new_t_cols:
    X_train_new_t_cols_dict[i] = X_train_new_t[i].unique().tolist()

X_train_new_t_cols_dict

{'read_write': [1.0, 0.0, 2.0, -1.0],
 'education_level_attending': [-1.0,
  2.0,
  3.0,
  4.0,
  5.0,
  6.0,
  9.0,
  1.0,
  8.0,
  0.0,
  7.0],
 'school_problem_a': [-1.0, 0.0, 1.0],
 'school_problem_b': [-1.0, 0.0, 1.0],
 'school_problem_c': [-1.0, 0.0, 1.0],
 'school_problem_d': [-1.0, 0.0, 1.0],
 'school_problem_e': [-1.0, 0.0, 1.0],
 'school_problem_f': [-1.0, 1.0, 0.0],
 'deficiency_a': [1.0, 0.0],
 'deficiency_b': [0.0, 1.0],
 'deficiency_c': [0.0, 1.0],
 'deficiency_d': [0.0, 1.0],
 'deficiency_e': [0.0, 1.0],
 'deficiency_f': [0.0, 1.0],
 'deficiency_h': [0.0],
 'water_source': [1.0,
  3.0,
  7.0,
  -1.0,
  0.0,
  12.0,
  6.0,
  8.0,
  2.0,
  4.0,
  11.0,
  9.0,
  10.0],
 'sanitation_type': [0.0, -1.0, 1.0, 2.0, 3.0],
 'roof_type': [1.0, 2.0, 4.0, 3.0, 0.0, 5.0],
 'wall_type': [3.0, 6.0, 1.0, 0.0, 4.0, 5.0, 2.0],
 'gender_Male': [1.0, 0.0],
 'district_ANCUABE': [0.0, 1.0],
 'district_ANGOCHE': [0.0, 1.0],
 'district_ANGONIA': [0.0, 1.0],
 'district_BALAMA': [0.0, 1.0],
 'dist

In [56]:
#test other models as well
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
#test each model and print result
models = []
models.append(('LR', LogisticRegression()))
models.append(('RF', RandomForestClassifier()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVC', SVC()))
models.append(('GB', GradientBoostingClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('BG', BaggingClassifier()))
models.append(('NB', GaussianNB()))

#in a loop one by one fit and test data
for name, model in models:
    model.fit(X_train_new_t, y_train)
    y_pred = model.predict(X_test_new_t)
    print(name)
    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("----------------------")


LR
0.9429778247096093
              precision    recall  f1-score   support

           0       0.91      0.60      0.72       117
           1       1.00      1.00      1.00       658
           2       0.78      0.96      0.86       172

    accuracy                           0.94       947
   macro avg       0.90      0.85      0.86       947
weighted avg       0.95      0.94      0.94       947

----------------------
RF
0.9366420274551215
              precision    recall  f1-score   support

           0       0.83      0.62      0.71       117
           1       1.00      1.00      1.00       658
           2       0.78      0.91      0.84       172

    accuracy                           0.94       947
   macro avg       0.87      0.84      0.85       947
weighted avg       0.94      0.94      0.93       947

----------------------
KNN
0.9176346356916578
              precision    recall  f1-score   support

           0       0.74      0.51      0.61       117
           1    

In [57]:
# #check correlation of X_train_new_t with y_train
# X_train_new_t['y_train'] = y_train
# X_train_new_t.corr()['y_train'].sort_values(ascending=False)


# #create a correlation matrix and display all rows and columns
# corr_matrix = X_train_new_t.corr()
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# #display correlation with y_train
# corr_matrix['y_train'].sort_values(ascending=True)


In [58]:
#print that grade or year, education level attended and frequented school are highly correlated
print("grade_or_year, education_level_attended and frequented_school are highly correlated")

grade_or_year, education_level_attended and frequented_school are highly correlated


In [59]:
# apply sequential feature selector
# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# sfs = SFS(lr, 
#           k_features=100, 
#           forward=False, 
#           floating=False, 
#           scoring='accuracy',
#           cv=5,n_jobs=-1,
#           verbose=2)
# model_sfs = sfs.fit(X_train_new_t, y_train)


In [60]:
# model_sfs.k_feature_names_

In [None]:
# #get best model score
# model_sfs.k_score_


In [63]:
from shapash.explainer.smart_explainer import SmartExplainer
SE = SmartExplainer(model =lr)
SE.compile(
x=X_test_new_t,

)

INFO: Shap explainer type - shap.explainers.PermutationExplainer()


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''