In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statistics import mode as md

In [8]:
X_test = pd.read_csv("../tanzanian_water_wells/X_test.csv")
X_train = pd.read_csv("../tanzanian_water_wells/X_train.csv")
y_train = pd.read_csv("../tanzanian_water_wells/y_train.csv")

df = pd.concat([X_train, y_train], axis=1)

In [9]:
desc = {'amount_tsh': 'Total static head (amount water available to waterpoint)',
                    'date_recorded': 'The date the row was entered',
                    'funder': 'Who funded the well',
                    'gps_height': 'Altitude of the well',
                    'installer': 'Organization that installed the well',
                    'longitude': 'GPS coordinate',
                    'latitude': 'GPS coordinate',
                    'wpt_name': 'Name of the waterpoint if there is one',
                    'subvillage': 'Geographic location',
                    'region': 'Geographic location',
                    'region_code': 'Geographic location (coded)',
                    'district_code': 'Geographic location (coded)',
                    'lga': 'Geographic location',
                    'ward': 'Geographic location',
                    'population': 'Population around the well',
                    'public_meeting': 'True/False',
                    'recorded_by': 'Group entering this row of data',
                    'scheme_management': 'Who operates the waterpoint',
                    'scheme_name': 'Who operates the waterpoint',
                    'permit': 'If the waterpoint is permitted',
                    'construction_year': 'Year the waterpoint was constructed',
                    'extraction_type': 'The kind of extraction the waterpoint uses',
                    'extraction_type_group': 'The kind of extraction the waterpoint uses',
                    'extraction_type_class': 'The kind of extraction the waterpoint uses',
                    'management': 'How the waterpoint is managed',
                    'management_group': 'How the waterpoint is managed',
                    'payment': 'What the water costs',
                    'payment_type': 'What the water costs',
                    'water_quality': 'The quality of the water',
                    'quality_group': 'The quality of the water',
                    'quantity': 'The quantity of water',
                    'quantity_group': 'The quantity of water',
                    'source': 'The source of the water',
                    'source_type': 'The source of the water',
                    'source_class': 'The source of the water',
                    'waterpoint_type': 'The kind of waterpoint',
                    'waterpoint_type_group': 'The kind of waterpoint'}

In [10]:
# Eliminating null values

df.funder.fillna("Unknown", inplace=True)
df.installer.fillna("Unknown", inplace=True)
df.scheme_management.fillna("None", inplace=True)
df.permit.fillna('Unknown', inplace=True)
df.scheme_name.fillna('Unknown', inplace=True)
df.subvillage.fillna('Unknown', inplace=True)
df.public_meeting.fillna('Unknown', inplace=True)

# Defining the train and test sets

In [11]:
X = df.copy()

columns = ['amount_tsh', 'gps_height', 'population', 'region', 'lga', 
           'scheme_management', 'permit', 'construction_year',
           'extraction_type_group', 'payment', 'management', 
           'quality_group', 'quantity', 'source', 'waterpoint_type']

X = X[columns]

# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
# X['district_code'] = X['district_code'].astype('object')

X_cat = X.drop(list(X.select_dtypes(['float64']).columns), axis=1)
X_numeric = X[list(X.select_dtypes(['float64']).columns)]

y = df['status_group']

X_cat = pd.get_dummies(X_cat)

X = pd.concat([X_numeric, X_cat], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),
                index = X_train.index,
                columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                index = X_test.index,
                columns = X_test.columns)

X_train.reset_index(inplace=True, drop=True)
y_train = y_train.reset_index(drop=True)

In [45]:
X_train_sample = X_train.sample(n=1000, random_state=42)
y_train_sample = y_train.iloc[list(X_train_sample.index)]

# Base Model – Logistic Regression, No Regularization

In [54]:
logreg = LogisticRegression(max_iter = 1000, random_state = 42)

In [55]:
param_grid = {
    'penalty': ['None', 'l1', 'l2', 'elasticnet'], 'C': [0, 1, 5, 10, 100, 1000, 10000], 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

gs_lr = GridSearchCV(logreg, param_grid, cv=3, verbose=10)
gs_lr.fit(X_train_sample, y_train_sample)

Fitting 3 folds for each of 168 candidates, totalling 504 fits
[CV 1/3; 1/168] START C=0, penalty=None, solver=lbfgs...........................
[CV 1/3; 1/168] END C=0, penalty=None, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3; 1/168] START C=0, penalty=None, solver=lbfgs...........................
[CV 2/3; 1/168] END C=0, penalty=None, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3; 1/168] START C=0, penalty=None, solver=lbfgs...........................
[CV 3/3; 1/168] END C=0, penalty=None, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3; 2/168] START C=0, penalty=None, solver=liblinear.......................
[CV 1/3; 2/168] END C=0, penalty=None, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3; 2/168] START C=0, penalty=None, solver=liblinear.......................
[CV 2/3; 2/168] END C=0, penalty=None, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3; 2/168] START C=0, penalty=None, solver=liblinear.......................
[CV 3/3; 2/168] END C=0, pen

[CV 2/3; 32/168] END C=1, penalty=l1, solver=liblinear;, score=0.682 total time=   0.2s
[CV 3/3; 32/168] START C=1, penalty=l1, solver=liblinear........................
[CV 3/3; 32/168] END C=1, penalty=l1, solver=liblinear;, score=0.667 total time=   0.2s
[CV 1/3; 33/168] START C=1, penalty=l1, solver=newton-cg........................
[CV 1/3; 33/168] END C=1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/3; 33/168] START C=1, penalty=l1, solver=newton-cg........................
[CV 2/3; 33/168] END C=1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/3; 33/168] START C=1, penalty=l1, solver=newton-cg........................
[CV 3/3; 33/168] END C=1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/3; 34/168] START C=1, penalty=l1, solver=newton-cholesky..................
[CV 1/3; 34/168] END C=1, penalty=l1, solver=newton-cholesky;, score=nan total time=   0.0s
[CV 2/3; 34/168] START C=1, penalty=l1, solver=newton-cholesky.......



[CV 1/3; 36/168] END C=1, penalty=l1, solver=saga;, score=0.668 total time=   2.2s
[CV 2/3; 36/168] START C=1, penalty=l1, solver=saga.............................




[CV 2/3; 36/168] END C=1, penalty=l1, solver=saga;, score=0.682 total time=   2.2s
[CV 3/3; 36/168] START C=1, penalty=l1, solver=saga.............................




[CV 3/3; 36/168] END C=1, penalty=l1, solver=saga;, score=0.658 total time=   2.2s
[CV 1/3; 37/168] START C=1, penalty=l2, solver=lbfgs............................
[CV 1/3; 37/168] END C=1, penalty=l2, solver=lbfgs;, score=0.653 total time=   0.1s
[CV 2/3; 37/168] START C=1, penalty=l2, solver=lbfgs............................
[CV 2/3; 37/168] END C=1, penalty=l2, solver=lbfgs;, score=0.658 total time=   0.1s
[CV 3/3; 37/168] START C=1, penalty=l2, solver=lbfgs............................
[CV 3/3; 37/168] END C=1, penalty=l2, solver=lbfgs;, score=0.649 total time=   0.1s
[CV 1/3; 38/168] START C=1, penalty=l2, solver=liblinear........................
[CV 1/3; 38/168] END C=1, penalty=l2, solver=liblinear;, score=0.677 total time=   0.2s
[CV 2/3; 38/168] START C=1, penalty=l2, solver=liblinear........................
[CV 2/3; 38/168] END C=1, penalty=l2, solver=liblinear;, score=0.631 total time=   0.1s
[CV 3/3; 38/168] START C=1, penalty=l2, solver=liblinear........................
[CV



[CV 1/3; 41/168] END C=1, penalty=l2, solver=sag;, score=0.653 total time=   1.5s
[CV 2/3; 41/168] START C=1, penalty=l2, solver=sag..............................
[CV 2/3; 41/168] END C=1, penalty=l2, solver=sag;, score=0.646 total time=   1.0s
[CV 3/3; 41/168] START C=1, penalty=l2, solver=sag..............................




[CV 3/3; 41/168] END C=1, penalty=l2, solver=sag;, score=0.655 total time=   1.4s
[CV 1/3; 42/168] START C=1, penalty=l2, solver=saga.............................




[CV 1/3; 42/168] END C=1, penalty=l2, solver=saga;, score=0.656 total time=   1.6s
[CV 2/3; 42/168] START C=1, penalty=l2, solver=saga.............................




[CV 2/3; 42/168] END C=1, penalty=l2, solver=saga;, score=0.643 total time=   1.6s
[CV 3/3; 42/168] START C=1, penalty=l2, solver=saga.............................




[CV 3/3; 42/168] END C=1, penalty=l2, solver=saga;, score=0.649 total time=   1.6s
[CV 1/3; 43/168] START C=1, penalty=elasticnet, solver=lbfgs....................
[CV 1/3; 43/168] END C=1, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3; 43/168] START C=1, penalty=elasticnet, solver=lbfgs....................
[CV 2/3; 43/168] END C=1, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3; 43/168] START C=1, penalty=elasticnet, solver=lbfgs....................
[CV 3/3; 43/168] END C=1, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3; 44/168] START C=1, penalty=elasticnet, solver=liblinear................
[CV 1/3; 44/168] END C=1, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3; 44/168] START C=1, penalty=elasticnet, solver=liblinear................
[CV 2/3; 44/168] END C=1, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3; 44/168] START C=1, penalty=elasticnet, solver



[CV 1/3; 60/168] END C=5, penalty=l1, solver=saga;, score=0.656 total time=   2.3s
[CV 2/3; 60/168] START C=5, penalty=l1, solver=saga.............................




[CV 2/3; 60/168] END C=5, penalty=l1, solver=saga;, score=0.649 total time=   2.3s
[CV 3/3; 60/168] START C=5, penalty=l1, solver=saga.............................




[CV 3/3; 60/168] END C=5, penalty=l1, solver=saga;, score=0.655 total time=   2.3s
[CV 1/3; 61/168] START C=5, penalty=l2, solver=lbfgs............................
[CV 1/3; 61/168] END C=5, penalty=l2, solver=lbfgs;, score=0.659 total time=   0.1s
[CV 2/3; 61/168] START C=5, penalty=l2, solver=lbfgs............................
[CV 2/3; 61/168] END C=5, penalty=l2, solver=lbfgs;, score=0.646 total time=   0.2s
[CV 3/3; 61/168] START C=5, penalty=l2, solver=lbfgs............................
[CV 3/3; 61/168] END C=5, penalty=l2, solver=lbfgs;, score=0.652 total time=   0.2s
[CV 1/3; 62/168] START C=5, penalty=l2, solver=liblinear........................
[CV 1/3; 62/168] END C=5, penalty=l2, solver=liblinear;, score=0.659 total time=   0.3s
[CV 2/3; 62/168] START C=5, penalty=l2, solver=liblinear........................
[CV 2/3; 62/168] END C=5, penalty=l2, solver=liblinear;, score=0.637 total time=   0.2s
[CV 3/3; 62/168] START C=5, penalty=l2, solver=liblinear........................
[CV



[CV 1/3; 65/168] END C=5, penalty=l2, solver=sag;, score=0.641 total time=   1.5s
[CV 2/3; 65/168] START C=5, penalty=l2, solver=sag..............................




[CV 2/3; 65/168] END C=5, penalty=l2, solver=sag;, score=0.634 total time=   1.4s
[CV 3/3; 65/168] START C=5, penalty=l2, solver=sag..............................




[CV 3/3; 65/168] END C=5, penalty=l2, solver=sag;, score=0.646 total time=   1.4s
[CV 1/3; 66/168] START C=5, penalty=l2, solver=saga.............................




[CV 1/3; 66/168] END C=5, penalty=l2, solver=saga;, score=0.647 total time=   1.6s
[CV 2/3; 66/168] START C=5, penalty=l2, solver=saga.............................




[CV 2/3; 66/168] END C=5, penalty=l2, solver=saga;, score=0.631 total time=   1.6s
[CV 3/3; 66/168] START C=5, penalty=l2, solver=saga.............................




[CV 3/3; 66/168] END C=5, penalty=l2, solver=saga;, score=0.643 total time=   1.6s
[CV 1/3; 67/168] START C=5, penalty=elasticnet, solver=lbfgs....................
[CV 1/3; 67/168] END C=5, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3; 67/168] START C=5, penalty=elasticnet, solver=lbfgs....................
[CV 2/3; 67/168] END C=5, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3; 67/168] START C=5, penalty=elasticnet, solver=lbfgs....................
[CV 3/3; 67/168] END C=5, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3; 68/168] START C=5, penalty=elasticnet, solver=liblinear................
[CV 1/3; 68/168] END C=5, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3; 68/168] START C=5, penalty=elasticnet, solver=liblinear................
[CV 2/3; 68/168] END C=5, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3; 68/168] START C=5, penalty=elasticnet, solver



[CV 1/3; 84/168] END C=10, penalty=l1, solver=saga;, score=0.653 total time=   2.3s
[CV 2/3; 84/168] START C=10, penalty=l1, solver=saga............................




[CV 2/3; 84/168] END C=10, penalty=l1, solver=saga;, score=0.640 total time=   2.3s
[CV 3/3; 84/168] START C=10, penalty=l1, solver=saga............................




[CV 3/3; 84/168] END C=10, penalty=l1, solver=saga;, score=0.646 total time=   2.3s
[CV 1/3; 85/168] START C=10, penalty=l2, solver=lbfgs...........................
[CV 1/3; 85/168] END C=10, penalty=l2, solver=lbfgs;, score=0.659 total time=   0.1s
[CV 2/3; 85/168] START C=10, penalty=l2, solver=lbfgs...........................
[CV 2/3; 85/168] END C=10, penalty=l2, solver=lbfgs;, score=0.634 total time=   0.1s
[CV 3/3; 85/168] START C=10, penalty=l2, solver=lbfgs...........................
[CV 3/3; 85/168] END C=10, penalty=l2, solver=lbfgs;, score=0.649 total time=   0.1s
[CV 1/3; 86/168] START C=10, penalty=l2, solver=liblinear.......................
[CV 1/3; 86/168] END C=10, penalty=l2, solver=liblinear;, score=0.653 total time=   0.3s
[CV 2/3; 86/168] START C=10, penalty=l2, solver=liblinear.......................
[CV 2/3; 86/168] END C=10, penalty=l2, solver=liblinear;, score=0.637 total time=   0.3s
[CV 3/3; 86/168] START C=10, penalty=l2, solver=liblinear.....................



[CV 1/3; 89/168] END C=10, penalty=l2, solver=sag;, score=0.638 total time=   1.4s
[CV 2/3; 89/168] START C=10, penalty=l2, solver=sag.............................




[CV 2/3; 89/168] END C=10, penalty=l2, solver=sag;, score=0.622 total time=   1.4s
[CV 3/3; 89/168] START C=10, penalty=l2, solver=sag.............................




[CV 3/3; 89/168] END C=10, penalty=l2, solver=sag;, score=0.643 total time=   1.4s
[CV 1/3; 90/168] START C=10, penalty=l2, solver=saga............................




[CV 1/3; 90/168] END C=10, penalty=l2, solver=saga;, score=0.647 total time=   1.6s
[CV 2/3; 90/168] START C=10, penalty=l2, solver=saga............................




[CV 2/3; 90/168] END C=10, penalty=l2, solver=saga;, score=0.628 total time=   1.6s
[CV 3/3; 90/168] START C=10, penalty=l2, solver=saga............................




[CV 3/3; 90/168] END C=10, penalty=l2, solver=saga;, score=0.640 total time=   1.6s
[CV 1/3; 91/168] START C=10, penalty=elasticnet, solver=lbfgs...................
[CV 1/3; 91/168] END C=10, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3; 91/168] START C=10, penalty=elasticnet, solver=lbfgs...................
[CV 2/3; 91/168] END C=10, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3; 91/168] START C=10, penalty=elasticnet, solver=lbfgs...................
[CV 3/3; 91/168] END C=10, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3; 92/168] START C=10, penalty=elasticnet, solver=liblinear...............
[CV 1/3; 92/168] END C=10, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3; 92/168] START C=10, penalty=elasticnet, solver=liblinear...............
[CV 2/3; 92/168] END C=10, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3; 92/168] START C=10, penalty=elasticnet,



[CV 1/3; 108/168] END C=100, penalty=l1, solver=saga;, score=0.644 total time=   2.4s
[CV 2/3; 108/168] START C=100, penalty=l1, solver=saga..........................




[CV 2/3; 108/168] END C=100, penalty=l1, solver=saga;, score=0.628 total time=   2.5s
[CV 3/3; 108/168] START C=100, penalty=l1, solver=saga..........................




[CV 3/3; 108/168] END C=100, penalty=l1, solver=saga;, score=0.640 total time=   2.4s
[CV 1/3; 109/168] START C=100, penalty=l2, solver=lbfgs.........................
[CV 1/3; 109/168] END C=100, penalty=l2, solver=lbfgs;, score=0.644 total time=   0.3s
[CV 2/3; 109/168] START C=100, penalty=l2, solver=lbfgs.........................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3; 109/168] END C=100, penalty=l2, solver=lbfgs;, score=0.625 total time=   0.3s
[CV 3/3; 109/168] START C=100, penalty=l2, solver=lbfgs.........................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3; 109/168] END C=100, penalty=l2, solver=lbfgs;, score=0.640 total time=   0.3s
[CV 1/3; 110/168] START C=100, penalty=l2, solver=liblinear.....................
[CV 1/3; 110/168] END C=100, penalty=l2, solver=liblinear;, score=0.644 total time=   0.4s
[CV 2/3; 110/168] START C=100, penalty=l2, solver=liblinear.....................
[CV 2/3; 110/168] END C=100, penalty=l2, solver=liblinear;, score=0.631 total time=   0.6s
[CV 3/3; 110/168] START C=100, penalty=l2, solver=liblinear.....................
[CV 3/3; 110/168] END C=100, penalty=l2, solver=liblinear;, score=0.637 total time=   0.3s
[CV 1/3; 111/168] START C=100, penalty=l2, solver=newton-cg.....................
[CV 1/3; 111/168] END C=100, penalty=l2, solver=newton-cg;, score=0.647 total time=   0.3s
[CV 2/3; 111/168] START C=100, penalty=l2, solver=newton-cg.....................
[CV 2/3; 111/168] END C=100, penalty=l2, solver=newton-cg;, score=0.619 total time=   0.5s
[CV 3/3; 111/168] START C=100, penalty=l2, solver=new



[CV 1/3; 113/168] END C=100, penalty=l2, solver=sag;, score=0.635 total time=   1.5s
[CV 2/3; 113/168] START C=100, penalty=l2, solver=sag...........................




[CV 2/3; 113/168] END C=100, penalty=l2, solver=sag;, score=0.622 total time=   1.4s
[CV 3/3; 113/168] START C=100, penalty=l2, solver=sag...........................




[CV 3/3; 113/168] END C=100, penalty=l2, solver=sag;, score=0.643 total time=   1.4s
[CV 1/3; 114/168] START C=100, penalty=l2, solver=saga..........................




[CV 1/3; 114/168] END C=100, penalty=l2, solver=saga;, score=0.644 total time=   1.6s
[CV 2/3; 114/168] START C=100, penalty=l2, solver=saga..........................




[CV 2/3; 114/168] END C=100, penalty=l2, solver=saga;, score=0.628 total time=   1.6s
[CV 3/3; 114/168] START C=100, penalty=l2, solver=saga..........................




[CV 3/3; 114/168] END C=100, penalty=l2, solver=saga;, score=0.640 total time=   1.6s
[CV 1/3; 115/168] START C=100, penalty=elasticnet, solver=lbfgs.................
[CV 1/3; 115/168] END C=100, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3; 115/168] START C=100, penalty=elasticnet, solver=lbfgs.................
[CV 2/3; 115/168] END C=100, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3; 115/168] START C=100, penalty=elasticnet, solver=lbfgs.................
[CV 3/3; 115/168] END C=100, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3; 116/168] START C=100, penalty=elasticnet, solver=liblinear.............
[CV 1/3; 116/168] END C=100, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3; 116/168] START C=100, penalty=elasticnet, solver=liblinear.............
[CV 2/3; 116/168] END C=100, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3; 116/168] START C=100, penal



[CV 1/3; 132/168] END C=1000, penalty=l1, solver=saga;, score=0.644 total time=   2.3s
[CV 2/3; 132/168] START C=1000, penalty=l1, solver=saga.........................




[CV 2/3; 132/168] END C=1000, penalty=l1, solver=saga;, score=0.628 total time=   2.3s
[CV 3/3; 132/168] START C=1000, penalty=l1, solver=saga.........................




[CV 3/3; 132/168] END C=1000, penalty=l1, solver=saga;, score=0.640 total time=   2.2s
[CV 1/3; 133/168] START C=1000, penalty=l2, solver=lbfgs........................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3; 133/168] END C=1000, penalty=l2, solver=lbfgs;, score=0.644 total time=   0.4s
[CV 2/3; 133/168] START C=1000, penalty=l2, solver=lbfgs........................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3; 133/168] END C=1000, penalty=l2, solver=lbfgs;, score=0.628 total time=   0.4s
[CV 3/3; 133/168] START C=1000, penalty=l2, solver=lbfgs........................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3; 133/168] END C=1000, penalty=l2, solver=lbfgs;, score=0.640 total time=   0.5s
[CV 1/3; 134/168] START C=1000, penalty=l2, solver=liblinear....................
[CV 1/3; 134/168] END C=1000, penalty=l2, solver=liblinear;, score=0.644 total time=   0.6s
[CV 2/3; 134/168] START C=1000, penalty=l2, solver=liblinear....................
[CV 2/3; 134/168] END C=1000, penalty=l2, solver=liblinear;, score=0.607 total time=   1.3s
[CV 3/3; 134/168] START C=1000, penalty=l2, solver=liblinear....................
[CV 3/3; 134/168] END C=1000, penalty=l2, solver=liblinear;, score=0.622 total time=   0.5s
[CV 1/3; 135/168] START C=1000, penalty=l2, solver=newton-cg....................
[CV 1/3; 135/168] END C=1000, penalty=l2, solver=newton-cg;, score=0.644 total time=   1.2s
[CV 2/3; 135/168] START C=1000, penalty=l2, solver=newton-cg....................
[CV 2/3; 135/168] END C=1000, penalty=l2, solver=newton-cg;, score=0.619 total time=   2.1s
[CV 3/3; 135/168] START C=1000, penalty=l2, sol



[CV 1/3; 137/168] END C=1000, penalty=l2, solver=sag;, score=0.635 total time=   1.5s
[CV 2/3; 137/168] START C=1000, penalty=l2, solver=sag..........................




[CV 2/3; 137/168] END C=1000, penalty=l2, solver=sag;, score=0.622 total time=   1.4s
[CV 3/3; 137/168] START C=1000, penalty=l2, solver=sag..........................




[CV 3/3; 137/168] END C=1000, penalty=l2, solver=sag;, score=0.640 total time=   1.4s
[CV 1/3; 138/168] START C=1000, penalty=l2, solver=saga.........................




[CV 1/3; 138/168] END C=1000, penalty=l2, solver=saga;, score=0.644 total time=   1.6s
[CV 2/3; 138/168] START C=1000, penalty=l2, solver=saga.........................




[CV 2/3; 138/168] END C=1000, penalty=l2, solver=saga;, score=0.628 total time=   1.6s
[CV 3/3; 138/168] START C=1000, penalty=l2, solver=saga.........................




[CV 3/3; 138/168] END C=1000, penalty=l2, solver=saga;, score=0.640 total time=   1.6s
[CV 1/3; 139/168] START C=1000, penalty=elasticnet, solver=lbfgs................
[CV 1/3; 139/168] END C=1000, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3; 139/168] START C=1000, penalty=elasticnet, solver=lbfgs................
[CV 2/3; 139/168] END C=1000, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3; 139/168] START C=1000, penalty=elasticnet, solver=lbfgs................
[CV 3/3; 139/168] END C=1000, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3; 140/168] START C=1000, penalty=elasticnet, solver=liblinear............
[CV 1/3; 140/168] END C=1000, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3; 140/168] START C=1000, penalty=elasticnet, solver=liblinear............
[CV 2/3; 140/168] END C=1000, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3; 140/168] START C=1000



[CV 2/3; 152/168] END C=10000, penalty=l1, solver=liblinear;, score=0.598 total time= 3.7min
[CV 3/3; 152/168] START C=10000, penalty=l1, solver=liblinear...................
[CV 3/3; 152/168] END C=10000, penalty=l1, solver=liblinear;, score=0.613 total time=   4.7s
[CV 1/3; 153/168] START C=10000, penalty=l1, solver=newton-cg...................
[CV 1/3; 153/168] END C=10000, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/3; 153/168] START C=10000, penalty=l1, solver=newton-cg...................
[CV 2/3; 153/168] END C=10000, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/3; 153/168] START C=10000, penalty=l1, solver=newton-cg...................
[CV 3/3; 153/168] END C=10000, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/3; 154/168] START C=10000, penalty=l1, solver=newton-cholesky.............
[CV 1/3; 154/168] END C=10000, penalty=l1, solver=newton-cholesky;, score=nan total time=   0.0s
[CV 2/3; 154/168] START C=10000, penalt



[CV 1/3; 156/168] END C=10000, penalty=l1, solver=saga;, score=0.644 total time=   2.2s
[CV 2/3; 156/168] START C=10000, penalty=l1, solver=saga........................




[CV 2/3; 156/168] END C=10000, penalty=l1, solver=saga;, score=0.628 total time=   2.2s
[CV 3/3; 156/168] START C=10000, penalty=l1, solver=saga........................




[CV 3/3; 156/168] END C=10000, penalty=l1, solver=saga;, score=0.640 total time=   2.2s
[CV 1/3; 157/168] START C=10000, penalty=l2, solver=lbfgs.......................
[CV 1/3; 157/168] END C=10000, penalty=l2, solver=lbfgs;, score=0.632 total time=   0.2s
[CV 2/3; 157/168] START C=10000, penalty=l2, solver=lbfgs.......................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3; 157/168] END C=10000, penalty=l2, solver=lbfgs;, score=0.598 total time=   0.3s
[CV 3/3; 157/168] START C=10000, penalty=l2, solver=lbfgs.......................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3; 157/168] END C=10000, penalty=l2, solver=lbfgs;, score=0.616 total time=   0.4s
[CV 1/3; 158/168] START C=10000, penalty=l2, solver=liblinear...................
[CV 1/3; 158/168] END C=10000, penalty=l2, solver=liblinear;, score=0.641 total time=   0.6s
[CV 2/3; 158/168] START C=10000, penalty=l2, solver=liblinear...................
[CV 2/3; 158/168] END C=10000, penalty=l2, solver=liblinear;, score=0.604 total time=   1.2s
[CV 3/3; 158/168] START C=10000, penalty=l2, solver=liblinear...................
[CV 3/3; 158/168] END C=10000, penalty=l2, solver=liblinear;, score=0.619 total time=   0.5s
[CV 1/3; 159/168] START C=10000, penalty=l2, solver=newton-cg...................
[CV 1/3; 159/168] END C=10000, penalty=l2, solver=newton-cg;, score=0.644 total time=   2.2s
[CV 2/3; 159/168] START C=10000, penalty=l2, solver=newton-cg...................
[CV 2/3; 159/168] END C=10000, penalty=l2, solver=newton-cg;, score=0.616 total time=   5.6s
[CV 3/3; 159/168] START C=10000, penalty=



[CV 1/3; 161/168] END C=10000, penalty=l2, solver=sag;, score=0.635 total time=   1.5s
[CV 2/3; 161/168] START C=10000, penalty=l2, solver=sag.........................




[CV 2/3; 161/168] END C=10000, penalty=l2, solver=sag;, score=0.622 total time=   1.4s
[CV 3/3; 161/168] START C=10000, penalty=l2, solver=sag.........................




[CV 3/3; 161/168] END C=10000, penalty=l2, solver=sag;, score=0.640 total time=   1.4s
[CV 1/3; 162/168] START C=10000, penalty=l2, solver=saga........................




[CV 1/3; 162/168] END C=10000, penalty=l2, solver=saga;, score=0.644 total time=   1.6s
[CV 2/3; 162/168] START C=10000, penalty=l2, solver=saga........................




[CV 2/3; 162/168] END C=10000, penalty=l2, solver=saga;, score=0.628 total time=   1.6s
[CV 3/3; 162/168] START C=10000, penalty=l2, solver=saga........................


360 fits failed out of a total of 504.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/angeloturri/opt/anaconda3/envs/learn-env/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/angeloturri/opt/anaconda3/envs/learn-env/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1160, in fit
    self._validate_params()
  File "/Users/angeloturri/opt/anaconda3/envs/learn-env/lib/python3.11/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/Users/angeloturri/opt/anaconda3/env

[CV 3/3; 162/168] END C=10000, penalty=l2, solver=saga;, score=0.640 total time=   1.6s
[CV 1/3; 163/168] START C=10000, penalty=elasticnet, solver=lbfgs...............
[CV 1/3; 163/168] END C=10000, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3; 163/168] START C=10000, penalty=elasticnet, solver=lbfgs...............
[CV 2/3; 163/168] END C=10000, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3; 163/168] START C=10000, penalty=elasticnet, solver=lbfgs...............
[CV 3/3; 163/168] END C=10000, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3; 164/168] START C=10000, penalty=elasticnet, solver=liblinear...........
[CV 1/3; 164/168] END C=10000, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3; 164/168] START C=10000, penalty=elasticnet, solver=liblinear...........
[CV 2/3; 164/168] END C=10000, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3; 164/168] START 

In [56]:
gs_lr.best_params_

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}

In [49]:
logreg = LogisticRegression(C=1, penalty='l1', solver='saga')
logreg.fit(X_train, y_train)



In [50]:
preds = logreg.predict(X_test)

In [51]:
report_columns = ['functional', 'functional needs repair', 
                  'non functional', 'accuracy', 'macro avg', 
                  'weighted avg']

report_rows = ['precision', 'recall', 
               'f1-score', 'support']

report = pd.DataFrame(classification_report(y_test, preds, output_dict=True), columns=report_columns, index=report_rows)
                      
matrix = pd.DataFrame(confusion_matrix(y_test, preds))

In [52]:
report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.732003,0.506276,0.776827,0.742828,0.671702,0.733057
recall,0.884582,0.114583,0.656608,0.742828,0.551925,0.742828
f1-score,0.801092,0.186873,0.711676,0.742828,0.566547,0.723292
support,8127.0,1056.0,5667.0,0.742828,14850.0,14850.0
