# Ideas

Have histograms distributions for the other variables within each variable section.

Could lump together needs repair/non functional

Have variables like "installer = funder" and such. Those variables seem to be very similar.

Number of functional wells over the years, non functional wells over the years, etc.

Use SMOTE for data with no null values, all known, and no one-time value variables.

Train models with numerical status group variabel and ones with categorical status group variable.

Better evaluate which variables you should include in the model. (correlation, etc.)

Find a way of evaluating the success of each model graphically, and in a more detailed fashion.

Find ways of dissecting how well each model predicts nf, fnr, and f categories.

Write functions to make this entire notebook more organized.

SMOTE on functional needs repair data

Look for patterns in what each individual model says for FNR data points.

Parallel notebooks for demanding models. Hyperparameter tuning, etc.

Make your own X-test and y-test training set.

Find any differences between kaggle's X-test and your own, like extra categories and whatnot.

Consider again creating extra features.

Test out different groups of features.

Do cross validation and synthetic over-sampling at the same time.

Try different degrees of over-sampling.

Three specialized models and two balanced models.

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, get_scorer_names, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [98]:
def cval(X, y, X_resampled, y_resampled, cval, estimator):
    
    reports = []
    matrices = []
    numpy_reports = []
    numpy_matrices = []
    
    report_columns = ['functional', 'functional needs repair', 
                      'non functional', 'accuracy', 'macro avg', 
                      'weighted avg']
    
#     report_rows = ['precision', 'recall', 
#                    'f1-score', 'support', 'accuracy']
    
    report_rows = ['precision', 'recall', 
                   'f1-score', 'support']
    
    matrix_labels = ['functional', 'functional needs repair', 
                     'non functional']
    
    idx = list(X.index)
    np.random.shuffle(idx)
    
    for i in list(range(cval)):
        arrs = np.array_split(idx, 10)
        
        test = arrs.pop(i)
        train = np.concatenate(arrs)
        
        test_x = X.take(test)
        train_x = X.take(train)
        test_y = y.take(test)
        train_y = y.take(train)
        
        model = estimator
        model.fit(train_x, train_y)
        preds = model.predict(test_x)
        
#         adj = pd.concat([test_y.reset_index(drop=True), pd.DataFrame(preds, columns=['preds'])], axis=1)
        
#         f = len(adj.query("status_group == 'functional'"))
#         fnr = len(adj.query("status_group == 'functional needs repair'"))
#         nf = len(adj.query("status_group == 'non functional'"))
        
#         f_acc = len(adj.query("status_group == 'functional' & status_group == preds")) / f
#         fnr_acc = len(adj.query("status_group == 'functional needs repair' & status_group == preds")) / fnr
#         nf_acc = len(adj.query("status_group == 'non functional' & status_group == preds")) / nf
        
        report = pd.DataFrame(classification_report(test_y, preds, output_dict=True))
        
#         report = pd.concat([report, pd.DataFrame({'functional': [f_acc], 
#                                                   'functional needs repair': [fnr_acc], 
#                                                   'non functional': [nf_acc]}, 
#                                                  index=['accuracy'])
#                         ])
        
        reports.append(report)
        numpy_reports.append(np.array(report))
        
        matrix = pd.DataFrame(confusion_matrix(test_y, preds))
        matrices.append(matrix)
        numpy_matrices.append(np.array(matrix))
    
    numpy_report = pd.DataFrame(np.sum(numpy_reports, axis=0)/cval, 
                                columns=report_columns, index=report_rows)
    
    numpy_matrix = pd.DataFrame(np.sum(numpy_matrices, axis=0)/cval, 
                                columns=matrix_labels, index=matrix_labels)
    
    estimator.fit(X_resampled, y_resampled)
    resampled_preds = estimator.predict(X)
    resampled_report = pd.DataFrame(classification_report(y, resampled_preds, output_dict=True), 
                                    columns=report_columns, index=report_rows)
    resampled_matrix = pd.DataFrame(confusion_matrix(y, resampled_preds), columns=matrix_labels, index=matrix_labels)
    
    return reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix

In [3]:
X_test = pd.read_csv("tanzanian_water_wells/X_test.csv")
X_train = pd.read_csv("tanzanian_water_wells/X_train.csv")
y_train = pd.read_csv("tanzanian_water_wells/y_train.csv")

df = pd.concat([X_train, y_train], axis=1)

In [4]:
desc = {'amount_tsh': 'Total static head (amount water available to waterpoint)',
                    'date_recorded': 'The date the row was entered',
                    'funder': 'Who funded the well',
                    'gps_height': 'Altitude of the well',
                    'installer': 'Organization that installed the well',
                    'longitude': 'GPS coordinate',
                    'latitude': 'GPS coordinate',
                    'wpt_name': 'Name of the waterpoint if there is one',
                    'subvillage': 'Geographic location',
                    'region': 'Geographic location',
                    'region_code': 'Geographic location (coded)',
                    'district_code': 'Geographic location (coded)',
                    'lga': 'Geographic location',
                    'ward': 'Geographic location',
                    'population': 'Population around the well',
                    'public_meeting': 'True/False',
                    'recorded_by': 'Group entering this row of data',
                    'scheme_management': 'Who operates the waterpoint',
                    'scheme_name': 'Who operates the waterpoint',
                    'permit': 'If the waterpoint is permitted',
                    'construction_year': 'Year the waterpoint was constructed',
                    'extraction_type': 'The kind of extraction the waterpoint uses',
                    'extraction_type_group': 'The kind of extraction the waterpoint uses',
                    'extraction_type_class': 'The kind of extraction the waterpoint uses',
                    'management': 'How the waterpoint is managed',
                    'management_group': 'How the waterpoint is managed',
                    'payment': 'What the water costs',
                    'payment_type': 'What the water costs',
                    'water_quality': 'The quality of the water',
                    'quality_group': 'The quality of the water',
                    'quantity': 'The quantity of water',
                    'quantity_group': 'The quantity of water',
                    'source': 'The source of the water',
                    'source_type': 'The source of the water',
                    'source_class': 'The source of the water',
                    'waterpoint_type': 'The kind of waterpoint',
                    'waterpoint_type_group': 'The kind of waterpoint'}

In [5]:
# Eliminating null values

df.funder.fillna("Unknown", inplace=True)
df.installer.fillna("Unknown", inplace=True)
df.scheme_management.fillna("None", inplace=True)
df.permit.fillna('Unknown', inplace=True)
df.scheme_name.fillna('Unknown', inplace=True)
df.subvillage.fillna('Unknown', inplace=True)
df.public_meeting.fillna('Unknown', inplace=True)

In [6]:
# df['fundernum'] = df['funder'].map(df.funder.value_counts())

# df['funder_installer'] = df['funder'] == df['installer']
# df['funder_installer'] = df['funder_installer'].astype('int')

# df['permit'] = df['permit'].map({True: 1, False: 0, 'Unknown': 2})

# df['status_id'] = df['status_group'].map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})

# Defining the train and test sets

In [7]:
X = df.copy()

columns = ['amount_tsh', 'gps_height', 'population', 'region', 'lga', 
           'scheme_management', 'permit', 'construction_year',
           'extraction_type_group', 'payment', 'management', 
           'quality_group', 'quantity', 'source', 'waterpoint_type']

X = X[columns]

# X['public_meeting'] = X['public_meeting'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['permit'] = X['permit'].map({True: 'Yes', False: 'No', 'Unknown': 'Unknown'})
X['gps_height'] = X['gps_height'].astype('float64')
# X['district_code'] = X['district_code'].astype('float64')
X['population'] = X['population'].astype('float64')
# X['district_code'] = X['district_code'].astype('object')

X_cat = X.drop(list(X.select_dtypes(['float64']).columns), axis=1)
X_numeric = X[list(X.select_dtypes(['float64']).columns)]

y = df['status_group']

X_cat = pd.get_dummies(X_cat)

X = pd.concat([X_numeric, X_cat], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),
                index = X_train.index,
                columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                index = X_test.index,
                columns = X_test.columns)

X_train.reset_index(inplace=True, drop=True)
y_train = y_train.reset_index(drop=True)

In [8]:
strategy = {'functional needs repair': 10000}
smote = SMOTE(sampling_strategy=strategy)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Base Model – Logistic Regression, No Regularization

In [28]:
estimator = LogisticRegression(solver='liblinear', fit_intercept=False)
reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, estimator)

In [29]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,1897.0,300.8,213.8
functional needs repair,129.4,162.2,40.6
non functional,450.8,188.8,1071.6


In [30]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.765771,0.248751,0.808129,0.702761,0.607551,0.743497
recall,0.786625,0.488259,0.626113,0.702761,0.633666,0.702761
f1-score,0.776043,0.329502,0.705542,0.702761,0.603696,0.715696
support,2411.6,332.2,1711.2,0.702761,4455.0,4455.0
accuracy,0.786625,0.488259,0.626113,,,


In [31]:
resampled_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.792282,0.199082,0.838449,0.65385,0.609938,0.767091
recall,0.694484,0.699318,0.587971,0.65385,0.660591,0.65385
f1-score,0.740166,0.309933,0.691218,0.65385,0.580439,0.690177
support,24182.0,3226.0,17142.0,0.65385,44550.0,44550.0


In [32]:
resampled_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,16794,5661,1727
functional needs repair,755,2256,215
non functional,3648,3415,10079


# Second Model – Decision Tree

In [34]:
dtc = DecisionTreeClassifier()

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 5, 10],
    'min_samples_split': [5, 10, 20, 40],
    'min_samples_leaf': [5, 10, 20],
    'splitter': ['best', 'random']
}

gs_tree = GridSearchCV(dtc, param_grid, cv=3)
gs_tree.fit(X_train_resampled, y_train_resampled)

gs_tree.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'splitter': 'best'}

In [42]:
dtc = DecisionTreeClassifier(criterion= 'gini', max_depth= 10, min_samples_split= 5, min_samples_leaf=10, splitter='best')

In [43]:
reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, dtc)

In [44]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2234.0,14.8,166.0
functional needs repair,225.0,42.0,44.8
non functional,692.6,11.0,1024.8


In [45]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.708882,0.624375,0.830792,0.74092,0.72135,0.750382
recall,0.925017,0.135085,0.592798,0.74092,0.550967,0.74092
f1-score,0.802579,0.221198,0.69128,0.74092,0.571685,0.718736
support,2414.8,311.8,1728.4,0.74092,4455.0,4455.0
accuracy,0.925017,0.135085,0.592798,,,


In [46]:
resampled_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.703851,0.469197,0.806619,0.724242,0.659889,0.726402
recall,0.885824,0.233726,0.588613,0.724242,0.569388,0.724242
f1-score,0.784422,0.312022,0.680584,0.724242,0.592343,0.710259
support,24182.0,3226.0,17142.0,0.724242,44550.0,44550.0


In [47]:
resampled_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,21421,622,2139
functional needs repair,2192,754,280
non functional,6821,231,10090


# Third Model - K Nearest Neighbors

In [48]:
knn = KNeighborsClassifier(n_neighbors=3)

In [49]:
reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, knn)

In [50]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2052.2,71.2,296.4
functional needs repair,173.6,94.0,53.8
non functional,443.8,32.4,1237.6


In [51]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.76872,0.476552,0.77944,0.759551,0.674904,0.751884
recall,0.84809,0.293023,0.722162,0.759551,0.621092,0.759551
f1-score,0.806448,0.362055,0.749665,0.759551,0.639389,0.752558
support,2419.8,321.4,1713.8,0.759551,4455.0,4455.0
accuracy,0.84809,0.293023,0.722162,,,


In [52]:
resampled_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.850543,0.592842,0.86802,0.837104,0.770469,0.838607
recall,0.883922,0.626472,0.810699,0.837104,0.773698,0.837104
f1-score,0.866911,0.609194,0.838381,0.837104,0.771495,0.837271
support,24182.0,3226.0,17142.0,0.837104,44550.0,44550.0


In [53]:
resampled_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,21375,987,1820
functional needs repair,912,2021,293
non functional,2844,401,13897


# Fourth Model – Bagging Classifier

In [57]:
bagged_tree = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, max_features=50)

In [58]:
reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, bagged_tree)

In [59]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2277.8,3.2,143.2
functional needs repair,260.4,18.2,39.2
non functional,575.4,2.8,1134.8


In [60]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.731602,0.755067,0.861417,0.770101,0.782696,0.782876
recall,0.939593,0.057342,0.662446,0.770101,0.553127,0.770101
f1-score,0.822617,0.106198,0.748883,0.770101,0.559233,0.743188
support,2424.2,317.8,1713.0,0.770101,4455.0,4455.0
accuracy,0.939593,0.057342,0.662446,,,


In [61]:
resampled_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.841039,0.815803,0.957256,0.877935,0.871366,0.88393
recall,0.97341,0.50248,0.813907,0.877935,0.763266,0.877935
f1-score,0.902396,0.621907,0.879781,0.877935,0.801361,0.873383
support,24182.0,3226.0,17142.0,0.877935,44550.0,44550.0


In [62]:
resampled_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,23539,212,431
functional needs repair,1413,1621,192
non functional,3036,154,13952


# Fifth Model – Random Forest

In [64]:
forest = RandomForestClassifier(criterion='entropy', max_depth=10, min_samples_leaf=5, min_samples_split=10)

In [65]:
reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, forest)

In [66]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2320.8,5.6,97.6
functional needs repair,284.0,14.4,33.4
non functional,721.6,2.2,975.4


In [67]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.697673,0.652292,0.88173,0.74312,0.743898,0.764486
recall,0.957408,0.043506,0.574076,0.74312,0.524997,0.74312
f1-score,0.807155,0.081418,0.695378,0.74312,0.527983,0.710454
support,2424.0,331.8,1699.2,0.74312,4455.0,4455.0
accuracy,0.957408,0.043506,0.574076,,,


In [68]:
resampled_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.703478,0.56701,0.894614,0.746038,0.721701,0.767142
recall,0.949384,0.204588,0.561078,0.746038,0.571683,0.746038
f1-score,0.808138,0.300683,0.689635,0.746038,0.599486,0.725794
support,24182.0,3226.0,17142.0,0.746038,44550.0,44550.0


In [69]:
resampled_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,22958,353,871
functional needs repair,2304,660,262
non functional,7373,151,9618


# Sixth Model – XGBoost

In [76]:
xgboost_y_train_resampled = y_train_resampled.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})
xgboost_y_train = y_train.map({'non functional': 0, 'functional needs repair': 1, 'functional': 2})

xgb = XGBClassifier()

In [79]:
reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix = cval(X_train, xgboost_y_train, X_train_resampled, xgboost_y_train_resampled, 5, xgb)

In [80]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,1249.0,12.6,463.6
functional needs repair,50.2,61.2,200.8
non functional,185.2,26.6,2205.8


In [81]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.841534,0.609051,0.768457,0.789226,0.739681,0.78583
recall,0.724012,0.196326,0.912392,0.789226,0.61091,0.789226
f1-score,0.778305,0.295724,0.834215,0.789226,0.636081,0.774883
support,1725.2,312.2,2417.6,0.789226,4455.0,4455.0


In [82]:
resampled_report

Unnamed: 0,0,1,2,accuracy,macro avg,weighted avg
precision,0.877565,0.493528,0.799272,0.803771,0.723455,0.807258
recall,0.733403,0.460942,0.899388,0.803771,0.697911,0.803771
f1-score,0.799034,0.476679,0.84638,0.803771,0.707364,0.801391
support,17142.0,3226.0,24182.0,0.803771,44550.0,44550.0


In [83]:
resampled_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,12572,492,4078
functional needs repair,355,1487,1384
non functional,1399,1034,21749


# Eigth Model – Adaboost Classifier

In [119]:
# Instantiate an AdaBoostClassifier
adaboost_clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=200, random_state=42)

In [120]:
reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, adaboost_clf)

In [121]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,1976.2,103.2,319.0
functional needs repair,170.2,100.2,60.0
non functional,393.2,50.2,1282.8


In [122]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.778125,0.393842,0.771855,0.754029,0.64794,0.747509
recall,0.823992,0.302526,0.743393,0.754029,0.623304,0.754029
f1-score,0.800335,0.341955,0.757216,0.754029,0.633169,0.749759
support,2398.4,330.4,1726.2,0.754029,4455.0,4455.0


In [123]:
resampled_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.938723,0.786352,0.971796,0.9389,0.898957,0.940415
recall,0.964188,0.846559,0.920604,0.9389,0.910451,0.9389
f1-score,0.951285,0.815346,0.945508,0.9389,0.904046,0.939218
support,24182.0,3226.0,17142.0,0.9389,44550.0,44550.0


In [124]:
resampled_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,23316,495,371
functional needs repair,408,2731,87
non functional,1114,247,15781


# Ninth Model – Gradient Boosting Classifier

In [113]:
# Instantiate an GradientBoostingClassifier
gbt_clf = GradientBoostingClassifier(random_state=42, n_estimators=200, max_features=50)

In [114]:
reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, gbt_clf)

In [115]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2226.2,15.4,169.0
functional needs repair,240.4,40.8,43.2
non functional,587.4,8.6,1124.0


In [116]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.728955,0.629701,0.841203,0.761167,0.733286,0.765186
recall,0.923533,0.126205,0.653524,0.761167,0.567754,0.761167
f1-score,0.814757,0.209807,0.735526,0.761167,0.586697,0.740105
support,2410.6,324.4,1720.0,0.761167,4455.0,4455.0


In [117]:
resampled_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.750978,0.390605,0.845671,0.753603,0.662418,0.761318
recall,0.881317,0.376317,0.644441,0.753603,0.634025,0.753603
f1-score,0.810943,0.383328,0.731468,0.753603,0.641913,0.749398
support,24182.0,3226.0,17142.0,0.753603,44550.0,44550.0


In [118]:
resampled_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,21312,1246,1624
functional needs repair,1620,1214,392
non functional,5447,648,11047


# Eleventh Model – Extra Randomized Trees

In [101]:
extra_trees = ExtraTreesClassifier(n_estimators=50, random_state=42)

In [102]:
reports, matrices, numpy_report, numpy_matrix, resampled_report, resampled_matrix = cval(X_train, y_train, X_train_resampled, y_train_resampled, 5, extra_trees)

In [103]:
numpy_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,2074.4,81.4,264.8
functional needs repair,169.8,104.8,53.4
non functional,385.4,37.2,1283.8


In [104]:
numpy_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.788849,0.471024,0.801374,0.777329,0.687082,0.770416
recall,0.856952,0.320728,0.752284,0.777329,0.643321,0.777329
f1-score,0.821454,0.380317,0.776045,0.777329,0.659272,0.771609
support,2420.6,328.0,1706.4,0.777329,4455.0,4455.0


In [105]:
resampled_report

Unnamed: 0,functional,functional needs repair,non functional,accuracy,macro avg,weighted avg
precision,0.938546,0.782361,0.973312,0.938855,0.898073,0.940614
recall,0.964395,0.852449,0.919088,0.938855,0.911977,0.938855
f1-score,0.951295,0.815903,0.945423,0.938855,0.904207,0.939231
support,24182.0,3226.0,17142.0,0.938855,44550.0,44550.0


In [106]:
resampled_matrix

Unnamed: 0,functional,functional needs repair,non functional
functional,23321,495,366
functional needs repair,410,2750,66
non functional,1117,270,15755
