In [6]:
import pickle

import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from modelling import binary_classifier as ds_classifier

ImportError: attempted relative import with no known parent package

In [2]:
from dataset_preprocessing import preprocessing as ds_prep
from tqdm import tqdm
from dataset_EDA import eda as ds_eda
from dataset_elt import dataset_extraction as ds_ext
import logging

# Send records at INFO level or above to classifier.log, each one prefixed
# with its timestamp.
logging.basicConfig(
    format='%(asctime)s %(message)s',
    level=logging.INFO,
    filename='classifier.log',
)

# Named logger used by this notebook.
logger = logging.getLogger("Dataset_eda")


 Load dataset:

In [3]:
# Settings that are not passed to the CSV-mode load below; they are left
# defined for the ontology-based loading mode.
ontology = "predictionmodel"
columns_names = ['tmp0', 'tmp1', 'hPa', 'hum', 'pp']
limit_rows_number = 5000

# CSV mode: point the extractor at the CSV file and read it as
# comma-separated values.
# NOTE(review): the path uses Windows separators and is relative to the
# working directory.
dataset_location = r'.\datasets\precipitations_df.csv'
ds_extractor = ds_ext.Dataset_extraction(dataset_location)
ds_extractor.dataset_location = dataset_location
precipitations_df = ds_extractor.load_dataset(separator=',', csv_mode=True)


"\nprecipitations_df = ds_extractor.load_dataset(ontology_name=ontology, \n                                              columns_names=columns_names, \n                                              desired_query_base=None, \n                                              limit_rows_number=5000, \n                                              ontology_first_level_name=None, \n                                              ontology_items_field_name='items')\n"

In [4]:
# Remove the 'Unnamed: 0' column (presumably an index saved with the CSV --
# confirm). errors='ignore' keeps this cell re-runnable: without it a second
# execution raises KeyError because the column has already been dropped.
precipitations_df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)
precipitations_df.head()


Unnamed: 0,tmp0,tmp1,hPa,hum,pp
0,39.89,51.48,1092.0,0.93,0.0
1,7.2,7.01,936.0,0.57,0.0
2,18.55,20.12,981.0,0.66,0.0
3,16.77,16.77,1053.0,0.61,0.0
4,41.3,35.17,1018.0,0.14,0.0


 Exploratory Data Analysis (EDA) steps:
    - a first view on the dataframe content: length, some of the first and
      last rows
    - generation of a profile report in HTML format, containing exploratory
      analysis info
      such as attribute correlations, descriptive statistics, Pearson's
      correlation matrix,
      outlier detection, missing-value detection...
    - custom functions to get this info per attribute

In [5]:
# Wrap the dataframe in the project's EDA helper and take a first look at its
# content (length plus head and tail rows, going by the returned names).
eda_obj = ds_eda.Dataset_eda(precipitations_df)
df_length, head_df, tail_df = eda_obj.check_dataframe_content()

# Generate the profile report at the given HTML path. The path is built from
# adjacent string literals: a backslash line-continuation inside one literal
# embeds the trailing space and the next line's indentation in the file name.
eda_obj.profile_dataframe(
    output_file_location_name=(
        ".\\dataset_EDA\\eda_reports\\"
        "precipitations_dataset_eda_report.html"))
 Attributes discarded due to high correlation over a defined threshold: 0.9

In [6]:
# Ask the EDA helper which attributes it rejects at a 0.9 correlation
# threshold and, when it returns any, drop those columns from the dataframe
# in place.
rejected_attrs = eda_obj.get_rejected_attributes(correlation_threshold=0.9)
if rejected_attrs is not None:
    precipitations_df.drop(columns=rejected_attrs, inplace=True)
precipitations_df.columns


Index(['tmp0', 'tmp1', 'hPa', 'hum', 'pp'], dtype='object')

 Check for any missing values
    If there is a row with more than 50% of the attributes with missing values,
    drop the row.
    Otherwise, impute the missing value in that attribute.

In [7]:
# Threshold handed to the EDA helper; per the notebook text, rows with more
# than 50% of their attributes missing are the ones to delete.
# NOTE(review): the indexes are only displayed here, nothing is dropped.
missing_ratio_threshold = 0.5
row_indexes_to_delete = eda_obj.check_row_indexes_to_delete(
    missing_ratio_threshold)
row_indexes_to_delete


Int64Index([], dtype='int64')

In [8]:
# Check every attribute for missing values and record, per attribute, the
# size of whatever the EDA helper returns; attributes for which it returns
# None get no entry.
attributes_missing_counts_dict = {}
for attribute in tqdm(precipitations_df.columns):
    attribute_missing_sub_df = eda_obj.check_for_missing_values(attribute)
    if attribute_missing_sub_df is None:
        continue
    attributes_missing_counts_dict[attribute] = len(attribute_missing_sub_df)

attributes_missing_counts_dict

100%|██████████| 5/5 [00:00<00:00, 1672.10it/s]


{}

In [9]:
# The last column is the target; every column before it is a feature.
ds_preprocessor = ds_prep.Preprocessing(precipitations_df)
attributes_names = precipitations_df.columns[:-1]
target_name = precipitations_df.columns[-1]
ds_bin_classifier = ds_classifier.Binary_classifier(
    precipitations_df, attributes_names, target_name)

# Make the target attribute binary, then store it as integers.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the cast uses a vectorized astype with
# an explicit 64-bit integer dtype instead of a per-element lambda.
precipitations_df[target_name] = ds_preprocessor.binarize_target_variable(
    precipitations_df[target_name])
precipitations_df[target_name] = precipitations_df[target_name].astype('int64')

# Rebuild the preprocessor now that the target column has been replaced.
ds_preprocessor = ds_prep.Preprocessing(precipitations_df)


In [10]:
# Split into training and validation partitions; 0.3 is passed through to the
# project splitter (presumably the validation share -- confirm).
(X_train,
 X_validation,
 y_train,
 y_validation) = ds_bin_classifier.split_into_train_validation_sets(0.3)


In [11]:
# Standard-scale the training features, handing the project transformer the
# training column labels as well.
train_feature_names = X_train.columns
X_train_scaled = ds_preprocessor.standard_scaler_transformer(
    X_train, train_feature_names)

# Drop the reference to the unscaled training frame.
# NOTE(review): after this line the cell cannot be re-run without first
# re-running the split cell, because X_train.columns fails on None.
X_train = None


In [12]:
# Scale the validation features with the scaler stored in
# preprocessor_scaler.pickle instead of calling
# ds_preprocessor.standard_scaler_transformer on the validation set.
# NOTE(review): presumably that file is written when the training set is
# scaled -- confirm. pickle.load can execute arbitrary code, so only load
# files produced by this project.
with open("preprocessor_scaler.pickle", "rb") as scaler_file:
    # The context manager closes the handle; the bare open() call leaked it.
    fitted_scaler = pickle.load(scaler_file)

# Keep the scaler and the scaled array under separate names.
X_validation_scaled = fitted_scaler.transform(X_validation.values)

# Drop the reference to the unscaled validation frame.
X_validation = None


In [13]:
# Hyper-parameter grids, keyed by estimator class name.
models_and_params = {
    'DummyClassifier': {'strategy': ['most_frequent']},
    'GaussianNB': {'var_smoothing': [1e-09, 1e-08, 1e-10]},
    'LogisticRegression': {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': [1, 0.1, 0.01],
    },
    'SVC': {
        'C': [1, 0.1, 0.01],
        'gamma': ['scale', 'auto'],
        'class_weight': ['balanced'],
    },
}

# One default-constructed estimator per grid, listed in the same order as the
# grid keys above.
Dummy_clf, GaussianNB_clf = DummyClassifier(), GaussianNB()
LogisticRegression_clf, SVC_clf = LogisticRegression(), SVC()
models_list = [Dummy_clf, GaussianNB_clf, LogisticRegression_clf, SVC_clf]


In [14]:
# Grid-search every candidate model on the scaled training data with 10 CV
# folds, scoring recall, F1 and ROC AUC and refitting on ROC AUC. The call
# returns the CV results table and the best estimators.
cv_results_df, best_estimators_dict = (
    ds_bin_classifier.select_model_via_grid_search_cv(
        models_list,
        models_and_params,
        X_train_scaled,
        y_train.values,
        cv_folds=10,
        scoring_metrics=['recall', 'f1', 'roc_auc'],
        refit_metric='roc_auc',
    )
)

cv_results_df


100%|██████████| 4/4 [00:16<00:00,  4.03s/it]


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_f1,mean_test_recall,mean_test_roc_auc,mean_train_f1,mean_train_recall,mean_train_roc_auc,param_C,param_class_weight,...,split9_train_recall,split9_train_roc_auc,std_fit_time,std_score_time,std_test_f1,std_test_recall,std_test_roc_auc,std_train_f1,std_train_recall,std_train_roc_auc
0,0.000804,0.001693,0.0,0.0,0.5,0.0,0.0,0.5,,,...,0.0,0.5,0.000402,0.000468,0.0,0.0,0.0,0.0,0.0,0.0
0,0.001586,0.003103,0.715866,0.820274,0.96235,0.712704,0.819779,0.963659,,,...,0.815476,0.962678,0.000487,0.000297,0.030739,0.03859,0.006618,0.004652,0.004289,0.000767
1,0.001592,0.003083,0.715866,0.820274,0.96235,0.712704,0.819779,0.963659,,,...,0.815476,0.962678,0.000496,0.000532,0.030739,0.03859,0.006618,0.004652,0.004289,0.000767
2,0.001797,0.003095,0.715866,0.820274,0.96235,0.712704,0.819779,0.963659,,,...,0.815476,0.962678,0.0004,0.000299,0.030739,0.03859,0.006618,0.004652,0.004289,0.000767
0,0.052071,0.002982,0.929045,0.935611,0.998245,0.929379,0.936849,0.998347,1.0,,...,0.931548,0.998162,0.001764,1.8e-05,0.03013,0.052537,0.001186,0.003615,0.003462,0.000137
1,0.003892,0.002995,0.907505,0.873846,0.998097,0.91676,0.887698,0.998209,1.0,,...,0.869048,0.998067,0.000188,1.7e-05,0.050129,0.081834,0.001347,0.005262,0.010399,0.000154
2,0.011985,0.002672,0.893969,0.841707,0.997745,0.891497,0.836162,0.997812,0.1,,...,0.815476,0.997678,0.000782,0.000458,0.048197,0.07863,0.00132,0.00661,0.011475,0.000176
3,0.0031,0.002993,0.766155,0.632089,0.996068,0.771337,0.631214,0.995972,0.1,,...,0.625,0.995846,0.000296,0.000444,0.087126,0.110846,0.001926,0.006744,0.010287,0.00027
4,0.003197,0.002859,0.0,0.0,0.955627,0.0,0.0,0.955856,0.01,,...,0.0,0.954264,0.000388,0.00055,0.0,0.0,0.007955,0.0,0.0,0.000877
5,0.0025,0.002687,0.040408,0.021408,0.982084,0.040264,0.020553,0.982538,0.01,,...,0.017857,0.981664,0.000499,0.000456,0.053777,0.02869,0.004067,0.005399,0.002808,0.000472


In [16]:
# Display the best estimators returned by the grid search, keyed by model name.
best_estimators_dict

{'DummyClassifier': DummyClassifier(constant=None, random_state=None, strategy='most_frequent'),
 'GaussianNB': GaussianNB(priors=None, var_smoothing=1e-09),
 'LogisticRegression': LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 'SVC': SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False)}

In [20]:
# Highest mean test ROC AUC among all rows of the CV results table.
mean_test_roc_auc_scores = cv_results_df['mean_test_roc_auc']
max_mean_test_roc_auc = mean_test_roc_auc_scores.max()
max_mean_test_roc_auc

0.9982452271989014

In [23]:
# Keep the row(s) of the CV results whose mean test ROC AUC equals the
# maximum. This is done by hand here; the project helper
# ds_bin_classifier.choose_best_estimator is not called in this cell.
max_mean_test_roc_auc = cv_results_df['mean_test_roc_auc'].max()
is_top_scoring = cv_results_df['mean_test_roc_auc'] == max_mean_test_roc_auc
best_model_info = cv_results_df.loc[is_top_scoring]
best_model_info


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_f1,mean_test_recall,mean_test_roc_auc,mean_train_f1,mean_train_recall,mean_train_roc_auc,param_C,param_class_weight,...,split9_train_recall,split9_train_roc_auc,std_fit_time,std_score_time,std_test_f1,std_test_recall,std_test_roc_auc,std_train_f1,std_train_recall,std_train_roc_auc
0,0.052071,0.002982,0.929045,0.935611,0.998245,0.929379,0.936849,0.998347,1,,...,0.931548,0.998162,0.001764,1.8e-05,0.03013,0.052537,0.001186,0.003615,0.003462,0.000137


In [28]:
# Show the best estimators again, for comparison with the top-scoring row.
best_estimators_dict

{'DummyClassifier': DummyClassifier(constant=None, random_state=None, strategy='most_frequent'),
 'GaussianNB': GaussianNB(priors=None, var_smoothing=1e-09),
 'LogisticRegression': LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                    warm_start=False),
 'SVC': SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False)}

In [29]:
# Hyper-parameters recorded for the top-scoring row(s).
best_model_info.params

0    {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Name: params, dtype: object

In [30]:
# Inspect the LogisticRegression entry of the best estimators.
best_estimators_dict['LogisticRegression']

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
# Persist the chosen estimator, load it back and run one prediction as a
# round-trip check.
selected_model = best_estimators_dict['LogisticRegression']

# Context managers guarantee the file is flushed and closed before it is read
# back; the bare open() calls left both handles to the garbage collector.
with open("selected_model.pickle", "wb") as model_file:
    pickle.dump(selected_model, model_file)

# NOTE(review): pickle.load can execute arbitrary code; only load model files
# produced by this notebook.
with open("selected_model.pickle", "rb") as model_file:
    selected_model_loaded = pickle.load(model_file)

# Predict for the validation sample at position 3, reshaped to a single row.
selected_model_loaded.predict(X_validation_scaled[3].reshape(1, -1))



array([0], dtype=int64)