Wir haben folgende Modelle ausprobiert: BalancedBaggingClassifier, RandomForest, XGBClassifier, LGBMClassifier usw. – insgesamt ca. 15 Modelle. I. A.: Verarbeitung des Datensatzes, Feature-Vorbereitung, Modellauswahl, Prognose. L. L.: Hyperparameter und Hyperparameter-Optimierung, Modellauswahl.

In [None]:
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import pandas as pd
from scipy.stats import randint
import numpy as np
import pandas as pd
from scipy.stats import uniform
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from scipy.stats import randint
from sklearn.model_selection import cross_validate, train_test_split
import matplotlib.pyplot as plt
import seaborn as sns 
import datetime
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from imblearn.ensemble import BalancedBaggingClassifier
from lightgbm import LGBMClassifier
from pandas import cut
from natsort import index_natsorted
from xgboost import XGBClassifier

In [None]:
# Both CSV files are read from the same GitHub folder. `identifier` becomes the
# row index and `date` is parsed into a datetime column in both frames.

# Frame that contains the `success` column and is used for training/evaluation.
dataset = pd.read_csv(
    'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/'
    'master/datasets/prediction-challenge/dataset.csv',
    index_col='identifier',
    parse_dates=['date'],
)

# Frame that is passed to the fitted model at the end to create the submission.
prediction_dataset = pd.read_csv(
    'https://raw.githubusercontent.com/saschaschworm/big-data-and-data-science/'
    'master/datasets/prediction-challenge/prediction-dataset.csv',
    index_col='identifier',
    parse_dates=['date'],
)



# **Verarbeitung**

In [None]:
# Split the contact date into separate calendar columns.
for date_part in ('year', 'month', 'day', 'weekday'):
    dataset[date_part] = getattr(dataset['date'].dt, date_part)

# `duration` divided by the total number of contacts (this campaign + earlier).
total_contacts = dataset['n_contacts_campaign'] + dataset['n_contacts_before']
dataset['mean_call'] = dataset['duration'] / total_contacts

# Bucket the age into 20-year bands; ages outside (0, 100] end up as NaN.
age_bins = [0, 20, 40, 60, 80, 100]
age_labels = ['0-20', '20-40', '40-60', '60-80', '80-100']
dataset['age_group'] = pd.cut(dataset['age'], bins=age_bins, labels=age_labels)

In [None]:
# Number of rows per outcome label; shows how imbalanced the target is.
dataset.groupby('success').size()

success
No     32893
Yes     4176
dtype: int64

In [None]:
# Balance the classes by drawing the same number of rows (without replacement)
# from each outcome, with a fixed seed so the draw is repeatable.
#
# NOTE(review): this balancing runs before the train/test split further down,
# so the held-out set is balanced 50/50 as well, unlike the original class ratio
# (roughly 8:1 in the counts shown above). Scores measured on it may not carry
# over to imbalanced data; consider splitting first and resampling only the
# training part.
n = 4000
sample_yes, sample_no = (
    dataset[dataset['success'] == label].sample(n=n, replace=False, random_state=1909)
    for label in ('Yes', 'No')
)
dataset = pd.concat([sample_yes, sample_no])

In [None]:
# Target: the `success` label.
y = dataset['success']
# Features: every other column except the raw `date`, whose calendar parts were
# already extracted into separate columns.
X = dataset.drop(columns=['success', 'date'])

In [None]:
# Hold out 25 % of the (already balanced) rows for the final check; the seed
# keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 1909)

In [None]:
# Empty placeholder; it is not referenced by any other cell visible in this
# notebook.
hyperparams = {
}

# Base estimator with library defaults and a fixed seed. The searches below
# override individual settings through the `classifier__*` parameter names.
# NOTE(review): the target holds the strings 'Yes'/'No'; newer xgboost releases
# may require numerically encoded class labels - confirm against the installed
# version.
classifier = XGBClassifier(random_state=1909)



# **Features**

In [None]:
# Columns that are one-hot encoded. The calendar parts derived from `date`
# (year, month, day, weekday) are treated as categories here, not as numbers.
categorical_features = [
    'education',
    'job',
    'credit_default',
    'year',
    'month',
    'previous_conversion',
    'personal_loan',
    'communication_type',
    'day',
    'weekday',
    'marital_status',
    'housing_loan',
    'age_group',
]

# Columns that are rescaled to the [0, 1] range.
numeric_features = [
    'n_contacts_before',
    'days_since_last_contact',
    'duration',
    'n_contacts_campaign',
    'mean_call',
    'age',
]

# handle_unknown="ignore": a category that was not seen while fitting does not
# raise at transform time.
categorical_transformer = Pipeline(steps=[
    ('OneHotEncoder', OneHotEncoder(handle_unknown="ignore")),
])

numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
])

# Route each column group through its transformer; any column not listed above
# is forwarded unchanged (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical_transformer', categorical_transformer, categorical_features),
        ('numeric_transformer', numeric_transformer, numeric_features),
    ],
    remainder='passthrough',
)

# Full model: preprocessing followed by the classifier. The step name
# 'classifier' is what the `classifier__*` search parameters refer to.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])



In [None]:
# F1 score computed for the 'Yes' label of `success`; used as the metric for
# the cross-validation and both hyperparameter searches below.
scorer = make_scorer(f1_score, pos_label='Yes')

In [None]:
# Baseline: cross-validate the untuned pipeline on the training split (cv=10),
# using all available cores. The per-fold train/test scores are kept in
# `res_cv`; this cell does not display them.
res_cv = cross_validate(pipeline, X_train, y_train, scoring=scorer, cv=10, n_jobs=-1, verbose=1, return_train_score=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.4s finished


# **Hyperparameter**

In [None]:
# Search space for the randomized search.
#
# scipy's `uniform(loc, scale)` samples from [loc, loc + scale]: the second
# argument is the WIDTH of the interval, not its upper bound.

# Fraction of columns sampled per tree node: [0.8, 1.0]. The width must not
# exceed 0.2 here, because XGBoost only accepts column-sampling ratios in
# (0, 1]; the previous width of 0.3 could draw values up to 1.1.
colsample_bynode = uniform(0.8, 0.2)
gamma = uniform(0, 0.5)             # [0.0, 0.5]
reg_alpha = [1, 2, 3]
learning_rate = uniform(0.02, 0.2)  # [0.02, 0.22]
# 50 evenly spaced values each, truncated to integers.
max_depth = [int(x) for x in np.linspace(start=1, stop=100, num=50)]
n_estimators = [int(x) for x in np.linspace(start=10, stop=200, num=50)]
subsample = uniform(0.7, 0.2)       # [0.7, 0.9]

# The `classifier__` prefix addresses the 'classifier' step of the pipeline.
param_distributions = {
    'classifier__colsample_bynode': colsample_bynode,
    'classifier__gamma': gamma,
    'classifier__learning_rate': learning_rate,
    'classifier__max_depth': max_depth,
    'classifier__n_estimators': n_estimators,
    'classifier__subsample': subsample,
    'classifier__reg_alpha': reg_alpha,
}
  



# **Hyperparameter Optimization**

In [None]:
# Randomized search: draws n_iter=5 settings from `param_distributions` and
# scores each one with cv=10 using the F1 scorer. The seed fixes the draws;
# train scores are kept so they can be compared with the validation scores.
search = RandomizedSearchCV(
    pipeline, param_distributions = param_distributions, n_iter=5, scoring=scorer, 
    n_jobs=-1,  cv=10, random_state=1909, return_train_score=True)

In [None]:
# Run the randomized search on the training split only.
search = search.fit(X_train, y_train)


In [None]:
# Mean cross-validation scores (in percent) of the best candidate.
cv_results = search.cv_results_
best_row = search.best_index_
training_score = cv_results['mean_train_score'][best_row] * 100
test_score = cv_results['mean_test_score'][best_row] * 100

In [None]:
# "Test" here is the mean score on the cross-validation validation folds
# (`mean_test_score`), not the score on the held-out X_test.
f'Mean F1 Score (Training/Test): {training_score:.2f}%/{test_score:.2f}%'

'Mean F1 Score (Training/Test): 89.72%/88.57%'

In [None]:
# Parameter setting with the best mean validation score in the randomized search.
search.best_params_

{'classifier__colsample_bynode': 0.929204196565546,
 'classifier__gamma': 0.25853683693019985,
 'classifier__learning_rate': 0.0876034856703405,
 'classifier__max_depth': 3,
 'classifier__n_estimators': 130,
 'classifier__reg_alpha': 1,
 'classifier__subsample': 0.7619993062149083}

In [None]:
# Predict the held-out rows through the fitted search object.
y_pred = search.predict(X_test)

In [None]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.92      0.85      0.88      1014
         Yes       0.86      0.92      0.89       986

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000



In [None]:
# Narrow grid for the follow-up grid search. Only the number of trees and the
# per-node column sampling vary; every other setting is pinned to one value.
n_estimators = list(range(129, 134))
colsample_bynode = [0.7, 0.8, 0.9]

gamma = [0.1]
reg_alpha = [1]
learning_rate = [0.07]
max_depth = [4]
subsample = [0.8]

# The `classifier__` prefix addresses the 'classifier' step of the pipeline.
params = dict([
    ('classifier__colsample_bynode', colsample_bynode),
    ('classifier__gamma', gamma),
    ('classifier__learning_rate', learning_rate),
    ('classifier__max_depth', max_depth),
    ('classifier__n_estimators', n_estimators),
    ('classifier__subsample', subsample),
    ('classifier__reg_alpha', reg_alpha),
])

In [None]:
# Exhaustive search over every combination in `params`, scored with cv=10 and
# the F1 scorer. Rebinds `search`, replacing the randomized-search object.
search = GridSearchCV(
    pipeline, param_grid = params, scoring=scorer, 
    n_jobs=-1,  cv=10,  return_train_score=True)

In [None]:
# Run the grid search on the training split only.
search = search.fit(X_train, y_train)

In [None]:
# Mean cross-validation scores (in percent) of the best grid point.
cv_results = search.cv_results_
best_row = search.best_index_
training_score = cv_results['mean_train_score'][best_row] * 100
test_score = cv_results['mean_test_score'][best_row] * 100

In [None]:
# "Test" here is the mean score on the cross-validation validation folds
# (`mean_test_score`), not the score on the held-out X_test.
f'Mean F1 Score (Training/Test): {training_score:.2f}%/{test_score:.2f}%'

'Mean F1 Score (Training/Test): 90.45%/88.80%'

In [None]:
# Predict the held-out rows through the fitted grid-search object.
y_pred = search.predict(X_test)

In [None]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.92      0.85      0.88      1014
         Yes       0.85      0.92      0.89       986

    accuracy                           0.88      2000
   macro avg       0.89      0.88      0.88      2000
weighted avg       0.89      0.88      0.88      2000



In [None]:
# Grid point with the best mean validation score.
search.best_params_

{'classifier__colsample_bynode': 0.9,
 'classifier__gamma': 0.1,
 'classifier__learning_rate': 0.07,
 'classifier__max_depth': 4,
 'classifier__n_estimators': 129,
 'classifier__reg_alpha': 1,
 'classifier__subsample': 0.8}

# **Prediction**

In [None]:
# Derive the same extra columns that were added to the training frame.

# Split the contact date into separate calendar columns.
for date_part in ('year', 'month', 'day', 'weekday'):
    prediction_dataset[date_part] = getattr(prediction_dataset['date'].dt, date_part)

# `duration` divided by the total number of contacts (this campaign + earlier).
total_contacts = (
    prediction_dataset['n_contacts_campaign'] + prediction_dataset['n_contacts_before']
)
prediction_dataset['mean_call'] = prediction_dataset['duration'] / total_contacts

# Bucket the age into 20-year bands; ages outside (0, 100] end up as NaN.
age_bins = [0, 20, 40, 60, 80, 100]
age_labels = ['0-20', '20-40', '40-60', '60-80', '80-100']
prediction_dataset['age_group'] = pd.cut(
    prediction_dataset['age'], bins=age_bins, labels=age_labels)

In [None]:
# Feed the fitted pipeline exactly the columns it was trained on. The training
# features had `date` (and the target) dropped, whereas `prediction_dataset`
# still carries the raw `date` column. Selecting `X_train.columns` keeps both
# inputs consistent and fails with a KeyError if an expected column is missing.
predictions = search.best_estimator_.predict(prediction_dataset[X_train.columns])



In [None]:
# One-column frame of predicted labels, indexed like the prediction dataset so
# every prediction stays attached to its `identifier`.
submission = pd.DataFrame(
    {'prediction': predictions},
    index=prediction_dataset.index,
)

In [None]:
# Matriculation number that is embedded in the submission file name
# (presumably a placeholder value - replace before submitting).
matriculation_number = '00000'

In [None]:
# Write the predictions to a CSV in the working directory; the index column is
# labelled 'identifier'.
submission.to_csv(
    f'./submission-{matriculation_number}.csv', index_label='identifier')

In [None]:
# Colab-only helper; kept local to this cell because the module is only
# available inside Google Colab.
from google.colab import files

# Build the file name from `matriculation_number` instead of hard-coding it, so
# the download always refers to the file written by the `to_csv` cell.
files.download(f'submission-{matriculation_number}.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>