In [1]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict, StratifiedKFold

import pickle, numpy as np

from utilities import get_train_data, get_test_data

import os

# Root directory of the analysis data. The default is the original author's
# local folder; set the GERMEVAL_ANALYSIS_DIR environment variable to run the
# notebook on another machine without editing this cell.
ANALYSIS_DIR = os.environ.get(
    'GERMEVAL_ANALYSIS_DIR',
    '/home/lisa/Darmstadt/Master Arbeit/06_Analyse'
).rstrip('/')

# Folder holding the pickled feature matrices. The trailing slash is required
# because file names are appended with plain string concatenation.
PICKLE_FOLDER_PATH = ANALYSIS_DIR + '/Learning_Alg/GermEval-2018-Data/'

# Raw text files read via the helpers imported from `utilities`.
TRAIN_FILENAME = ANALYSIS_DIR + '/germeval2018.training.txt'
TEST_FILENAME  = ANALYSIS_DIR + '/germeval2018.test.txt'
mdp_file = ANALYSIS_DIR + '/mdp_tweets.txt'

In [2]:
def _load_pickle(filename):
    """Load one pickled object from PICKLE_FOLDER_PATH and close the file afterwards.

    NOTE: pickle.load can execute arbitrary code; only use this with pickle
    files you created yourself or otherwise trust.
    """
    with open(PICKLE_FOLDER_PATH + filename, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Each feature family is stored as three pickles: train, test and mdp.
X_CNGR_train = _load_pickle("X_CNGR_train.p")
X_CNGR_test  = _load_pickle("X_CNGR_test.p")
X_CNGR_mdp   = _load_pickle("X_CNGR_mdp.p")

X_TNGR_train = _load_pickle("X_TNGR_train.p")
X_TNGR_test  = _load_pickle("X_TNGR_test.p")
X_TNGR_mdp   = _load_pickle("X_TNGR_mdp.p")


X_CIMP_task1_train = _load_pickle("X_CIMP_task1_train.p")
X_CIMP_task1_test  = _load_pickle("X_CIMP_task1_test.p")
X_CIMP_task1_mdp   = _load_pickle("X_CIMP_task1_mdp.p")


X_CIMP_task2_train = _load_pickle("X_CIMP_task2_train.p")
X_CIMP_task2_test  = _load_pickle("X_CIMP_task2_test.p")
X_CIMP_task2_mdp   = _load_pickle("X_CIMP_task2_mdp.p")


X_TIMP_task1_train = _load_pickle("X_TIMP_task1_train.p")
X_TIMP_task1_test  = _load_pickle("X_TIMP_task1_test.p")
X_TIMP_task1_mdp   = _load_pickle("X_TIMP_task1_mdp.p")


X_TIMP_task2_train = _load_pickle("X_TIMP_task2_train.p")
X_TIMP_task2_test  = _load_pickle("X_TIMP_task2_test.p")
X_TIMP_task2_mdp   = _load_pickle("X_TIMP_task2_mdp.p")


X_EMB_train = _load_pickle("X_EMB_train.p")
X_EMB_test  = _load_pickle("X_EMB_test.p")
X_EMB_mdp   = _load_pickle("X_EMB_mdp.p")

In [3]:
# get_train_data returns three values; the first one is discarded here.
# y1 is used below as the target for the Task 1 models; y2 only appears in the
# commented-out Task 2 code.
_, y1, y2 = get_train_data(TRAIN_FILENAME)

#### Funktion für das Aufteilen in Train- und Test-Samples und das Berechnen der Meta-Features
-> StratifiedKFold sorgt dafür, dass das prozentuale Verhältnis der Klassen im jeweiligen Sample (Test, Train) gleich ist.

In [4]:
def get_META_feats(clf, X_train, mdp, y, seeds=(42,)):
    """Compute meta-level features (class probabilities) from one base classifier.

    For the training data, out-of-fold probabilities are produced with a
    10-fold stratified, shuffled cross-validation, once per seed, and the runs
    are averaged. For the unlabeled `mdp` data, `clf` is fitted on the complete
    training set and its predicted probabilities are returned.

    Parameters
    ----------
    clf : estimator supporting `fit` and `predict_proba`.
        It is fitted in place on (X_train, y) as a side effect.
    X_train : feature matrix of the training samples.
    mdp : feature matrix of the samples to predict (same columns as X_train).
    y : training labels, one per row of X_train.
    seeds : iterable of int, default (42,)
        Random seeds for the cross-validation shuffles; one CV run per seed.

    Returns
    -------
    (feats_train, feats_mdp) : averaged out-of-fold probabilities for X_train
    and the probabilities predicted for `mdp`.

    Raises
    ------
    ValueError
        If `seeds` is empty (there would be nothing to average).
    """
    seeds = list(seeds)
    if not seeds:
        raise ValueError("seeds must contain at least one random seed")

    cv_runs = []
    for seed in seeds:
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
        cv_runs.append(cross_val_predict(clf, X_train, y=y, method='predict_proba', cv=skf, n_jobs=-1))
    # Average the per-seed probability matrices element-wise.
    feats_train = np.mean(cv_runs, axis=0)
    print(len(feats_train))
    print(clf)
    # No labels exist for mdp, so train on all training data and predict once.
    clf.fit(X_train, y)
    feats_mdp = clf.predict_proba(mdp)
    print(len(feats_mdp))

    return feats_train, feats_mdp

In [6]:
# Sanity check: show the dimensions of the Task 1 CIMP matrix for the mdp data.
X_CIMP_task1_mdp.shape

(9957, 6400)

## TASK 1 - Base level predictions
Die drei verschiedenen Classifier (clfs_task1) werden auf die Feature-Vektoren (base_feats_task1) angewandt.
Für die Trainingsdaten werden per 10-facher stratifizierter Kreuzvalidierung Out-of-Fold-Wahrscheinlichkeiten (predict_proba) berechnet und über die verwendeten Seeds gemittelt (Standard: ein Seed).
Bei den Test-/mdp-Daten wird keine Kreuzvalidierung durchgeführt (keine y-Variablen); stattdessen wird jeder Classifier auf den gesamten Trainingsdaten gefittet und macht anschließend anhand der Feature-Vektoren eine Prediction.

In [22]:
%%time
clfs_task1 = [LogisticRegression(class_weight='balanced'),
              ExtraTreesClassifier(n_estimators=100, criterion='entropy', n_jobs=-1),
              ExtraTreesClassifier(n_estimators=100, criterion='gini', n_jobs=-1)]

base_feats_task1 = [(X_CIMP_task1_train, X_CIMP_task1_mdp),
                    (X_TIMP_task1_train, X_TIMP_task1_mdp),
                    (X_CNGR_train, X_CNGR_mdp),
                    (X_TNGR_train, X_TNGR_mdp),
                    (X_EMB_train, X_EMB_mdp)]
X_META_task1_train = []
#X_META_task1_test  = []
X_META_task1_mdp  = []
for X_train, mdp in base_feats_task1:                 # X-train z.B X_CIMP_task1_train, mdp z.B X_CIMP_task1_mdp
    for clf in clfs_task1:
        feats = get_META_feats(clf, X_train, mdp, y1)

        X_META_task1_train.append(feats[0])           # aus "get_META_feats: feats_train
        X_META_task1_mdp.append(feats[1])             # aus "get_META_feats: feats_mdp
        
X_META_task1_train = np.concatenate(X_META_task1_train, axis=1)
X_META_task1_mdp  = np.concatenate(X_META_task1_mdp, axis=1)

5009
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
9957
5009
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='entropy', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
9957
5009
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini',

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


9957
5009
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='entropy', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
9957
5009
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob



5009
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
9957




5009
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='entropy', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
9957




5009
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
9957
5009
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
9957
5009
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='entropy',

## TASK 2  - Base level predictions

In [20]:
# %%time
# NOTE(review): this whole cell is commented out, so X_META_task2_train and
# X_META_task2_mdp are never created in the current notebook. The last recorded
# run of this cell ended with a TerminatedWorkerError (worker killed by SIGKILL).
# NOTE(review): base_feats_task2 below pairs the *task1* CIMP/TIMP matrices with
# the Task 2 labels (y2), while X_CIMP_task2_* / X_TIMP_task2_* are loaded but
# not used in the visible cells - presumably a copy-paste slip; confirm before
# re-enabling this cell.
# clfs_task2 = [LogisticRegression(class_weight='balanced'),
#               ExtraTreesClassifier(n_estimators=150, criterion='entropy', n_jobs=-1),
#               ExtraTreesClassifier(n_estimators=150, criterion='gini', n_jobs=-1)]

# base_feats_task2 = [(X_CIMP_task1_train, X_CIMP_task1_mdp),
#                     (X_TIMP_task1_train, X_TIMP_task1_mdp),
#                     (X_CNGR_train, X_CNGR_mdp),
#                     (X_TNGR_train, X_TNGR_mdp),
#                     (X_EMB_train, X_EMB_mdp)]

# X_META_task2_train = []
# X_META_task2_mdp  = []
# for X_train, mdp in base_feats_task2:
#     for clf in clfs_task2:
#         feats = get_META_feats(clf, X_train, mdp, y2)
#         X_META_task2_train.append(feats[0])
#         X_META_task2_mdp.append(feats[1])
        
# X_META_task2_train = np.concatenate(X_META_task2_train, axis=1)
# X_META_task2_mdp  = np.concatenate(X_META_task2_mdp, axis=1)

5009
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)




9957
5009
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
9957
5009
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
9957




TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGKILL(-9)}

## Meta level predictions
Die logistische Regression wird zunächst auf den Base-Level-Predictions des Trainingsdatensatzes gefittet und anschließend zur Vorhersage auf den mdp-Daten genutzt.

In [14]:
# Sanity check of the meta feature matrices. A notebook cell only displays the
# value of its last expression, so the mdp shape on the first line is evaluated
# but not shown; only the train shape appears in the output.
X_META_task1_mdp.shape
X_META_task1_train.shape

(5009, 30)

In [15]:
# Meta-level model for Task 1: a logistic regression trained on the stacked
# base-level probabilities, then used to label the mdp data.
clf_task1 = LogisticRegression(C=0.17, class_weight='balanced').fit(X_META_task1_train, y1)
preds_task1 = clf_task1.predict(X_META_task1_mdp)

# Task 2 counterpart, currently disabled (its inputs are only produced by the
# commented-out Task 2 cell; X_META_task2_test is not defined in the visible cells):
# clf_task2 = LogisticRegression(C=0.2, class_weight='balanced')
# clf_task2.fit(X_META_task2_train, y2)
# preds_task2 = clf_task2.predict(X_META_task2_test)

In [20]:
# Read the mdp tweets and drop entries that are exactly '' or a single space.
# NOTE(review): this filtering changes the number of tweets, while preds_task1
# has one entry per row of the mdp feature matrices - confirm that both end up
# row-aligned before they are combined.
X_test_mdp = get_test_data(mdp_file)
X_test_mdp = X_test_mdp[(X_test_mdp != '') & (X_test_mdp != ' ')]

In [21]:
import pandas as pd
pd.set_option('display.max_colwidth', 0)

# Tweets and predictions are paired purely by position, so they must have the
# same length. Check this up front to get a precise message instead of pandas'
# generic "arrays must all be same length".
n_tweets, n_preds = len(X_test_mdp), len(preds_task1)
if n_tweets != n_preds:
    raise ValueError(
        "Cannot pair tweets with predictions: {} tweets vs. {} predictions. "
        "The tweets must be filtered exactly like the rows of the mdp feature "
        "matrices.".format(n_tweets, n_preds)
    )

# Build the frame from the two columns directly. np.asarray drops any index a
# column might carry, so rows are matched by position; each column keeps its
# own dtype instead of being coerced into one shared string array.
df = pd.DataFrame({'tweets': np.asarray(X_test_mdp), 'label': np.asarray(preds_task1)})
df

ValueError: arrays must all be same length