# Cognitive, behavioral and social data
**DATASET**: PCL5  
**Author**: Mattia Brocco

MERGE OF DATASETS FOR **R_NEO_PI**
```python
def _flatten_two_row_header(frame):
    """Collapse a two-row spreadsheet header into single-level column names.

    The column labels of ``frame`` hold the first header row and the row
    labelled 0 holds the second one. Labels containing "Unnamed" are
    replaced by the closest preceding real label, each label is then joined
    with the matching value of row 0 (separated by a space), and row 0 is
    dropped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Sheet as loaded by ``pd.read_excel``; its columns are renamed in place.

    Returns
    -------
    pandas.DataFrame
        The data rows only, re-indexed from 0, with the combined column names.
    """
    # Build the forward-filled label list once instead of once per column.
    group_labels = (pd.Series(frame.columns)
                    .apply(lambda s: np.nan if "Unnamed" in s else s)
                    .ffill()
                    .tolist())
    # .tolist() yields the row values in column order, avoiding integer-key
    # lookups on a label-indexed Series.
    sub_labels = frame.loc[0].tolist()
    frame.columns = [" ".join([group, sub])
                     for group, sub in zip(group_labels, sub_labels)]
    return frame.drop(0).reset_index(drop = True)


a = _flatten_two_row_header(pd.read_excel(data_dir + "\\R_NEO_PI_Faked.xlsx"))
b = _flatten_two_row_header(pd.read_excel(data_dir + "\\R_NEO_PI_Honest.xlsx"))

# Tag each sheet with its experimental condition before stacking them.
a["CONDITION"] = "FAKE"
b["CONDITION"] = "HONEST"

pd.concat([a, b], ignore_index = True).to_excel(data_dir + "\\R_NEO_PI.xlsx", index = False)
```

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import engine
import support

%load_ext autoreload
%autoreload 2

# Folder holding the input datasets (relative path written with a Windows-style separator).
data_dir = ".\\data"

# Raise the pandas display limit so that frames with up to 500 columns are shown in full.
pd.options.display.max_columns = 500

---
## Try to design a pipeline
***

SOURCES  
* [Factor Analysis](https://www.statisticssolutions.com/free-resources/directory-of-statistical-analyses/factor-analysis/)
* [Likelihood-ratio test](https://en.wikipedia.org/wiki/Likelihood-ratio_test)
* [How can I get statistics to compare nested models in a logistic regression in SPSS?](https://www.ibm.com/support/pages/how-can-i-get-statistics-compare-nested-models-logistic-regression-spss)

In [2]:
### PIPELINE
# For every feather file in data_dir: prepare the data, run variable
# selection, benchmark the models, and keep all intermediate results in
# data_collection under the dataset name (file name up to the first ".").
data_collection = {}
# Match on the extension rather than on any "feather" substring, and sort the
# listing so that the processing order is reproducible across runs.
feather_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".feather"))
for dataset in feather_files:
    dataset_name = dataset.split(".")[0]
    print(dataset_name)
    X_tr, X_te, y_tr, y_te = engine.Classification().prepare_data(f"{data_dir}\\{dataset}", "CONDITION")
    selection = engine.Classification().variable_selection(X_tr, X_te, y_tr, y_te)
    scores = engine.Classification().benchmark_models(X_tr, X_te, y_tr, y_te, selection)
    # Stored order: train/test features, train/test targets, selection, scores.
    data_collection[dataset_name] = [X_tr, X_te, y_tr, y_te, selection, scores]
    print(selection)
    print(scores)
    print("-" * 50)
    print()

BF_df_CTU
{'Features': [6, 4], 'Validation passed': False}
Logistic Regression    0.781955
SVC                    0.827068
Random Forest          0.827068
Neural Network         0.781955
dtype: float64
--------------------------------------------------

BF_df_OU
{'Features': [6, 4], 'Validation passed': False}
Logistic Regression    0.782609
SVC                    0.804348
Random Forest          0.804348
Neural Network         0.760870
dtype: float64
--------------------------------------------------

BF_df_V
{'Features': [4, 7], 'Validation passed': False}
Logistic Regression    0.719178
SVC                    0.719178
Random Forest          0.726027
Neural Network         0.691781
dtype: float64
--------------------------------------------------

DT_df_CC
{'Features': [9, 8], 'Validation passed': False}
Logistic Regression    0.675862
SVC                    0.696552
Random Forest          0.675862
Neural Network         0.600000
dtype: float64
----------------------------------------

In [None]:
# Problems were encountered with the following datasets:
# - IESR_df.csv
# - DT_df_JI.csv

In [None]:
# Load the BF_df_CTU feather file through engine.Classification.prepare_data and
# unpack its four outputs; "CONDITION" is presumably the target column (confirm in engine).
X_train, X_test, y_train, y_test = engine.Classification().prepare_data(f"{data_dir}\\BF_df_CTU.feather", "CONDITION")

In [None]:
# Display the permutation-importance result; perm_imp is assigned by the
# permutation_importance cell, which must have been run first.
perm_imp

In [None]:
# Permutation importance of the fitted model `rf` on the test split, using
# 30 repeats per feature, accuracy as the score and a fixed seed.
# NOTE(review): `rf` is not defined in any visible cell and
# permutation_importance is only imported in a later cell — confirm the
# intended execution order.
perm_imp = permutation_importance(rf, X_test, y_test, n_repeats = 30,
                                  random_state = 42, scoring = "accuracy")

In [None]:
# Keep the features whose mean permutation importance stays positive after
# subtracting two standard deviations, ordered from most to least important.
selected_features = [i for i in perm_imp.importances_mean.argsort()[::-1]
                     if perm_imp.importances_mean[i] - 2 * perm_imp.importances_std[i] > 0]

# Fallback when nothing passes the test above: take every feature whose
# absolute mean importance exceeds 1e-2 (in column order). The result is
# always a flat list of indices, whether zero, one or several features match;
# wrapping a one-element array in a list would instead nest the array.
if not selected_features:
    selected_features = list(np.where(np.abs(perm_imp.importances_mean) > 1e-2)[0])

In [None]:
# Display the feature indices chosen by the selection cell.
selected_features

In [None]:
# Run the engine's variable selection on the current split and display its
# result (the pipeline output shows a dict with 'Features' and 'Validation passed').
engine.Classification().variable_selection(X_train, X_test, y_train, y_test)

In [None]:
# Fit a logistic regression on the training split with a fixed seed.
# max_iter is written as the integer 5000: scikit-learn documents it as an
# int, and releases that validate parameter types reject the float 5e3.
# NOTE(review): LogisticRegression is only imported in a later cell — run
# that import first.
lr = LogisticRegression(n_jobs = -1, random_state = 42, max_iter = 5000)
lr.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Print scikit-learn's classification report for lr's predictions on the test split.
print(classification_report(y_test, lr.predict(X_test)))

## Instead, use a paper from the literature
SOURCES
* [Model-agnostic Feature Importance and Effects with Dependent Features -- A Conditional Subgroup Approach](https://arxiv.org/abs/2006.04628)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

In [None]:
# Rebuild the train/test split for BF_df_CTU and fit the logistic regression.
# NOTE(review): this cell reads the .csv file whereas the other cells read
# .feather files — confirm that the .csv file exists and is the intended input.
X_train, X_test, y_train, y_test = engine.Classification().prepare_data(f"{data_dir}\\BF_df_CTU.csv", "CONDITION")

# max_iter is written as the integer 5000: scikit-learn documents it as an
# int, and releases that validate parameter types reject the float 5e3.
lr = LogisticRegression(n_jobs = -1, random_state = 42, max_iter = 5000)
lr.fit(X_train, y_train)

In [None]:
# Permutation importance of the fitted logistic regression on the test split,
# using 30 repeats per feature, accuracy as the score and a fixed seed.
pi = permutation_importance(lr, X_test, y_test, n_repeats = 30,
                            random_state = 42, scoring = "accuracy")

In [None]:
def _plot_permutation_importance(result):
    """Bar chart of the mean importances with orange markers at mean +/- one std."""
    means = result["importances_mean"]
    spread = result["importances_std"]
    positions = range(len(means))

    plt.bar(x = positions, height = means)
    # Upper markers first, then lower ones, both in orange.
    for bound in (means + spread, means - spread):
        plt.scatter(positions, bound, color = "orange")
    plt.show()


_plot_permutation_importance(pi)