# Cognitive, behavioral and social data
**DATASET**: PCL5  
**Author**: Mattia Brocco

MERGE OF DATASETS FOR **R_NEO_PI**
```python
def _flatten_two_row_header(frame):
    """Collapse the two-row header of `frame` into single column names.

    The column names of `frame` form the top header row; entries containing
    "Unnamed" are replaced by the closest preceding named column
    (forward fill).  The first data row of `frame` holds the second header
    row: each of its cells is appended to the matching top-level name,
    separated by a space.  That row is then dropped and the index is
    renumbered from 0.

    Returns the re-labelled frame without the header row.
    """
    # Build the forward-filled top-level names once instead of once per
    # column; `.ffill()` replaces the deprecated `fillna(method="ffill")`.
    top_level = (pd.Series(frame.columns)
                 .apply(lambda s: np.nan if "Unnamed" in s else s)
                 .ffill()
                 .tolist())
    # Read the sub-header row by position: integer lookups on a Series that
    # is labelled by column names (`frame.loc[0][i]`) are deprecated.
    second_level = frame.iloc[0].tolist()

    frame.columns = [" ".join([top, sub])
                     for top, sub in zip(top_level, second_level)]
    return frame.drop(0).reset_index(drop=True)


a = _flatten_two_row_header(pd.read_excel(data_dir + "\\R_NEO_PI_Faked.xlsx"))
b = _flatten_two_row_header(pd.read_excel(data_dir + "\\R_NEO_PI_Honest.xlsx"))

# Tag every row with the file it came from before stacking the two sets.
a["CONDITION"] = "FAKE"
b["CONDITION"] = "HONEST"

merged = pd.concat([a, b], ignore_index=True)
merged.to_excel(data_dir + "\\R_NEO_PI.xlsx", index=False)
```

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import support
from enginev2 import Classification

%load_ext autoreload
%autoreload 2

# Folder holding the input datasets, written as a Windows-style relative path.
data_dir = ".\\data"

# Let pandas show up to 500 columns before truncating wide DataFrames.
pd.options.display.max_columns = 500

In [2]:
from sklearn.decomposition import PCA

In [3]:
# Organize datasets
# One entry per file in `data_dir` whose name contains "feather", keyed by
# the part of the file name before the first dot.  Each value is the list of
# four objects returned by Classification.prepare_data for that file with
# "CONDITION" as second argument.
data_collection = {}
feather_files = [f for f in os.listdir(data_dir) if "feather" in f]

for file_name in feather_files:
    dataset_name = file_name.split(".")[0]
    print(dataset_name)

    train_X, test_X, train_y, test_y = Classification().prepare_data(
        f"{data_dir}\\{file_name}", "CONDITION"
    )

    data_collection[dataset_name] = [train_X, test_X, train_y, test_y]

BF_df_CTU
BF_df_OU
BF_df_V
DT_df_CC
DT_df_JI
IADQ_df
IESR_df
NAQ_R_df
PCL5_df
PHQ9_GAD7_df
PID5_df
PRFQ_df
PRMQ_df
RAW_DDDT
R_NEO_PI


KeyboardInterrupt: 

In [4]:
# Work on the "BF_df_CTU" entry: its four stored objects are unpacked as
# train/test features followed by train/test labels.
X_train, X_test, y_train, y_test = data_collection["BF_df_CTU"]

In [None]:
#a = pd.DataFrame(np.c_[new_X_train, y_train])

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingClassifier

# NOTE(review): `new_X_train` and `pca2` are not defined in the visible
# cells -- presumably the PCA-transformed training set and the fitted PCA;
# confirm before running this cell.
pruned_tree = support.D3_pruning(new_X_train, y_train)
best_ccp_alpha = pruned_tree.best_params_["ccp_alpha"]

# NOTE: despite its name, `random_forest` holds a GradientBoostingClassifier;
# it reuses the ccp_alpha selected by the pruning step above.
random_forest = GradientBoostingClassifier(ccp_alpha=best_ccp_alpha,
                                           random_state=42)
random_forest.fit(new_X_train, y_train)

# 1.3 Feature selection
# Permutation importance of every input column of the fitted model, scored
# by accuracy on the test set after it has gone through `pca2.transform`.
X_test_projected = pca2.transform(X_test)
perm_imp = permutation_importance(
    random_forest,
    X_test_projected,
    y_test,
    scoring="accuracy",
    n_repeats=30,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# `perm_imp["importances"]` has one row per feature and one column per
# repeat (per the scikit-learn docs); after transposing, each plotted line is
# one feature and the x axis is the repeat number.  The trailing ";" keeps
# the notebook from echoing the returned Axes object.
pd.DataFrame(perm_imp["importances"]).T.plot();

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression trained on the first column of `new_X_train` only;
# the reshape turns that column into the 2-D (n_samples, 1) input that
# scikit-learn estimators expect.
first_component_train = new_X_train[:, 0].reshape(-1, 1)

logit = LogisticRegression(n_jobs=-1, random_state=42)
logit.fit(first_component_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of `logit`, fed with the first column of the
# `pca2`-transformed test data as a single-feature 2-D array.
first_component_test = pca2.transform(X_test)[:, 0].reshape(-1, 1)
test_predictions = logit.predict(first_component_test)

accuracy_score(y_test, test_predictions)

In [None]:
# Manual PCA of X_train: eigendecompose the feature covariance matrix, keep
# the leading eigenvectors, project the data onto them and map it back.
covariance_mat = np.cov(X_train, rowvar=False)

# A covariance matrix is symmetric, so use `eigh`: it is the solver meant for
# symmetric matrices and always returns real eigenvalues and eigenvectors,
# whereas the general-purpose `eig` gives no such guarantee.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_mat)

# Reorder the eigenvectors (columns) by decreasing eigenvalue.
sorted_indeces = np.flip(np.argsort(eigenvalues))
P = eigenvectors[:, sorted_indeces]

# Number of leading components to keep.
n_components = 3
W = P[:, :n_components]

# Scores on the kept components, then the reconstruction in feature space.
# NOTE(review): X_train is projected without subtracting its column means;
# this is a PCA projection only if X_train is already centered -- confirm.
Y_hat = np.dot(X_train, W)
W_hat = W.T
X_hat = np.dot(Y_hat, W_hat)

# NOTE(review): `scaler` and `matrix_to_signal` are not defined in the
# visible cells; the two lines below look like leftovers -- confirm.
#approx_data = scaler.inverse_transform(X_hat)
#ecg_sig_rec = matrix_to_signal(approx_data, original_len)
    

Reference: [Feature/variable importance after a PCA analysis (Stack Overflow)](https://stackoverflow.com/questions/50796024/feature-variable-importance-after-a-pca-analysis)

In [None]:
# Heatmap of the element-wise squared difference between X_train and its
# reconstruction obtained by sending `new_X_train` back through
# `pca2.inverse_transform`; row tick labels are hidden.
reconstructed_train = pca2.inverse_transform(new_X_train)
squared_error = np.power(reconstructed_train - X_train, 2)

plt.figure(figsize=(15, 6))
sns.heatmap(squared_error, yticklabels=False);

In [None]:
# Show the first five rows of new_X_train.
new_X_train[:5]

In [None]:
# Shapes of the projection: (309 x 10) · (10 x 3) -> (309 x 3)

In [None]:
# Project X_train onto the rows of `pca2.components_` by hand.
# NOTE(review): assuming `pca2` is a fitted scikit-learn PCA (not shown in
# the visible cells), `pca2.transform` subtracts `pca2.mean_` first, so this
# matches the transformed data only if X_train is already centered.
np.dot(pca2.components_, X_train.T).T

In [None]:
# First five rows of new_X_train, for comparison with the manual projection.
new_X_train[:5]

In [None]:
from sklearn.inspection import partial_dependence
from sklearn.inspection import PartialDependenceDisplay

# Average partial dependence of the fitted `random_forest` model on columns
# 0, 1 and 2 of `new_X_train`, computed with the "recursion" method.
pdp_columns = [0, 1, 2]
PartialDependenceDisplay.from_estimator(
    random_forest,
    new_X_train,
    pdp_columns,
    method="recursion",
    kind="average",
)

In [None]:
# Plot the first three entries of prova["values"] against an evenly spaced
# grid spanning the range of the first column of `new_X_train`.
# NOTE(review): `prova` is not defined in the visible cells -- presumably the
# output of sklearn's `partial_dependence`; confirm.
first_column = new_X_train[:, 0]
x_grid = np.linspace(first_column.min(), first_column.max(),
                     num=len(prova["values"][0]))

plt.plot(x_grid, prova["values"][0])
plt.plot(x_grid, prova["values"][1])
plt.plot(x_grid, prova["values"][2])

In [None]:
# Partial dependence on columns 0, 1 and 2 of `new_X_train` for the best
# estimator exposed by the `support.D3_pruning` result.
PartialDependenceDisplay.from_estimator(pruned_tree.best_estimator_, new_X_train, [0, 1, 2])