In [1]:
# Baseline Model

# Model - Logistic Reg,
# Features -
#     * All Channels - Raw, Abs, Mag (8)
#     * All Windows  - 1, 3, 5, 10, 15
#     * All Indiv    - Stat - Mean, Variance, Spec - PSD 6 bins
#     * All Pairwise -
#            - Synch - Correl, lag-Correl, MI, mimicry
#            - Convr - Sym.Conv, Asym.Conv, Glob.Conv
#     * All GroupFeat-
#            - Aggreagtion - Min, Max, Mean, Mode, Var
#            -
# Evaluation - Acc, Conf.Matrix, AUC, Precision, Recall,

In [2]:
import sys  
sys.path.insert(0, '/Users/navinlr/Desktop/Thesis/code_base/conversation_quality')

In [3]:
from modeling import dataset_provider as data_gen
import constants

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, accuracy_score, mean_squared_error, roc_auc_score, r2_score, explained_variance_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from imblearn import under_sampling 
from imblearn import over_sampling
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

Total Groups = 115


Using TensorFlow backend.


In [4]:
# Variables for baseline
random_seed=20
manifest="indiv"
data_split_per=.40
missing_data_thresh=50.0 #(in percent)
convq_thresh=3.0
agreeability_thresh=.2
annotators=["Divya", "Nakul"]#, "Swathi"]
only_involved_pairs=True
splits = 5
if manifest=="group":
    smote_nn = 2
else:
    smote_nn = 6

label_type = "hard"
model_type = "log-reg"
zero_mean  = False

dataset=constants.features_dataset_path_v1

In [5]:
# Functions 
def over_sample_data(temp_X, temp_y, method="SMOTE", k_neighbors=6):
    if method == "SMOTE":
        temp_X, temp_y = SMOTE(k_neighbors=k_neighbors-1).fit_resample(temp_X, temp_y)
    return temp_X, temp_y

def feature_normalize(temp_X, method="min-max"):
    # Fit on training set only.
    if method == "min-max":
        normaliser = MinMaxScaler().fit(temp_X)
    elif method == "mean-var":
        normaliser = StandardScaler().fit(temp_X)
    return normaliser
    
def feature_selection(temp_X, temp_y, method="anova"):
    top_features = []
    if method == "anova":
        f_values, p_values = f_classif(temp_X, temp_y)
        top_features=np.where(np.array(p_values) <= 0.05)[0]
#         print(top_features)
        print("# Top Features = " + str(len(top_features)))
    return top_features

def select_required_features(temp_X, required_feats):
    temp_X=temp_X[:,required_feats]
#     print("After Feature Selection, Features -> " + str(temp_X.shape))
    return temp_X

def dimension_reduction(temp_X, method="pca"):
    dim_red_model = None
    if method=="pca":
        dim_red_model = PCA(.95).fit(temp_X)
    return dim_red_model
    
def process_convq_labels(y, label_type="soft"):
    print("Data-type of labels - " + str(type(y)))
    if label_type=="soft":
        y=list(np.around(np.array(y),2))
    else:
        y=list(np.where(np.array(y) <= convq_thresh, 0, 1))
        print("ConvQ Classes Distribution : (Total = "+ str(len(y)) +")")
        print("High Quality Conv = " + str(sum(y)))
        print("Low Quality Conv = " + str(len(y)-sum(y)))
    return y

def model_convq_manifestation(temp_X, temp_y, model="log-reg"):

    if model == "log-reg":
        model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(temp_X, temp_y)
    elif model == "lin-reg":
        model = LinearRegression().fit(temp_X, temp_y)
    elif model == "adaboost":
        model = AdaBoostClassifier(n_estimators=100).fit(temp_X, temp_y)
    return model

def analyse_model_params(model):
    return True

def test_model(temp_X, model):
    return model.predict(temp_X)

def evaluate_predict(predict_temp_y, test_temp_y, method=accuracy_score):
    score = method(test_temp_y, predict_temp_y)
    return score

In [6]:
# Data Read
X, y, ids = data_gen.get_dataset_for_experiment(dataset=dataset,
                                                    manifest=manifest,
                                                    missing_data_thresh=missing_data_thresh,
                                                    agreeability_thresh=agreeability_thresh,
                                                    annotators=annotators,
                                                    only_involved_pairs=only_involved_pairs,
                                                    zero_mean=zero_mean)

# print(y)

Generating Dataset for modeling - indiv ConvQ, ...........
Number of Groups (After removing missing data) - 85
HEREREERERERERRERer
ZERO-MEAN Technique ? - False
ZERO-MEAN Technique ? - False
[4 4 2 4 2 4 4 4 4 4]
[4 5 1 4 2 4 4 5 4 2]
Current Kappa: 0.5652173913043479
[5 5 1 4 1 5 4 5 5 1]
[4 4 2 5 1 5 4 5 5 4]
Current Kappa: 0.6946564885496183
[4 4 2 4 2 4 4 4 4 2]
[2 3 4 3 4 1 2 2 2 4]
Current Kappa: -0.6666666666666667
[4 4 2 3 2 4 4 4 4 4]
[5 4 2 5 1 4 4 5 5 1]
Current Kappa: 0.4516129032258065
[4 4 2 3 2 4 4 4 4 2]
[5 5 1 4 2 5 4 4 4 1]
Current Kappa: 0.8064516129032258
[4 4 2 3 2 4 4 4 4 2]
[4 4 3 4 2 3 4 4 4 3]
Current Kappa: 0.6923076923076923
[4 5 1 5 2 4 5 4 4 2]
[5 5 1 5 1 5 4 4 4 1]
Current Kappa: 0.7967479674796748
[5 5 1 5 1 5 5 4 4 1]
[4 4 1 4 1 5 4 5 4 1]
Current Kappa: 0.6268656716417911
[4 4 2 4 2 4 2 4 4 2]
[2 2 4 2 4 2 2 2 2 4]
Current Kappa: -0.6666666666666665
[2 2 4 2 2 4 3 4 4 2]
[5 5 1 5 1 5 4 5 5 2]
Current Kappa: 0.08071748878923757
[2 2 4 2 4 4 3 4 4 2]
[5 5

[5 4 2 5 1 5 4 4 5 1]
Current Kappa: 0.6103896103896104
[4 4 2 4 2 4 4 4 4 2]
[4 4 2 4 2 4 4 4 4 2]
Current Kappa: 1.0
[4 4 2 4 2 5 4 4 4 2]
[5 5 1 4 1 5 5 5 5 1]
Current Kappa: 0.6363636363636364
[2 2 4 2 4 2 2 4 4 2]
[3 1 5 1 5 1 1 1 1 5]
Current Kappa: 0.220183486238532
[4 4 2 4 2 4 4 4 4 2]
[4 4 2 4 2 4 4 4 4 2]
Current Kappa: 1.0
[2 3 4 2 4 1 2 4 2 2]
[2 2 4 2 4 1 2 2 2 4]
Current Kappa: 0.5714285714285714
[4 3 2 3 2 3 4 4 2 2]
[4 4 2 4 2 3 4 4 4 2]
Current Kappa: 0.6385542168674699
[4 4 2 4 2 4 4 4 4 2]
[4 4 2 4 2 4 4 4 4 2]
Current Kappa: 1.0
[4 4 2 3 2 3 4 4 4 2]
[4 4 2 4 2 4 4 4 4 2]
Current Kappa: 0.8780487804878049
[2 2 4 2 5 2 2 4 3 2]
[4 4 2 4 2 5 4 5 4 2]
Current Kappa: -0.3157894736842106
[4 3 2 3 2 4 4 4 4 2]
[4 4 2 4 2 4 4 4 4 2]
Current Kappa: 0.8780487804878049
[4 3 2 3 2 4 4 4 5 2]
[5 5 1 5 1 5 4 4 5 1]
Current Kappa: 0.6859903381642511
[2 3 4 3 2 5 2 4 3 2]
[5 5 2 4 2 4 4 4 5 2]
Current Kappa: 0.06896551724137934
[4 4 2 4 2 2 4 4 2 2]
[4 4 2 4 2 3 4 4 4 2]
Current 

[4 4 2 4 2 4 4 4 4 2]
[4 4 1 5 2 4 4 4 5 1]
Current Kappa: 0.6721311475409836
[4 4 2 4 2 4 4 4 4 2]
[4 4 2 4 1 4 4 4 5 1]
Current Kappa: 0.7169811320754718
[5 5 1 5 1 5 4 4 5 1]
[4 4 2 4 2 4 5 5 5 1]
Current Kappa: 0.6899224806201549
[4 4 2 4 2 4 4 4 5 2]
[5 5 1 5 1 4 5 5 5 1]
Current Kappa: 0.6363636363636364
[4 4 2 4 2 4 4 2 4 2]
[4 4 1 4 2 4 4 5 5 1]
Current Kappa: 0.4444444444444444
[4 4 2 4 2 5 4 4 4 2]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: 0.6153846153846154
[5 5 1 5 1 3 4 5 4 1]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: 0.8360655737704918
[1 2 4 2 5 1 2 1 3 4]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: -0.6376811594202898
[5 5 1 5 1 5 4 5 4 1]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: 0.8780487804878049
[4 4 2 4 2 4 4 4 4 2]
[5 4 1 5 1 4 4 5 4 1]
Current Kappa: 0.6296296296296295
[2 2 4 2 4 1 4 1 3 2]
[2 4 4 4 4 2 3 3 3 4]
Current Kappa: 0.28
[4 4 2 4 2 4 4 2 4 2]
[5 4 2 4 2 5 5 4 4 1]
Current Kappa: 0.5901639344262295
[4 4 2 4 2 5 4 4 4 2]
[5 5 1 4 1 5 4 5 4 1]
Current Kappa: 0.6875
[4 4 2 4 2 

0it [00:00, ?it/s]


[4 4 2 4 2 3 4 5 4 2]
Current Kappa: -0.3793103448275863
[2 3 4 2 2 5 3 4 2 2]
[4 5 1 4 2 5 4 5 4 1]
Current Kappa: 0.2432432432432432
[1 2 5 2 4 4 3 4 3 2]
[4 4 2 4 2 4 4 4 3 2]
Current Kappa: -0.34782608695652173
[1 2 5 2 5 4 3 4 3 2]
[3 3 2 3 2 3 3 4 3 3]
Current Kappa: -0.28712871287128716
[1 1 5 1 5 3 4 4 4 2]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: -0.375
[2 3 4 2 2 4 4 4 2 2]
[5 5 1 5 1 5 5 5 4 1]
Current Kappa: 0.1561181434599156
[4 4 2 4 2 2 4 4 4 2]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: 0.45378151260504196
[5 5 1 4 2 4 4 4 4 2]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: 0.7445255474452555
[5 4 1 4 2 5 4 4 4 2]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: 0.7445255474452555
[5 4 1 4 2 4 4 4 4 2]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: 0.6992481203007519
[4 4 2 4 2 1 4 4 4 2]
[4 4 2 4 2 4 4 3 5 1]
Current Kappa: 0.562043795620438
[5 5 1 5 1 4 4 4 4 2]
[5 5 1 5 1 5 5 5 5 1]
Current Kappa: 0.8407643312101911
[5 5 1 5 1 4 4 5 5 2]
[5 5 1 4 1 4 4 5 5 1]
Current Kappa: 0.9333333333333333
[4 4 2 4 2 3 4

179it [00:30,  5.84it/s]


In [7]:
# Label Prep
# Hard/Soft Labels
y = process_convq_labels(y, label_type)

Data-type of labels - <class 'list'>
ConvQ Classes Distribution : (Total = 179)
High Quality Conv = 163
Low Quality Conv = 16


In [8]:
#train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=data_split_per, random_state=random_seed)
final_conf_matrix = [[0,0],[0,0]]
final_auc_score = 0.0
final_r_squared = 0.0
final_expl_vari = 0.0

skf = StratifiedKFold(n_splits=splits)
for train_index, test_index in skf.split(X, y):

    # Data Prep
    train_X, test_X  = X[train_index], X[test_index]
    train_y, test_y  = [y[i] for i in train_index], [y[i] for i in test_index]
        
    # Transfor Features
    normaliser = feature_normalize(train_X, method="mean-var")
    # Apply transform to both the training set and the test set.
    train_X = normaliser.transform(train_X)
    test_X  = normaliser.transform(test_X)
    
    # Dimensionality Reduction
    dimension_model = dimension_reduction(train_X, method="pca")
    train_X = dimension_model.transform(train_X)
    test_X  = dimension_model.transform(test_X)
    
    # Feature Selection
    top_features = feature_selection(train_X, train_y, method="anova")
    train_X = select_required_features(train_X, top_features) 
    test_X  = select_required_features(test_X, top_features) 

    # SAMPLING
    train_X, train_y = over_sample_data(train_X, train_y, method="SMOTE", k_neighbors=smote_nn)
    
    print("Train Data -> Features - " + str(train_X.shape) + " and Labels - " + str(len(train_y)))
    print("Test  Data -> Features - " + str(test_X.shape) + " and Labels - " + str(len(test_y)))
    print(str(sum(train_y))) 
    
    # Modelling
    model = model_convq_manifestation(train_X, train_y, model_type)
    #Predict
    predict_y = test_model(test_X, model)        
    # Evaluate
    conf_matrix = evaluate_predict(test_y, predict_y, confusion_matrix)
    auc_score = evaluate_predict(test_y, predict_y, roc_auc_score)
#     r_squared = evaluate_predict(test_y, predict_y, r2_score)
#     expl_vari = evaluate_predict(test_y, predict_y, explained_variance_score)
    
    #Update Cross Validated scores
    final_conf_matrix = final_conf_matrix + conf_matrix
    final_auc_score = final_auc_score + auc_score
#     final_r_squared = final_r_squared + r_squared
#     final_expl_vari = final_expl_vari + expl_vari
    
final_auc_score = final_auc_score/skf.get_n_splits(X, y)
# final_r_squared = final_r_squared/skf.get_n_splits(X, y)
# final_expl_vari = final_expl_vari/skf.get_n_splits(X, y)

# Top Features = 9
Train Data -> Features - (260, 9) and Labels - 260
Test  Data -> Features - (36, 9) and Labels - 36
130
# Top Features = 6
Train Data -> Features - (260, 6) and Labels - 260
Test  Data -> Features - (36, 6) and Labels - 36
130
# Top Features = 9
Train Data -> Features - (260, 9) and Labels - 260
Test  Data -> Features - (36, 9) and Labels - 36
130
# Top Features = 5
Train Data -> Features - (262, 5) and Labels - 262
Test  Data -> Features - (36, 5) and Labels - 36
131
# Top Features = 5
Train Data -> Features - (262, 5) and Labels - 262
Test  Data -> Features - (35, 5) and Labels - 35
131


In [9]:
# Printing Final Score
# print("~~~~~~~~~~~ R^2 Measure ~~~~~~~~~~~")
# print(final_r_squared)
# print("~~~~~~~~~~~ Explained Variance ~~~~~~~~~~~")
# print(final_expl_vari)
print("~~~~~~~~~~~ Confusion Matrix ~~~~~~~~~~~")
print(final_conf_matrix)
print("~~~~~~~~~~~ AUC Score ~~~~~~~~~~~")
print(final_auc_score)

~~~~~~~~~~~ Confusion Matrix ~~~~~~~~~~~
[[  5  28]
 [ 11 135]]
~~~~~~~~~~~ AUC Score ~~~~~~~~~~~
0.5520238095238096
