In [1]:
# Baseline Model

# Model - Logistic Reg,
# Features -
#     * All Channels - Raw, Abs, Mag (8)
#     * All Windows  - 1, 3, 5, 10, 15
#     * All Indiv    - Stat - Mean, Variance, Spec - PSD 6 bins
#     * All Pairwise -
#            - Synch - Correl, lag-Correl, MI, mimicry
#            - Convr - Sym.Conv, Asym.Conv, Glob.Conv
#     * All GroupFeat-
#            - Aggreagtion - Min, Max, Mean, Mode, Var
#            -
# Evaluation - Acc, Conf.Matrix, AUC, Precision, Recall,

In [2]:
import sys  
sys.path.insert(0, '/Users/navinlr/Desktop/Thesis/code_base/conversation_quality')

In [3]:
from modeling import dataset_provider as data_gen
import constants

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, accuracy_score, mean_squared_error, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from imblearn import under_sampling 
from imblearn import over_sampling
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

Total Groups = 115


Using TensorFlow backend.


In [4]:
# Variables for baseline
random_seed=20
manifest="indiv"
data_split_per=.40
missing_data_thresh=50.0 #(in percent)
agreeability_thresh=.2
annotators=["Divya", "Nakul"]#, "Swathi"]
only_involved_pairs=True
splits=5

label_type = "hard"
model_type = "log-reg"

dataset=constants.features_dataset_path_v1

In [67]:
# Functions 
def over_sample_data(temp_X, temp_y, method="SMOTE"):
    if method == "SMOTE":
        temp_X, temp_y = SMOTE().fit_resample(temp_X, temp_y)
    return temp_X, temp_y

def feature_normalize(temp_X, method="min-max"):
    # Fit on training set only.
    if method == "min-max":
        normaliser = MinMaxScaler().fit(temp_X)
    elif method == "mean-var":
        normaliser = StandardScaler().fit(temp_X)
    return normaliser
    
def feature_selection(temp_X, temp_y, method="anova"):
    top_features = []
    if method == "anova":
        f_values, p_values = f_classif(temp_X, temp_y)
        top_features=np.where(np.array(p_values) <= 0.05)[0]
#         print(top_features)
        print("# Top Features = " + str(len(top_features)))
    return top_features

def select_required_features(temp_X, required_feats):
    temp_X=temp_X[:,required_feats]
    print("After Feature Selection, Features -> " + str(temp_X.shape))
    return temp_X

def dimension_reduction(temp_X, method="pca"):
    dim_red_model = None
    if method=="pca":
        dim_red_model = PCA(.95).fit(temp_X)
    return dim_red_model
    
def process_convq_labels(y, label_type="soft"):
    print("Data-type of labels - " + str(type(y)))
    if label_type=="soft":
        y=list(np.around(np.array(y),2))
    else:
        y=list(np.where(np.array(y) <= 0.5, 0, 1))
        print("ConvQ Classes Distribution : (Total = "+ str(len(y)) +")")
        print("High Quality Conv = " + str(sum(y)))
        print("Low Quality Conv = " + str(len(y)-sum(y)))
    return y

def model_convq_manifestation(temp_X, temp_y, model="log-reg"):

    if model == "log-reg":
        model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(temp_X, temp_y)
    elif model == "lin-reg":
        model = LinearRegression().fit(temp_X, temp_y)

    return model

def analyse_model_params(model):
    return True

def test_model(temp_X, model):
    return model.predict(temp_X)

def evaluate_predict(predict_temp_y, test_temp_y, method=accuracy_score):
    score = method(test_temp_y, predict_temp_y)
    return score

In [11]:
# Data Read
X, y, ids = data_gen.get_dataset_for_experiment(dataset=dataset,
                                                    manifest=manifest,
                                                    missing_data_thresh=missing_data_thresh,
                                                    agreeability_thresh=agreeability_thresh,
                                                    annotators=annotators, only_involved_pairs=only_involved_pairs)


Generating Dataset for modeling - indiv ConvQ, ...........
Number of Groups (After removing missing data) - 85


1it [00:00,  6.88it/s]

Number of Final Data-points (After removing unreliable annotation data) - 182


182it [00:30,  5.94it/s]


In [19]:
# Label Prep
# Hard/Soft Labels
y = process_convq_labels(y, label_type)

Data-type of labels - <class 'list'>
ConvQ Classes Distribution : (Total = 182)
High Quality Conv = 166
Low Quality Conv = 16


In [74]:
#train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=data_split_per, random_state=random_seed)
final_conf_matrix = [[0,0],[0,0]]
final_auc_score = 0.0

skf = StratifiedKFold(n_splits=splits)
for train_index, test_index in skf.split(X, y):

    # Data Prep
    train_X, test_X  = X[train_index], X[test_index]
    train_y, test_y  = [y[i] for i in train_index], [y[i] for i in test_index]
        
    # Transfor Features
    normaliser = feature_normalize(train_X, method="mean-var")
    # Apply transform to both the training set and the test set.
    train_X = normaliser.transform(train_X)
    test_X  = normaliser.transform(test_X)
    
    # SAMPLING
    train_X, train_y = over_sample_data(train_X, train_y, method="SMOTE")
    
    # Feature Selection
    top_features = feature_selection(train_X, train_y, method="anova")
    train_X = select_required_features(train_X, top_features) 
    test_X  = select_required_features(test_X, top_features) 
    
    # Dimensionality Reduction
    dimension_model = dimension_reduction(train_X, method="pca")
    train_X = dimension_model.transform(train_X)
    test_X  = dimension_model.transform(test_X)
    
    print("Train Data -> Features - " + str(train_X.shape) + " and Labels - " + str(len(train_y)))
    print("Test  Data -> Features - " + str(test_X.shape) + " and Labels - " + str(len(test_y)))
#     print(sum(train_y))  

    # Modelling
    model = model_convq_manifestation(train_X, train_y, model_type)
    #Predict
    predict_y = test_model(test_X, model)        
    # Evaluate
    conf_matrix = evaluate_predict(predict_y, test_y, confusion_matrix)
    auc_score = evaluate_predict(predict_y, test_y, roc_auc_score)
    
    #Update Cross Validated scores
    final_conf_matrix = final_conf_matrix + conf_matrix
    final_auc_score = final_auc_score + auc_score

final_auc_score = final_auc_score/skf.get_n_splits(X, y)

  1294  1301  1338  1339  1340  1341  1342  1343  1350  1357  1394  1395
  1396  1397  1398  1399  5986  5987  5988  5989  5990  5991  6042  6043
  6044  6045  6046  6047  6054  6061  6098  6099  6100  6101  6102  6103
  6110  6117  6154  6155  6156  6157  6158  6159 10746 10747 10748 10749
 10750 10751 10802 10803 10804 10805 10806 10807 10814 10821 10858 10859
 10860 10861 10862 10863 10870 10877 10914 10915 10916 10917 10918 10919
 15450 15451 15452 15453 15454 15455 15462 15469 15482 15483 15484 15485
 15486 15487 15500 15501 15502 15503 15504 15505 15506 15507 15508 15509
 15510 15511 15526 15527 15528 15530 15544 15546 15548 15562 15563 15564
 15565 15566 15567 15574 15581 15582 15583 15584 15586 15600 15602 15604
 15618 15619 15620 15621 15622 15623 15630 15637 15638 15639 15640 15642
 15656 15658 15660 15674 15675 15676 15677 15678 15679 20266 20267 20268
 20269 20270 20271 20322 20323 20324 20325 20326 20327 20334 20341 20378
 20379 20380 20381 20382 20383 20390 20397 20434 20

# Top Features = 10028
After Feature Selection, Features -> (264, 10028)
After Feature Selection, Features -> (37, 10028)
Train Data -> Features - (264, 70) and Labels - 264
Test  Data -> Features - (37, 70) and Labels - 37


  1282  1283  1284  1285  1286  1287  1294  1301  1338  1339  1340  1341
  1342  1343  1350  1357  1394  1395  1396  1397  1398  1399  5930  5931
  5932  5933  5934  5935  5986  5987  5988  5989  5990  5991  6042  6043
  6044  6045  6046  6047  6054  6061  6098  6099  6100  6101  6102  6103
  6110  6117  6154  6155  6156  6157  6158  6159 10690 10691 10692 10693
 10694 10695 10746 10747 10748 10749 10750 10751 10802 10803 10804 10805
 10806 10807 10814 10821 10858 10859 10860 10861 10862 10863 10870 10877
 10914 10915 10916 10917 10918 10919 15450 15451 15452 15453 15454 15455
 15462 15469 15482 15483 15484 15485 15486 15487 15500 15501 15502 15503
 15504 15505 15506 15507 15508 15509 15510 15511 15526 15527 15528 15530
 15544 15546 15548 15562 15563 15564 15565 15566 15567 15574 15581 15582
 15583 15584 15586 15600 15602 15604 15618 15619 15620 15621 15622 15623
 15630 15637 15638 15639 15640 15642 15656 15658 15660 15674 15675 15676
 15677 15678 15679 20210 20211 20212 20213 20214 20

# Top Features = 11149
After Feature Selection, Features -> (266, 11149)
After Feature Selection, Features -> (37, 11149)
Train Data -> Features - (266, 74) and Labels - 266
Test  Data -> Features - (37, 74) and Labels - 37


  1294  1301  1338  1339  1340  1341  1342  1343  1350  1357  1394  1395
  1396  1397  1398  1399  5986  5987  5988  5989  5990  5991  6042  6043
  6044  6045  6046  6047  6054  6061  6098  6099  6100  6101  6102  6103
  6110  6117  6154  6155  6156  6157  6158  6159 10746 10747 10748 10749
 10750 10751 10802 10803 10804 10805 10806 10807 10814 10821 10858 10859
 10860 10861 10862 10863 10870 10877 10914 10915 10916 10917 10918 10919
 15450 15451 15452 15453 15454 15455 15462 15469 15482 15483 15484 15485
 15486 15487 15500 15501 15502 15503 15504 15505 15506 15507 15508 15509
 15510 15511 15518 15525 15526 15527 15528 15529 15530 15531 15538 15539
 15544 15545 15546 15547 15548 15549 15556 15557 15562 15563 15564 15565
 15566 15567 15574 15581 15582 15583 15584 15585 15586 15587 15594 15595
 15600 15601 15602 15603 15604 15605 15612 15613 15618 15619 15620 15621
 15622 15623 15630 15637 15638 15639 15640 15641 15642 15643 15650 15651
 15656 15657 15658 15659 15660 15661 15668 15669 15

# Top Features = 10783
After Feature Selection, Features -> (266, 10783)
After Feature Selection, Features -> (36, 10783)
Train Data -> Features - (266, 72) and Labels - 266
Test  Data -> Features - (36, 72) and Labels - 36


  1294  1301  1338  1339  1340  1341  1342  1343  1350  1357  1394  1395
  1396  1397  1398  1399  5986  5987  5988  5989  5990  5991  6042  6043
  6044  6045  6046  6047  6054  6061  6098  6099  6100  6101  6102  6103
  6110  6117  6154  6155  6156  6157  6158  6159 10746 10747 10748 10749
 10750 10751 10802 10803 10804 10805 10806 10807 10814 10821 10858 10859
 10860 10861 10862 10863 10870 10877 10914 10915 10916 10917 10918 10919
 15450 15451 15452 15453 15454 15455 15462 15469 15470 15471 15472 15473
 15474 15475 15482 15483 15484 15485 15486 15487 15488 15489 15490 15491
 15492 15493 15500 15501 15502 15503 15504 15505 15506 15507 15508 15509
 15510 15511 15526 15527 15528 15530 15544 15546 15548 15562 15563 15564
 15565 15566 15567 15574 15581 15582 15583 15584 15586 15600 15602 15604
 15618 15619 15620 15621 15622 15623 15630 15637 15638 15639 15640 15642
 15656 15658 15660 15674 15675 15676 15677 15678 15679 20266 20267 20268
 20269 20270 20271 20322 20323 20324 20325 20326 20

# Top Features = 11988
After Feature Selection, Features -> (266, 11988)
After Feature Selection, Features -> (36, 11988)
Train Data -> Features - (266, 75) and Labels - 266
Test  Data -> Features - (36, 75) and Labels - 36


  1294  1301  1338  1339  1340  1341  1342  1343  1350  1357  1394  1395
  1396  1397  1398  1399  5986  5987  5988  5989  5990  5991  6042  6043
  6044  6045  6046  6047  6054  6061  6098  6099  6100  6101  6102  6103
  6110  6117  6154  6155  6156  6157  6158  6159 10746 10747 10748 10749
 10750 10751 10802 10803 10804 10805 10806 10807 10814 10821 10858 10859
 10860 10861 10862 10863 10870 10877 10914 10915 10916 10917 10918 10919
 15450 15451 15452 15453 15454 15455 15462 15469 15482 15483 15484 15485
 15486 15487 15500 15501 15502 15503 15504 15505 15506 15507 15508 15509
 15510 15511 15526 15527 15528 15530 15544 15546 15548 15562 15563 15564
 15565 15566 15567 15574 15581 15582 15583 15584 15586 15600 15602 15604
 15618 15619 15620 15621 15622 15623 15630 15637 15638 15639 15640 15642
 15656 15658 15660 15674 15675 15676 15677 15678 15679 20266 20267 20268
 20269 20270 20271 20322 20323 20324 20325 20326 20327 20334 20341 20378
 20379 20380 20381 20382 20383 20390 20397 20434 20

# Top Features = 9845
After Feature Selection, Features -> (266, 9845)
After Feature Selection, Features -> (36, 9845)
Train Data -> Features - (266, 76) and Labels - 266
Test  Data -> Features - (36, 76) and Labels - 36


In [75]:
# Printing Final Score
print("~~~~~~~~~~~ Confusion Matrix ~~~~~~~~~~~")
print(final_conf_matrix)
print("~~~~~~~~~~~ AUC Score ~~~~~~~~~~~")
print(final_auc_score)


~~~~~~~~~~~ Confusion Matrix ~~~~~~~~~~~
[[  2  14]
 [ 20 146]]
~~~~~~~~~~~ AUC Score ~~~~~~~~~~~
0.5063279857397504
