In [None]:
#import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import itertools

#import dataset
data = pd.read_csv('pid-5M.csv')

In [None]:
#Data vis 1: plot correlation grid (on the raw data)
data.dataframeName = 'pid-5M.csv'

def plotCorrelationMatrix(df, graphWidth):
    filename = df.dataframeName
    df = df.dropna('columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

plotCorrelationMatrix(data,8)

In [None]:
#Data vis 2: scatter plots for each feature against other features

def plotScatterMatrix(df, plotSize, textSize):
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna('columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    for i, j in zip(*plt.np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()
    
plotScatterMatrix(data,20,10)

In [21]:
#split up data 

D = np.array(data)
X = D[:,1:7]
y = D[:,0]

#different modified data sets with training and testing data for each modification  --------------------------------

# A) unmodified data set
def datasetA():
    return X, y

# B) data set with all observations with zero values for inner or outer energy removed  -----------------------------
def datasetB():
    filterIx = (X[:,4] != 0) & (X[:,5] != 0)
    return X[filterIx], y[filterIx]
    
# C) data set with all outer and inner energy features removed      -------------------------------------------
def datasetC():
    return X[:,0:4], y
    
# D) data set with zero values for outer and inner energy replaced with label averages for inner and outer energy
def datasetD():
    labels = [211., 321., -11., 2212.]
    for label in labels:
        label_features = D[y == label, 5:7]
        label_averages = np.sum(label_features, axis=0)/label_features.shape[0]
        D[np.where(D[:,5]==0), 5] = label_averages[0]
        D[np.where(D[:,6]==0), 6] = label_averages[1]
    return D[:,1:-1], y
        
# E) PCA     -------------------------------------------------------------------------------------------
def datasetE():
    from sklearn.decomposition import PCA
    return PCA(svd_solver='full', n_components='mle').fit_transform(X), y

In [22]:

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# returns (accuracy score, # classes, running time)
def run_model(model, dataset):
    model_name = type(model).__name__
    model_count = dataset[0]
    from time import perf_counter
    tick = perf_counter()
    X_train, X_test, y_train, y_test = dataset[1]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    tock = perf_counter()
    from sklearn.metrics import accuracy_score
    result = accuracy_score(y_test, y_pred), np.unique(np.array(y_pred)), tock - tick
    print("(" + model_name + " on data set " + str(model_count) + ")", result)
    return result


# returns (model, dataset) matrix of (accuracy score, # classes, running time) results
def run_models(models, datasets):
    print("Running models...")
    from multiprocessing import Pool
    with Pool(processes = 4) as pool:
        return np.array(
            pool.starmap(run_model,[(model, dataset) for model in models for dataset in datasets])
        ).reshape((len(models), len(datasets), 3))
    
    
# Include models
def build_models():
    print("Building models...")
    lg_classifier = LogisticRegression(random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
    nb_classifier = GaussianNB()
    nn_classifier = MLPClassifier(random_state = 0)
    dt_classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    rf_classifier = RandomForestClassifier(criterion = 'entropy', random_state = 0)
    return [lg_classifier, nb_classifier, nn_classifier, dt_classifier, rf_classifier]
    
    
def generate_dataset(i, dataset_generator):
    X, y = dataset_generator()
    from sklearn.model_selection import train_test_split
    dataset = X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    while (len(np.unique(y_train)) != 4 or len(np.unique(y_test)) != 4):
        dataset = X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    return i, dataset


# Include datasets (datasetX)
def build_datasets():
    print("Building datasets...")
    return [generate_dataset(i, dataset_generator) for i, dataset_generator in enumerate([datasetA, datasetB, datasetC, datasetD, datasetE])]

# WIP
def render_results(results):
    print("Rendering results...")
    plt.matshow(results)
    plt.show()
    
    
def run():
    models = build_models()
    datasets = build_datasets()
    run_models(models, datasets)
    
    
run()

Building models...
Building datasets...
Running models...




(LogisticRegression on data set 1) (0.9159114634552249, array([ -11.,  211., 2212.]), 60.59097301000293)




(LogisticRegression on data set 0) (0.918399, array([ -11.,  211.,  321., 2212.]), 135.36239122400002)


Process ForkPoolWorker-29:
Process ForkPoolWorker-28:
Traceback (most recent call last):
  File "/Users/josephdenman/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/josephdenman/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/josephdenman/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/Users/josephdenman/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 47, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "<ipython-input-22-034552bf3976>", line 15, in run_model
    model.fit(X_train, y_train)
  File "/Users/josephdenman/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py", line 1606, in fit
    for class_, warm_start_coef_ in zip(classes_, warm_start_coef))
  File "/Users/josephdenman/opt/anaconda3/lib/pytho

KeyboardInterrupt: 