In [1]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
def data_genr(n, p=15, rng=None):
    """Simulate a two-class Gaussian dataset with a shared covariance matrix.

    Class 1 is drawn from N(3 * 1_p, S) and class 2 from N(2 * 1_p, S), where
    S = I_p + 0.2. The constant 0.2 is added to *every* entry, so the
    diagonal is 1.2 and every off-diagonal entry is 0.2.

    Parameters
    ----------
    n : int
        Total number of rows to generate. Class 1 receives ``n // 2`` rows and
        class 2 receives the remainder, so an odd ``n`` still yields exactly
        ``n`` rows.
    p : int, default 15
        Number of feature columns.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global legacy state is
        used, so ``np.random.seed(...)`` continues to control the draws.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with columns ``x1`` .. ``x{p}`` followed by ``y``. Features
        and labels are stacked into a single float array, so ``y`` holds the
        values 1.0 and 2.0.
    """
    n1 = n // 2
    n2 = n - n1  # remainder goes to class 2 so that n1 + n2 == n when n is odd
    cov_1 = np.eye(p) + 0.2

    mean_class1 = np.full(p, 3)
    mean_class2 = np.full(p, 2)

    # Fall back to the module-level (global-state) sampler when no generator is given.
    sampler = np.random if rng is None else rng
    x_class1 = sampler.multivariate_normal(mean_class1, cov_1, n1)
    x_class2 = sampler.multivariate_normal(mean_class2, cov_1, n2)

    x = np.vstack([x_class1, x_class2])
    y = np.repeat([1, 2], [n1, n2])

    feature_names = [f'x{i}' for i in range(1, p + 1)]
    df = pd.DataFrame(np.column_stack([x, y]), columns=feature_names + ['y'])
    return df

# Draw a 100-row sample (50 rows per class) to inspect the generator's output.
generated_dataset = data_genr(100)

In [3]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
generated_dataset

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,y
0,2.049759,1.556235,2.397898,2.903992,3.942776,3.310605,4.037499,3.835274,4.007224,3.135375,1.915932,2.979590,1.867080,1.350044,1.520048,1.0
1,2.808360,2.109678,3.827392,1.887116,2.125454,2.630668,5.551094,2.276143,2.700467,3.360015,3.597011,4.868645,2.520252,2.677242,3.614518,1.0
2,2.961098,4.290155,3.211514,2.329142,2.694602,5.283461,3.091141,4.385692,2.732884,3.335693,3.760156,2.365514,1.980809,2.214951,3.555277,1.0
3,1.122626,2.687332,4.147123,1.341273,1.435214,3.022827,1.714520,4.083020,2.027983,1.483456,2.564017,4.799621,2.635820,2.153083,1.960481,1.0
4,1.068101,3.232945,2.849015,3.308558,3.415500,3.215971,3.032413,1.979007,2.060713,4.016058,4.164741,4.017421,0.725203,1.949126,2.568909,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2.114390,2.008320,2.108734,2.320380,1.804203,3.279793,1.788327,1.043786,2.853053,1.984094,2.824951,3.327156,-0.073229,2.521240,3.536249,2.0
96,1.666302,1.073130,0.261264,3.362309,1.614086,2.877166,0.458971,0.658998,0.114873,1.357567,1.309768,0.731220,1.314167,3.436277,2.507507,2.0
97,4.482355,3.876927,1.294535,2.030604,2.696068,1.988694,2.085538,1.569763,2.506488,2.537408,1.740857,0.800607,3.370707,2.570544,2.199424,2.0
98,2.033983,3.024787,1.680469,1.406459,1.620523,1.535627,0.286010,2.484191,0.510234,3.091351,2.780763,3.207334,2.255786,2.984089,2.489888,2.0


## Generating Data

#### Training Data 50

In [4]:
# 50-row draw (25 per class).
# NOTE(review): this frame is not referenced again in the visible cells; the
# model functions call data_genr themselves on every repetition.
train_data_50_size = data_genr(50)

#### Training Data 10000


In [5]:
# 10000-row draw (5000 per class).
# NOTE(review): this frame is not referenced again in the visible cells; the
# model functions call data_genr themselves on every repetition.
train_data_10000_size = data_genr(10000)

#### Testing Data 10000

In [6]:
# 10000-row draw (5000 per class), independent of the training draws.
# NOTE(review): this frame is not referenced again in the visible cells; the
# model functions call data_genr themselves on every repetition.
test_data_10000_size = data_genr(10000)

## Performance of LDA and Logistic Regression

### Logistic Regression Small Dataset

### Logistic Regression Large Dataset

### LDA Large Dataset

### LDA Small Dataset

## Training and Evaluating the models

### LDA model

In [7]:
def LDA_model_func(train_data, test_data, reps):
    """Estimate the mean balanced accuracy of LDA over repeated simulations.

    Despite their names, ``train_data`` and ``test_data`` are row counts that
    are forwarded to ``data_genr``; a fresh training set and a fresh test set
    are simulated on every repetition.

    Parameters
    ----------
    train_data : int
        Number of rows to simulate for each training set.
    test_data : int
        Number of rows to simulate for each test set.
    reps : int
        Number of independent train/test repetitions to average over.

    Returns
    -------
    float
        Mean of the per-repetition balanced accuracy scores.
    """
    scores = []
    for _ in range(reps):
        # Training data is drawn before test data on each pass.
        train_df = data_genr(train_data)
        test_df = data_genr(test_data)

        # The last column is the label 'y'; everything before it is a feature.
        classifier = LinearDiscriminantAnalysis().fit(train_df.iloc[:, :-1], train_df['y'])
        predicted = classifier.predict(test_df.iloc[:, :-1])

        scores.append(balanced_accuracy_score(test_df['y'], predicted))
    return np.mean(scores)

### Logistic Regression

In [8]:
def LoRe_model_func(train_data, test_data, reps):
    """Estimate the mean balanced accuracy of logistic regression over repeated simulations.

    Despite their names, ``train_data`` and ``test_data`` are row counts that
    are forwarded to ``data_genr``; a fresh training set and a fresh test set
    are simulated on every repetition. The classifier is a
    ``LogisticRegression`` built with its default constructor arguments.

    Parameters
    ----------
    train_data : int
        Number of rows to simulate for each training set.
    test_data : int
        Number of rows to simulate for each test set.
    reps : int
        Number of independent train/test repetitions to average over.

    Returns
    -------
    float
        Mean of the per-repetition balanced accuracy scores.
    """
    scores = []
    for _ in range(reps):
        # Training data is drawn before test data on each pass.
        train_df = data_genr(train_data)
        test_df = data_genr(test_data)

        # The last column is the label 'y'; everything before it is a feature.
        classifier = LogisticRegression().fit(train_df.iloc[:, :-1], train_df['y'])
        predicted = classifier.predict(test_df.iloc[:, :-1])

        scores.append(balanced_accuracy_score(test_df['y'], predicted))
    return np.mean(scores)

In [9]:
# Experiment settings passed to LDA_model_func / LoRe_model_func.
repetitions = 100  # number of independent train/test simulations averaged per setting
train_data_size_50 = 50  # rows in the small training set
train_data_size_10000 = 10000  # rows in the large training set
test_data_size_10000 = 10000  # rows in every test set

In [10]:
# Mean balanced accuracy of LDA across `repetitions` runs, small training set.
LDA_avg_size_50 = LDA_model_func(train_data_size_50,test_data_size_10000,repetitions)

# Same evaluation with the large training set.
LDA_avg_size_10000 = LDA_model_func(train_data_size_10000,test_data_size_10000,repetitions)

In [11]:
# Mean balanced accuracy of logistic regression across `repetitions` runs, small training set.
LR_avg_size_50 = LoRe_model_func(train_data_size_50,test_data_size_10000,repetitions)

# Same evaluation with the large training set.
LR_avg_size_10000 = LoRe_model_func(train_data_size_10000,test_data_size_10000,repetitions)

## Results

### LDA

In [12]:
# Report the averaged LDA scores for both training-set sizes.
print(f"Average LDA Balanced Accuracy (Training Set Size 50): {LDA_avg_size_50}")
print(f"Average LDA Balanced Accuracy (Training Set Size 10000): {LDA_avg_size_10000}")



Average LDA Balanced Accuracy (Training Set Size 50): 0.7592110000000001
Average LDA Balanced Accuracy (Training Set Size 10000): 0.832385


In [13]:
# Report the averaged logistic-regression scores for both training-set sizes.
print(f"Average Logistic Regression Balanced Accuracy (Training Set Size 50): {LR_avg_size_50}")
print(f"Average Logistic Regression Balanced Accuracy (Training Set Size 10000): {LR_avg_size_10000}")

Average Logistic Regression Balanced Accuracy (Training Set Size 50): 0.7848979999999999
Average Logistic Regression Balanced Accuracy (Training Set Size 10000): 0.8334380000000001


## Result analysis

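### LR and LDA Training Set Size 50:

With only 50 training rows, logistic regression (balanced accuracy ≈ 0.785) outperforms LDA (≈ 0.759). A likely reason is that scikit-learn's `LogisticRegression` applies L2 regularization by default, which stabilizes the 15 coefficient estimates, whereas the default LDA must estimate a full 15 × 15 covariance matrix from very few observations. Because no random seed is set, the exact figures will vary slightly between runs.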

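### LR and LDA Training Set Size 10000:

With 10,000 training rows the two methods are essentially tied (LDA ≈ 0.832, logistic regression ≈ 0.833). Both are close to the Bayes-optimal accuracy for this design: with mean difference $\delta = \mathbf{1}_{15}$ and covariance $\Sigma = I_{15} + 0.2\,\mathbf{1}\mathbf{1}^\top$, the squared Mahalanobis distance is $\delta^\top \Sigma^{-1} \delta = 15/4 = 3.75$, giving $\Phi(\sqrt{3.75}/2) \approx 0.83$. The remaining gap between the two models is within simulation noise.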