## Generating data using Python

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:

def gen_data(n, random_state=None):
    """Simulate a two-class data set with three Gaussian features.

    Class 1 is drawn from a normal distribution with mean (3, 3, 3) and
    covariance ``cov_1`` (1.2 on the diagonal, 0.2 elsewhere). Class 2 has
    mean (2, 2, 2) and covariance ``cov_2``, which equals ``cov_1`` except
    that the covariance between x1 and x2 is raised by 0.5 (to 0.7). The
    classes therefore differ in both mean and covariance.

    Parameters
    ----------
    n : int
        Total number of rows to generate. Class 1 receives ``n // 2`` rows
        and class 2 receives the remainder, so an odd ``n`` still yields
        exactly ``n`` rows.
    random_state : int or None, optional
        Seed for a private ``np.random.RandomState``. When ``None`` (the
        default) the global ``np.random`` state is used, so existing calls
        behave as before.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with columns ``x1``, ``x2``, ``x3`` and ``y``. ``y`` holds
        the class label (1 or 2) stored as a float, because features and
        labels are stacked into a single float array.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    p = 3
    # Give the leftover row to class 2 when n is odd instead of dropping it.
    n1 = n // 2
    n2 = n - n1

    cov_1 = np.eye(p) + 0.2
    cov_2 = np.copy(cov_1)
    # Keep cov_2 symmetric while strengthening the x1/x2 covariance.
    cov_2[0, 1] = cov_2[1, 0] = cov_2[0, 1] + 0.5

    mean_class1 = np.array([3] * p)
    mean_class2 = np.array([2] * p)

    # Fall back to the module-level generator so unseeded calls consume the
    # same global random stream as the original implementation.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    x_class1 = rng.multivariate_normal(mean_class1, cov_1, n1)
    x_class2 = rng.multivariate_normal(mean_class2, cov_2, n2)

    x = np.vstack([x_class1, x_class2])
    y = np.repeat([1, 2], [n1, n2])

    df = pd.DataFrame(np.column_stack([x, y]), columns=[f'x{i}' for i in range(1, p+1)] + ['y'])
    return df



In [3]:
# Draw the three simulated data sets in a fixed order: small training set,
# large training set, then the test set.
train_set_size_50, train_set_size_10000, test_set_size_10000 = (
    gen_data(size) for size in (50, 10000, 10000)
)

# Each heading is followed by its frame on the next line.
print("Training Set (Size 50):", train_set_size_50, sep="\n")
print("\nTraining Set (Size 10000):", train_set_size_10000, sep="\n")
print("\nTest Set (Size 10000):", test_set_size_10000, sep="\n")


Training Set (Size 50):
          x1        x2        x3    y
0   2.544141  2.627882  2.481082  1.0
1   2.080806  4.109955  3.776181  1.0
2   5.382596  3.631755  3.296116  1.0
3   2.986124  2.163454  1.668546  1.0
4   1.636518  2.051159  3.207310  1.0
5   5.068398  3.646277  4.319991  1.0
6   3.340007  1.326764  2.154505  1.0
7   4.839580  4.816918  3.528602  1.0
8   1.247073  2.786365  4.292978  1.0
9   4.938724  2.805459  3.408407  1.0
10  3.526816  2.459638  2.454085  1.0
11  3.495527  3.102929  3.364745  1.0
12  2.816547  3.703979  5.286301  1.0
13  2.917732  3.237356  4.574412  1.0
14  2.827822  1.797380  3.751369  1.0
15  3.826023  2.121357  3.734530  1.0
16  2.021079  3.822270  1.516264  1.0
17  3.368200  1.203428  2.949298  1.0
18  4.218971  3.012775  0.678013  1.0
19  2.422512  1.765429  3.763733  1.0
20  4.082331  2.908635  2.649265  1.0
21  2.472447  0.276598  3.037664  1.0
22  4.266401  2.354535  2.967633  1.0
23  3.893766  2.381152  5.079045  1.0
24  1.365087  2.112895  3.

### Performance of models for the training sets

### Small training set

### Large training set

## Building the models

In [4]:
def lda_qda_models(training_size, testing_size, no_of_repetitions):
    """Estimate the mean test accuracy of LDA and QDA over repeated simulations.

    Each repetition draws a fresh training frame and a fresh test frame from
    ``gen_data``, fits both classifiers with their default settings on the
    training frame, and scores them on the test frame.

    Parameters
    ----------
    training_size : int
        Size passed to ``gen_data`` for each training draw.
    testing_size : int
        Size passed to ``gen_data`` for each test draw.
    no_of_repetitions : int
        Number of independent train/test draws to average over.

    Returns
    -------
    tuple
        ``(mean LDA accuracy, mean QDA accuracy)``, each computed with
        ``np.mean`` over the per-repetition accuracy scores.
    """
    classifiers = (LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis)
    # One list of accuracy scores per classifier, in the same order as above.
    accuracies = [[] for _ in classifiers]

    for _ in range(no_of_repetitions):
        # Training data is drawn before test data in every repetition.
        train_df = gen_data(training_size)
        features_train, labels_train = train_df.iloc[:, :-1], train_df['y']

        test_df = gen_data(testing_size)
        features_test, labels_test = test_df.iloc[:, :-1], test_df['y']

        for scores, make_model in zip(accuracies, classifiers):
            fitted = make_model()
            fitted.fit(features_train, labels_train)
            predicted = fitted.predict(features_test)
            scores.append(accuracy_score(labels_test, predicted))

    lda_scores, qda_scores = accuracies
    return np.mean(lda_scores), np.mean(qda_scores)



In [5]:
# Experiment settings: number of simulation repetitions and data set sizes.
repetitions = 100
train_size_50, train_size_10000, test_size_10000 = 50, 10000, 10000

# Small-sample run first, then the large-sample run.
mean_LDA_train_50, mean_QDA_train_50 = lda_qda_models(
    train_size_50, test_size_10000, repetitions
)
mean_LDA_train_10000, mean_QDA_train_10000 = lda_qda_models(
    train_size_10000, test_size_10000, repetitions
)

# Report all four averages; the leading newline on the third line separates
# the two training-size groups with a blank line.
print(
    f"Average LDA Accuracy (Training Set Size 50): {mean_LDA_train_50}",
    f"Average QDA Accuracy (Training Set Size 50): {mean_QDA_train_50}",
    f"\nAverage LDA Accuracy (Training Set Size 10000): {mean_LDA_train_10000}",
    f"Average QDA Accuracy (Training Set Size 10000): {mean_QDA_train_10000}",
    sep="\n",
)


Average LDA Accuracy (Training Set Size 50): 0.726415
Average QDA Accuracy (Training Set Size 50): 0.723694

Average LDA Accuracy (Training Set Size 10000): 0.7438429999999998
Average QDA Accuracy (Training Set Size 10000): 0.7566740000000002


## Results