In [120]:
#Importing the needed modules
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

## Linear Discriminant Analysis

In [121]:
# Tickers grouped by sector; the combined list keeps the energy names first,
# followed by the technology names.
energy_symbols = 'OXY XOM BP SNPMF COP E TTE PCCYF'.split()
tech_symbols = 'META AAPL AMZN MSFT GOOG NFLX IBM'.split()
symbols = [*energy_symbols, *tech_symbols]
print(symbols)

['OXY', 'XOM', 'BP', 'SNPMF', 'COP', 'E', 'TTE', 'PCCYF', 'META', 'AAPL', 'AMZN', 'MSFT', 'GOOG', 'NFLX', 'IBM']


In [122]:
# Accounting ratios, one row per company in the same order as the ticker list.
# Columns: current ratio, long-term debt / total assets, ROA, ROE.
_ratio_rows = [
    (1.042, 0.270903, 0.07831, 0.23744),  # OXY
    (1.484, 0.109896, 0.10511, 0.27202),  # XOM
    (1.170, 0.151833, 0.08393, 0.23392),  # BP
    (0.859, 0.055403, 0.02233, 0.07048),  # SNPMF
    (1.414, 0.172932, 0.14110, 0.26456),  # COP
    (1.440, 0.127321, 0.04705, 0.17247),  # E
    (1.168, 0.148961, 0.08134, 0.16893),  # TTE
    (0.948, 0.083216, 0.05504, 0.10871),  # PCCYF
    (2.325, 0.053428, 0.11693, 0.17356),  # META
    (0.982, 0.280532, 0.20896, 1.60093),  # AAPL
    (0.948, 0.145134, 0.02467, 0.08715),  # AMZN
    (1.769, 0.101923, 0.14245, 0.38824),  # MSFT
    (2.172, 0.040248, 0.12961, 0.23329),  # GOOG
    (1.293, 0.295363, 0.07737, 0.21228),  # NFLX
    (1.060, 0.362998, 0.04107, 0.10396),  # IBM
]

# Transpose the rows back into the four per-ratio lists used by later cells.
Current_ratio, LTD_TA_ratio, ROA, ROE = (list(column) for column in zip(*_ratio_rows))

In [123]:
#create the dataframe
# Sector labels are derived from the two ticker lists instead of hard-coded
# counts, so they stay aligned with `symbols` if a company is added to or
# removed from either sector.
Sector = ['Energy'] * len(energy_symbols) + ['Technology'] * len(tech_symbols)

# One column per field; every list is ordered like `symbols`.
data = {
    'symbols': symbols,
    'Sector': Sector,
    'Current_ratio': Current_ratio,
    'LTD_TA_ratio': LTD_TA_ratio,
    'ROA': ROA,
    'ROE': ROE
}

df = pd.DataFrame(data)
df


Unnamed: 0,symbols,Sector,Current_ratio,LTD_TA_ratio,ROA,ROE
0,OXY,Energy,1.042,0.270903,0.07831,0.23744
1,XOM,Energy,1.484,0.109896,0.10511,0.27202
2,BP,Energy,1.17,0.151833,0.08393,0.23392
3,SNPMF,Energy,0.859,0.055403,0.02233,0.07048
4,COP,Energy,1.414,0.172932,0.1411,0.26456
5,E,Energy,1.44,0.127321,0.04705,0.17247
6,TTE,Energy,1.168,0.148961,0.08134,0.16893
7,PCCYF,Energy,0.948,0.083216,0.05504,0.10871
8,META,Technology,2.325,0.053428,0.11693,0.17356
9,AAPL,Technology,0.982,0.280532,0.20896,1.60093


In [124]:
#perform Linear Discriminant Analysis
# Predictors: the four accounting ratios. Target: the sector label.
x = df[['Current_ratio', 'ROA', 'LTD_TA_ratio', 'ROE']]
y = df['Sector']

# Fit on the full sample, so the score reported below is in-sample accuracy.
lda = LinearDiscriminantAnalysis().fit(x, y)

# Collect the fitted quantities first, then report them in one place.
accuracy = lda.score(x, y)
coefficients, intercept = lda.coef_, lda.intercept_
explained_variance_ratios = lda.explained_variance_ratio_

print(f'Accuracy: {accuracy}')
print(f'Coefficients: {coefficients}')
print(f'Intercept: {intercept}')
print("Explained Variance Ratios:", explained_variance_ratios)

Accuracy: 0.8666666666666667
Coefficients: [[  8.46529438 -62.17607682  16.53678746   9.05830524]]
Intercept: [-11.21984219]
Explained Variance Ratios: [1.]


In [125]:
#insert LD1 values and predictions in the dataframe
# Compute both model outputs from the fitted four-factor model first ...
X_lda = lda.transform(x)        # projection of each company onto the discriminant axis
predictions = lda.predict(x)    # predicted sector label for each company

# ... then attach them to the dataframe as new columns and display it.
df['LD1'] = X_lda
df['Predictions'] = predictions
df

Unnamed: 0,symbols,Sector,Current_ratio,LTD_TA_ratio,ROA,ROE,LD1,Predictions
0,OXY,Energy,1.042,0.270903,0.07831,0.23744,0.208453,Energy
1,XOM,Energy,1.484,0.109896,0.10511,0.27202,0.35556,Energy
2,BP,Energy,1.17,0.151833,0.08393,0.23392,0.888675,Energy
3,SNPMF,Energy,0.859,0.055403,0.02233,0.07048,1.896983,Energy
4,COP,Energy,1.414,0.172932,0.1411,0.26456,1.351871,Energy
5,E,Energy,1.44,0.127321,0.04705,0.17247,-1.053365,Technology
6,TTE,Energy,1.168,0.148961,0.08134,0.16893,1.152909,Energy
7,PCCYF,Energy,0.948,0.083216,0.05504,0.10871,2.151575,Energy
8,META,Technology,2.325,0.053428,0.11693,0.17356,-2.092307,Technology
9,AAPL,Technology,0.982,0.280532,0.20896,1.60093,-1.87439,Technology


The code fits a Linear Discriminant Analysis (LDA) model using Sector as the dependent variable and the Current Ratio, Long-Term Debt to Total Assets, ROA and ROE as predictors. The coefficients represent the weights assigned to each predictor variable
in the creation of LD1. They determine the linear combination of these variables that best separates the
different groups in the dataset. In this specific output:
Positive coefficients (e.g., Current Ratio, Long-Term Debt to Total Assets and ROE) indicate that an increase in the corresponding variable’s value favors one group; negative coefficients (e.g., ROA) indicate that an increase in the variable’s value favors the other group; the magnitude of the coefficient represents the strength of the influence of the variable on LD1.
The accuracy is equal to 0.866667, meaning that 86.67% of the 'Sector' labels in the training data are predicted correctly by this model. This is also demonstrated by the 'Predictions' column in the dataframe above (13/15 correctly predicted).

Larger magnitude coefficients have a greater impact on the separation of groups.
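The variable with the largest absolute coefficient is “ROA” (Return on Assets), with a coefficient of -62.17607682. This suggests that “ROA” has the best discriminating power and the strongest influence on LD1 in discriminating between the two sectors. Note, however, that the predictors are not standardized, so the raw coefficient magnitudes also reflect each variable's scale and are only an indication of discriminating power. A more direct check is given by the code below, which fits a separate LDA model for each factor and compares their accuracies.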

## Which of the above factors have the best discriminating power?

In [126]:
#factor with the best discriminant power
# Fit one single-predictor LDA per ratio and keep the one with the highest
# in-sample accuracy. The loop uses its own names (factor_lda, factor_accuracy)
# so it does not overwrite the four-factor `lda` and `accuracy` fitted earlier;
# cells that use that model can therefore still be re-run after this one.
best_accuracy = 0
best_coefficient = 0
best_factor = None

for factor in x.columns:
    factor_X = df[[factor]]  # double brackets keep a 2-D frame, as fit() expects
    factor_lda = LinearDiscriminantAnalysis().fit(factor_X, y)
    factor_accuracy = factor_lda.score(factor_X, y)

    # Strict '>' means the first factor reached wins when accuracies tie.
    if factor_accuracy > best_accuracy:
        best_accuracy = factor_accuracy
        best_factor = factor
        best_coefficient = factor_lda.coef_

print(f"The factor with the best discriminating power is {best_factor} with an accuracy of {best_accuracy},\n with coefficient {best_coefficient}")

The factor with the best discriminating power is ROA with an accuracy of 0.6666666666666666,
 with coefficient [[11.25770127]]


## Which pair of the above factors has the best discriminating power?

In [127]:
#pair of factors with the best discriminant power
# Fit one two-predictor LDA per pair of ratios and keep the pair with the
# highest in-sample accuracy. The pairs are built from the predictor columns
# of `x` rather than a second hard-coded list, and the loop uses its own names
# (pair_lda, pair_accuracy) so the four-factor `lda`, `accuracy` and
# `coefficients` fitted earlier are not overwritten.
factor_combinations = list(combinations(list(x.columns), 2))

best_accuracy = 0
best_coefficients = 0
best_factors = None

for factors in factor_combinations:
    pair_X = df[list(factors)]
    pair_lda = LinearDiscriminantAnalysis().fit(pair_X, y)
    pair_accuracy = pair_lda.score(pair_X, y)
    print(f"Pair:{factors}, Discriminating Power: {pair_accuracy}")

    # Strict '>' means the first pair reached wins when accuracies tie.
    if pair_accuracy > best_accuracy:
        best_accuracy = pair_accuracy
        best_factors = factors
        best_coefficients = pair_lda.coef_

print(f"\nThe pair of factors with the best discriminating power is {best_factors} with an accuracy of {best_accuracy},\n with coefficients {best_coefficients}")

Pair:('Current_ratio', 'ROA'), Discriminating Power: 0.6666666666666666
Pair:('Current_ratio', 'LTD_TA_ratio'), Discriminating Power: 0.8
Pair:('Current_ratio', 'ROE'), Discriminating Power: 0.7333333333333333
Pair:('ROA', 'LTD_TA_ratio'), Discriminating Power: 0.6666666666666666
Pair:('ROA', 'ROE'), Discriminating Power: 0.6666666666666666
Pair:('LTD_TA_ratio', 'ROE'), Discriminating Power: 0.6666666666666666

The pair of factors with the best discriminating power is ('Current_ratio', 'LTD_TA_ratio') with an accuracy of 0.8,
 with coefficients [[ 3.67578441 13.91436265]]


The pair with the highest accuracy, and therefore the best discriminating power, is ('Current_ratio', 'LTD_TA_ratio'). Considering both the direction and magnitude of the coefficients, this pair has a strong positive coefficient for “LTD_TA_ratio” and a positive coefficient for “Current_ratio”, indicating a significant influence in discriminating between the groups. The accuracy is equal to 0.8, meaning that 80% of the 'Sector' labels in the training data are predicted correctly by this model.