<a href="https://colab.research.google.com/github/Jaswanth-03/Data_analysis_-univariate-bivariate-multivariate-/blob/main/multivariate_statistical_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multivariate Statistical Analysis

# Pearson Correlation (for numerical-numerical variables)
# Point-Biserial Correlation (for numerical-binary variables)
# Spearman Correlation (for ordinal or ranked data)



In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, pointbiserialr, spearmanr

# Correlation walkthrough on a synthetic dataset.
# Assuming 'data' is your dataset:
#   'numerical_column' and 'numerical_column_2' are continuous columns,
#   'binary_column' is a binary (0 or 1) categorical column,
#   'ordinal_column' is an ordinal categorical column.

# Example dataset.
# A locally seeded generator makes every run of this cell print the same numbers
# without touching NumPy's global random state.
rng = np.random.default_rng(42)
data = pd.DataFrame({
    'numerical_column': rng.standard_normal(100),
    'numerical_column_2': rng.standard_normal(100),
    'binary_column': rng.integers(0, 2, 100),   # upper bound is exclusive -> values 0 or 1
    'ordinal_column': rng.integers(1, 4, 100)   # upper bound is exclusive -> 3 levels: 1, 2, 3
})

# Pearson Correlation (for numerical-numerical variables)
# It measures the linear relationship between two continuous variables.
# It ranges from -1 to +1, where +1 indicates a perfect positive linear relationship,
# -1 indicates a perfect negative linear relationship, and 0 indicates no linear relationship.
# Both inputs are continuous columns here, matching the stated use case
# (the ordinal column is handled by Spearman below).

pearson_corr, pearson_p_value = pearsonr(data['numerical_column'], data['numerical_column_2'])
print("Pearson Correlation Coefficient:", pearson_corr)
print("Pearson p-value:", pearson_p_value)
# Additional info: The p-value assumes the variables are normally distributed; the coefficient
# itself only captures linear association.

# Point-Biserial Correlation (for numerical-binary variables)
# It measures the strength and direction of association between a binary variable and a continuous variable.
# It ranges from -1 to +1, where +1 indicates a perfect positive association,
# -1 indicates a perfect negative association, and 0 indicates no association.
# SciPy documents the argument order as (binary, continuous); the coefficient is symmetric,
# so the value is the same either way, but the call now matches the documented signature.

point_biserial_corr, point_biserial_p_value = pointbiserialr(data['binary_column'], data['numerical_column'])
print("Point-Biserial Correlation Coefficient:", point_biserial_corr)
print("Point-Biserial p-value:", point_biserial_p_value)
# Additional info: Assumes the continuous variable is normally distributed.

# Spearman Correlation (for ordinal or ranked data)
# It measures the strength and direction of monotonic association between two variables.
# It does not assume linearity and can handle non-linear (but monotonic) relationships.
# It ranges from -1 to +1, similar to Pearson correlation.

spearman_corr, spearman_p_value = spearmanr(data['numerical_column'], data['ordinal_column'])
print("Spearman Correlation Coefficient:", spearman_corr)
print("Spearman p-value:", spearman_p_value)
# Additional info: Appropriate for ordinal or ranked data, less sensitive to outliers.


Pearson Correlation Coefficient: -0.09021640798412515
Pearson p-value: 0.3720480081988198
Point-Biserial Correlation Coefficient: 0.008165416980407094
Point-Biserial p-value: 0.9357370760726117
Spearman Correlation Coefficient: -0.05652505793308735
Spearman p-value: 0.5764409659091764


# One-Way ANOVA (for one categorical independent variable)


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway

# Example dataset.
# A locally seeded generator keeps the F-statistic, p-value and conclusion identical across runs.
rng = np.random.default_rng(42)
data = pd.DataFrame({
    'categorical_column_1': rng.choice(['A', 'B', 'C'], 100),
    'categorical_column_2': rng.choice(['X', 'Y'], 100),
    'numerical_column': rng.standard_normal(100),
})

# One-Way ANOVA (for one categorical independent variable)
# It tests whether the means of a numerical variable differ across levels of a categorical variable.
# It's used to determine if there are any statistically significant differences between the means of three or more independent (unrelated) groups.
# It's often used in experimental research to compare means across different treatment groups.

# Example of One-Way ANOVA
# Here, 'numerical_column' is the dependent variable and 'categorical_column_1' is the independent variable.
# We're testing if the mean of 'numerical_column' differs across levels of 'categorical_column_1'.
# groupby splits the dependent variable into one Series per level in a single pass,
# avoiding a boolean mask plus chained indexing for every level.
samples_per_group = [
    values for _, values in data.groupby('categorical_column_1')['numerical_column']
]
result_one_way_anova = f_oneway(*samples_per_group)
print("One-Way ANOVA F-statistic:", result_one_way_anova.statistic)
print("One-Way ANOVA p-value:", result_one_way_anova.pvalue)
# Additional info: Assumes normal distribution of data and equal variances between groups.

# One-Way ANOVA Conclusion
alpha = 0.05  # significance level
if result_one_way_anova.pvalue < alpha:
    print("Conclusion: Reject the null hypothesis.")
    print("There is sufficient evidence to conclude that there are significant differences in means across groups.")
else:
    print("Conclusion: Fail to reject the null hypothesis.")
    print("There is not enough evidence to conclude that there are significant differences in means across groups.")


One-Way ANOVA F-statistic: 0.6293970556434174
One-Way ANOVA p-value: 0.5350751240705005
Conclusion: Fail to reject the null hypothesis.
There is not enough evidence to conclude that there are significant differences in means across groups.


# Two-Way ANOVA

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Example dataset.
# A locally seeded generator keeps the ANOVA table and the conclusion identical across runs.
rng = np.random.default_rng(42)
data = pd.DataFrame({
    'categorical_column_1': rng.choice(['A', 'B', 'C'], 100),
    'categorical_column_2': rng.choice(['X', 'Y'], 100),
    'numerical_column': rng.standard_normal(100),
})

# Two-Way ANOVA (for two categorical independent variables)
# It extends One-Way ANOVA to include the effects of two categorical independent variables on a numerical dependent variable.
# It allows us to test the main effects of each independent variable as well as the interaction effect between them.
# It's used when you have two categorical independent variables and one numerical dependent variable.
# It's commonly used in experimental design to analyze the impact of two factors on an outcome simultaneously.

# Example of Two-Way ANOVA
# Here, 'numerical_column' is the dependent variable, and we're considering the interaction between 'categorical_column_1' and 'categorical_column_2'.
# In the formula, `A * B` expands to both main effects plus the A:B interaction term.
model = ols('numerical_column ~ C(categorical_column_1) * C(categorical_column_2)', data).fit()
anova_table = anova_lm(model, typ=2)  # typ=2 requests Type II sums of squares
print("\nTwo-Way ANOVA Summary:")
print(anova_table)
# Additional info: Interaction effect tests whether the effect of one independent variable differs across levels of the other independent variable.

# Conclusion from Two-Way ANOVA
alpha = 0.05  # significance level
# The interaction row is labelled with the two C(...) terms joined by ':'.
p_interaction = anova_table.loc['C(categorical_column_1):C(categorical_column_2)', 'PR(>F)']  # p-value for interaction effect
if p_interaction < alpha:
    print("\nConclusion: There is a significant interaction effect between 'categorical_column_1' and 'categorical_column_2'.")
else:
    print("\nConclusion: There is no significant interaction effect between 'categorical_column_1' and 'categorical_column_2'.")



Two-Way ANOVA Summary:
                                                    sum_sq    df         F  \
C(categorical_column_1)                           1.598626   2.0  0.955588   
C(categorical_column_2)                           1.249020   1.0  1.493218   
C(categorical_column_1):C(categorical_column_2)   0.964307   2.0  0.576420   
Residual                                         78.627444  94.0       NaN   

                                                   PR(>F)  
C(categorical_column_1)                          0.388290  
C(categorical_column_2)                          0.224773  
C(categorical_column_1):C(categorical_column_2)  0.563880  
Residual                                              NaN  

Conclusion: There is no significant interaction effect between 'categorical_column_1' and 'categorical_column_2'.


# MANCOVA

In [None]:
import pandas as pd
import numpy as np
from statsmodels.multivariate.manova import MANOVA

# Example dataset.
# A locally seeded generator keeps the MANCOVA table and the conclusion identical across runs.
rng = np.random.default_rng(42)
data = pd.DataFrame({
    'categorical_column': rng.choice(['A', 'B', 'C'], 100),
    'numerical_column_1': rng.standard_normal(100),
    'numerical_column_2': rng.standard_normal(100),
    'numerical_column_3': rng.standard_normal(100),
    'covariate_1': rng.random(100),
    'covariate_2': rng.random(100)
})

# MANCOVA (Multivariate Analysis of Covariance) if you have multiple dependent variables and continuous covariates
# MANCOVA extends MANOVA by including one or more continuous covariates in the analysis.
# It assesses whether the mean vectors of the multiple dependent variables differ significantly across levels of one or more independent variables, while controlling for the effects of continuous covariates.
# MANCOVA is used when you want to investigate the impact of categorical factors on multiple outcome variables while considering the influence of continuous covariates.
# It's commonly applied in research studies where there is a need to control for potential confounding variables.

# Example of MANCOVA
# Here, 'categorical_column' is the independent variable, 'numerical_column_1', 'numerical_column_2', and 'numerical_column_3' are dependent variables,
# and 'covariate_1' and 'covariate_2' are continuous covariates.
manova_model = MANOVA.from_formula('numerical_column_1 + numerical_column_2 + numerical_column_3 ~ categorical_column + covariate_1 + covariate_2', data=data)
manova_results = manova_model.mv_test()

# Print the MANCOVA summary
print("\nMANCOVA Summary:")
print(manova_results.summary())

# Conclusion from MANCOVA
alpha = 0.05  # significance level
# The tables inside summary() are DataFrames and have no `.data` attribute, so the p-values
# are read from the structured results instead: `results` maps each model term to a dict
# whose 'stat' entry is a DataFrame with one row per multivariate test statistic.
# Only the 'categorical_column' term is inspected, because the conclusion below is about
# differences across its groups; intercept and covariate p-values say nothing about that.
group_effect_stats = manova_results.results['categorical_column']['stat']
p_values = group_effect_stats['Pr > F'].astype(float).tolist()

# NOTE: this flags significance if ANY of the four statistics (Wilks, Pillai,
# Hotelling-Lawley, Roy) falls below alpha.
if any(pval < alpha for pval in p_values):
    print("\nConclusion: At least one of the dependent variables shows a significant difference across groups.")
else:
    print("\nConclusion: None of the dependent variables show a significant difference across groups.")




MANCOVA Summary:
                  Multivariate linear model
                                                             
-------------------------------------------------------------
        Intercept        Value  Num DF  Den DF F Value Pr > F
-------------------------------------------------------------
           Wilks' lambda 0.9963 3.0000 93.0000  0.1153 0.9509
          Pillai's trace 0.0037 3.0000 93.0000  0.1153 0.9509
  Hotelling-Lawley trace 0.0037 3.0000 93.0000  0.1153 0.9509
     Roy's greatest root 0.0037 3.0000 93.0000  0.1153 0.9509
-------------------------------------------------------------
                                                             
-------------------------------------------------------------
   categorical_column   Value  Num DF  Den DF  F Value Pr > F
-------------------------------------------------------------
          Wilks' lambda 0.9592 6.0000 186.0000  0.6527 0.6879
         Pillai's trace 0.0409 6.0000 188.0000  0.6545 0.6865
 Hotelli

AttributeError: 'DataFrame' object has no attribute 'data'