In [80]:
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.formula.api import ols

In [81]:
# Load the validated survey responses. The file is decoded as cp1252 and any
# bytes that cannot be decoded are dropped instead of raising an error.
data = pd.read_csv(
    "data_640_validated.csv",
    encoding='cp1252',
    encoding_errors="ignore",
)


In [82]:
# Task (b) only needs the socio-demographic profile columns (A*) and the
# environmental perception columns (C*).

columns_to_keep = ['A1_1', 'A1_2', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15']

# Take an explicit copy: columns of this frame are reassigned in later cells,
# and assigning into a selection of another frame raises pandas'
# SettingWithCopyWarning.
data = data[columns_to_keep].copy()

In [83]:
## BEFORE AVERAGING, MAKE SURE THAT THE REVERSE-SCALED ITEMS ARE REVERSED SO THAT THE AVERAGE ENVIRONMENTAL PERCEPTION SCORE IS ACCURATE.
## E.G. A 5 (STRONGLY AGREEING) ON BELIEVING WE ARE REACHING THE LIMIT INDICATES HIGH ENVIRONMENTAL PERCEPTION,
## WHILE A 1 (STRONGLY DISAGREEING) ON BELIEVING HUMANS CAN CUT DOWN TREES FOR THEIR OWN GOOD ALSO INDICATES HIGH ENVIRONMENTAL PERCEPTION.


def mirror_values(val):
    """Reverse-score a single 1-5 response.

    1 <-> 5 and 2 <-> 4 are swapped. Any other value (the midpoint 3, or
    anything that does not compare equal to 1, 2, 4 or 5) is returned
    unchanged.
    """
    # (response, mirrored response) pairs, checked in order.
    swaps = ((1, 5), (2, 4), (4, 2), (5, 1))
    for response, mirrored in swaps:
        if val == response:
            return mirrored
    return val

# Items that are reverse-scored before averaging.
columns_to_transform = ['C2', 'C4', 'C6', 'C8', 'C10', 'C12', 'C14']

# Mirroring is its own inverse, so running this cell a second time on the same
# frame would silently restore the original scores. Record on the frame that
# the reversal has been applied and skip it on re-runs.
if not data.attrs.get('reverse_scored', False):
    for col in columns_to_transform:
        data[col] = data[col].apply(mirror_values)
    data.attrs['reverse_scored'] = True

In [88]:
# One approach: average all environmental perception items (C1..C15) into a
# single score per respondent.
# Alternative considered: average only the odd-numbered items
# ['C1', 'C3', 'C5', 'C7', 'C9', 'C11', 'C13', 'C15'].
env_perception_columns = [f'C{item}' for item in range(1, 16)]

# Row-wise mean across the selected items.
perception_items = data[env_perception_columns]
data['avg_env_perception'] = perception_items.mean(axis='columns')

In [None]:
"""
F-Score in ANOVA
The F-score in ANOVA is a statistical measure used to determine whether there are any statistically significant differences between the means of various groups.

Gender Example
Imagine you conducted a study to see if there are differences in environmental perception scores between different genders. You have a dataset with a continuous variable (environmental perception scores) and a categorical variable (gender with categories like male, female, non-binary).

How the F-Score is Calculated
Between-Group Variability (Numerator):

First, for each gender category, the average environmental perception score is calculated.
Then, we see how much each group's mean deviates from the overall mean (regardless of gender).
These deviations are squared, weighted by the number of observations in each gender category, and summed up. This gives the "between-group sum of squares."
Within-Group Variability (Denominator):

Within each gender category, the deviation of each individual's score from their gender group's mean is calculated.
These deviations are squared and summed up across all individuals and all gender groups. This gives the "within-group sum of squares."
The F-Score:

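The between-group sum of squares is divided by its degrees of freedom (number of groups minus one) to give the between-group mean square.
The within-group sum of squares is divided by its degrees of freedom (total number of observations minus the number of groups) to give the within-group mean square.
The F-score is the ratio of these two mean squares (between-group variance to within-group variance): F = [SS_between / (k - 1)] / [SS_within / (N - k)], where k is the number of groups and N is the total number of observations.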
What the F-Score Tells You
A high F-score suggests that the variability of group means (between males, females, non-binary individuals in this case) is larger than the variability within the groups. This could indicate that gender has a significant effect on environmental perception scores.
A low F-score suggests that most variability in environmental perception scores is within the gender groups rather than between them, indicating that gender may not have a significant effect.





P-Value in ANOVA
The p-value in ANOVA is a statistical measure used to determine whether the differences observed between the means of various groups are statistically significant.

Gender Example
Continuing with your study on environmental perception scores across different genders (male, female, non-binary), after calculating the F-score in ANOVA, the next step is to interpret its significance, which is where the p-value comes in.

Understanding the P-Value
Statistical Significance:

The p-value tells you the probability of observing an F-score as extreme as, or more extreme than, the one you calculated if the null hypothesis is true. In ANOVA, the null hypothesis typically states that all group means (across different genders in this case) are equal.
Interpreting the P-Value:

A small p-value (typically ≤ 0.05) suggests that the differences in group means are statistically significant. This would mean that differences in environmental perception scores as large as those observed would be unlikely if all gender groups truly had the same mean.
A large p-value indicates that the differences in group means could be due to random chance, suggesting that gender may not have a significant effect on environmental perception scores.
What the P-Value Tells You in the Gender Example
If you get a low p-value (e.g., less than 0.05), it suggests that there are statistically significant differences in environmental perception scores between at least some of the gender groups.
If the p-value is high, it suggests that any differences observed in environmental perception scores across gender groups are not statistically significant and could reasonably occur by random variation in a population where gender has no effect on perception scores.
Important Consideration
The p-value should be interpreted in the context of your study design, sample size, and other factors. For example, very large sample sizes can lead to very small p-values even for minor differences that might not be practically significant.


"""

In [89]:
## Nominal variable
## A1.1 - Nationality

# One-way ANOVA: average perception score explained by A1_1, entered as a
# categorical factor.
nationality_formula = 'avg_env_perception ~ C(A1_1)'
model = ols(formula=nationality_formula, data=data).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)
print(anova_results)


             sum_sq     df         F        PR(>F)
C(A1_1)   50.692994  109.0  3.073798  1.358012e-17
Residual  80.190332  530.0       NaN           NaN


In [90]:
## Nominal variable
## A1.2 - Region

# One-way ANOVA: average perception score explained by A1_2, entered as a
# categorical factor.
region_formula = 'avg_env_perception ~ C(A1_2)'
model = ols(formula=region_formula, data=data).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)
print(anova_results)


              sum_sq     df         F    PR(>F)
C(A1_2)     0.756548    3.0  1.232553  0.296961
Residual  130.126778  636.0       NaN       NaN


In [91]:
## Nominal variable
## A2 - Gender

# One-way ANOVA: average perception score explained by A2, entered as a
# categorical factor.
gender_formula = 'avg_env_perception ~ C(A2)'
model = ols(formula=gender_formula, data=data).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)
print(anova_results)


              sum_sq     df          F        PR(>F)
C(A2)       9.753581    1.0  51.372885  2.118717e-12
Residual  121.129746  638.0        NaN           NaN


In [92]:
## Ordinal variable
## A3 - Education level

# One-way ANOVA: average perception score explained by A3. C(...) enters the
# variable as an unordered categorical factor, so the ordering of the levels
# is not used by the model.
education_formula = 'avg_env_perception ~ C(A3)'
model = ols(formula=education_formula, data=data).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)
print(anova_results)


              sum_sq     df         F    PR(>F)
C(A3)       0.621204    4.0  0.757059  0.553519
Residual  130.262123  635.0       NaN       NaN


In [98]:
## Nominal variable
## A4 - Pet or garden at home

# One-way ANOVA: average perception score explained by A4, entered as a
# categorical factor.
pet_garden_formula = 'avg_env_perception ~ C(A4)'
model = ols(formula=pet_garden_formula, data=data).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)
print(anova_results)

              sum_sq     df         F    PR(>F)
C(A4)       4.140652    3.0  6.925989  0.000136
Residual  126.742674  636.0       NaN       NaN


In [94]:
## Ordinal variable
## A5 - Age
## TODO: might need to group ages together. C(A5) treats every distinct value
## of A5 as its own category.

# One-way ANOVA: average perception score explained by A5, entered as a
# categorical factor.
age_formula = 'avg_env_perception ~ C(A5)'
model = ols(formula=age_formula, data=data).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)
print(anova_results)


              sum_sq     df         F    PR(>F)
C(A5)       8.916782   38.0  1.156267  0.243394
Residual  121.966544  601.0       NaN       NaN


In [95]:
## Nominal variable
## A6 - Ethnicity

# One-way ANOVA: average perception score explained by A6, entered as a
# categorical factor.
ethnicity_formula = 'avg_env_perception ~ C(A6)'
model = ols(formula=ethnicity_formula, data=data).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)
print(anova_results)



              sum_sq     df        F    PR(>F)
C(A6)       7.467654    6.0  6.38361  0.000002
Residual  123.415672  633.0      NaN       NaN


In [96]:
## Nominal variable
## A7 - Marital status

# One-way ANOVA: average perception score explained by A7, entered as a
# categorical factor.
marital_formula = 'avg_env_perception ~ C(A7)'
model = ols(formula=marital_formula, data=data).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)
print(anova_results)

              sum_sq     df         F    PR(>F)
C(A7)       3.073790    4.0  3.817902  0.004469
Residual  127.809536  635.0       NaN       NaN


In [97]:
## Nominal variable
## A8 - Employment status

# One-way ANOVA: average perception score explained by A8, entered as a
# categorical factor.
employment_formula = 'avg_env_perception ~ C(A8)'
model = ols(formula=employment_formula, data=data).fit()

# Type II ANOVA table for the fitted model.
anova_results = sm.stats.anova_lm(model, typ=2)
print(anova_results)

              sum_sq     df         F    PR(>F)
C(A8)       1.702159    8.0  1.039298  0.404797
Residual  129.181167  631.0       NaN       NaN
