In [None]:
%pip install pandas
%pip install matplotlib
%pip install numpy
%pip install ast
%pip install logging
%pip install seaborn


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from scipy import stats
import numpy as np
from scipy.stats import pearsonr, spearmanr
import statsmodels.api as sm

In [None]:
# Load the dataset
df = pd.read_csv('../CMBabble_Master_clean.csv')

# Encode the categorical columns as integers; labels missing from a mapping become NaN.
df['Sex'] = df['Sex'].map({'M': 0, 'F': 1})
df['Treatment'] = df['Treatment'].map({'CONTROL': 0, 'CORT': 1, 'OIL': 2})
# Hatch dates that cannot be parsed become NaT instead of raising.
df['Hatch date'] = pd.to_datetime(df['Hatch date'], errors='coerce')
# 'Babbles' stores a Python literal as text. ast.literal_eval parses literals only,
# whereas eval would execute whatever expression the CSV cell happens to contain.
df['Babbles_len'] = df['Babbles'].apply(lambda x: len(ast.literal_eval(x)))

# Basic descriptive statistics
# print("Descriptive Statistics:")
# print(df[['Babbles_len', 'Treatment', 'No. eggs hatched from nest', 'No. birds fledged from nest', 'Age on bout day', 'Days prior to fledging', 'Hatch date', 'Fledge date','Fledge age', 'Date on vocalization', 'Date on vocalization 2']].describe())


# Histograms for numerical data
columns = ['Treatment', 'Sex', 'Babbles_len', 'No. eggs hatched from nest', 'Age on bout day']
axes = df[columns].hist(bins=20, figsize=(15, 12))
plt.suptitle('Distribution of Babbling Features')

# Set dynamic titles and axis labels.
# DataFrame.hist titles every subplot with its column name, so the column is read back
# from the axis rather than assumed from the grid order; an untitled axis is a grid cell
# that holds no column and is left untouched.
for ax in axes.flatten():
    col = ax.get_title()
    if not col:
        continue
    ax.set_title(f'Distribution of {col}')  # Set dynamic title
    ax.set_xlabel(col)                      # Set x-axis label
    ax.set_ylabel('Frequency')              # Bar height is the number of rows in the bin

# Show the plot
plt.show()

# Outlier check: spread of Babbles_len within each encoded sex
fig, ax = plt.subplots()
sns.boxplot(data=df, x='Sex', y='Babbles_len', ax=ax)
ax.set_title('Babbling Duration by Sex')
plt.show()

# Pairwise correlations between the encoded factors and Babbles_len
corr_columns = ['Treatment', 'Sex', 'Babbles_len']
corr = df[corr_columns].corr()
fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

___
### In Depth Exploratory Data Analysis

In [None]:
data = pd.read_csv('../CMBabble_Master_clean.csv')

# Parse the date columns in one pass; values that cannot be parsed become NaT
date_columns = ['Hatch date', 'Fledge date', 'Date on vocalization']
data[date_columns] = data[date_columns].apply(pd.to_datetime, errors='coerce')

# Extract statistics from 'Babbles' column
def process_babbles(babbles):
    """Summarise one raw 'Babbles' cell.

    Parameters
    ----------
    babbles : str
        Text of a Python literal holding a sequence of numbers, e.g. "[0.4, 1.2]".

    Returns
    -------
    dict
        Keys 'babble_count' (number of items), 'babble_mean' and 'babble_sum'.
        All three are 0 when the sequence is empty or when the cell cannot be
        parsed or summarised (missing value, malformed text, a literal that is
        not a sequence, or a sequence with non-numeric items).
    """
    try:
        babble_list = ast.literal_eval(babbles)  # Convert string to list
        count = len(babble_list)
        total = sum(babble_list)
    except (ValueError, SyntaxError, TypeError):
        # ValueError/SyntaxError: not a valid literal (including NaN cells).
        # TypeError: a valid literal that has no len() or cannot be summed.
        return {'babble_count': 0, 'babble_mean': 0, 'babble_sum': 0}

    return {
        'babble_count': count,
        'babble_mean': total / count if count else 0,
        'babble_sum': total,
    }

# Parse every 'Babbles' cell once, then copy each statistic into its own column
babbles_stats = data['Babbles'].apply(process_babbles)
stat_columns = {
    'Babble Length': 'babble_count',
    'Babble Mean': 'babble_mean',
    'Babble Sum': 'babble_sum',
}
for column_name, stat_key in stat_columns.items():
    data[column_name] = babbles_stats.apply(lambda stats, key=stat_key: stats[key])

# Scatter plot: Age on bout day vs. Babble Length, coloured by Sex and marked by Treatment
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=data,
    x='Age on bout day',
    y='Babble Length',
    hue='Sex',
    style='Treatment',
    ax=ax,
)
ax.set_title("Age on Bout Day vs. Total Babble Length")
ax.set_xlabel("Age on Bout Day")
ax.set_ylabel("Total Babble Length")
ax.legend(title="Legend", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# Aggregate data by Nest ID
def _first_mode(values):
    """Return the most frequent value of a Series, or None when it has no non-null value."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


nest_aggregated = data.groupby('Nest ID').agg({
    # 'sum' adds up the per-row babble counts; 'count' would only count the rows in each
    # nest, which is not the "total" that the plot below is labelled with.
    'Babble Length': 'sum',
    'Age on bout day': 'mean',
    'Fledge age': 'mean',
    # Guard on the mode result itself: a group is never empty, but its mode is empty
    # when every Sex value in the group is missing.
    'Sex': _first_mode,
}).reset_index()

# Plot total babbles per nest
plt.figure(figsize=(10, 6))
sns.barplot(x='Nest ID', y='Babble Length', data=nest_aggregated, palette="viridis")
plt.title("Total Babble Length per Nest")
plt.xlabel("Nest ID")
plt.ylabel("Total Babble Length")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Grouping by Treatment and Nest ID
grouped_data = data.groupby(['Treatment', 'Nest ID']).agg({
    # 'sum' gives the total babble count of each nest; 'count' would only count its rows.
    'Babble Length': 'sum',
    'Age on bout day': 'mean',
    'Days prior to fledging': 'mean',
}).reset_index()

# Bar Plot: per-nest total babble count by Treatment.
# grouped_data has one row per (Treatment, Nest ID), so each bar is seaborn's default
# estimate (the mean) of the per-nest totals within that treatment.
plt.figure(figsize=(10, 6))
sns.barplot(x='Treatment', y='Babble Length', data=grouped_data, palette="viridis")
plt.title("Total Babble Sum by Treatment")
plt.xlabel("Treatment")
plt.ylabel("Total Babble Length")
plt.tight_layout()
plt.show()



___
### Hypothesis: Number of eggs hatched vs total babbles in the nest
-   **Null Hypothesis (H₀):** The number of eggs hatched in a nest does not influence the total babbles in the nest.
-   **Alternative Hypothesis (H₁):** The number of eggs hatched in a nest positively influences the total babbles in the nest.

In [None]:
# Aggregate data by Nest ID (one row per nest)
nest_data = data.groupby('Nest ID').agg({
    'Babble Sum': 'sum',
    # Total number of babbles in the nest: add up the per-row counts. 'count' would
    # return the number of rows per nest instead of the total named in the hypothesis.
    'Babble Length': 'sum',
    # Nest-level attributes: the first value seen for the nest is taken.
    'No. eggs hatched from nest': 'first',
    'No. birds fledged from nest': 'first'
}).reset_index()


# Visualize the Relationship
# Scatter plot: one point per nest, eggs hatched vs. the nest's 'Babble Length' aggregate
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=nest_data,
    x='No. eggs hatched from nest',
    y='Babble Length',
    hue='Nest ID',
    palette='tab20',
    s=100,
    ax=ax,
)
ax.set_title("Effect of Number of Eggs Hatched on Total Babble Activity by Nest ID")
ax.set_xlabel("Number of Eggs Hatched from Nest")
ax.set_ylabel("Total Babble Length")
ax.legend(title="Nest ID", bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()

# Box plot: spread of the nests' 'Babble Length' within each eggs-hatched category
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=nest_data,
    x='No. eggs hatched from nest',
    y='Babble Length',
    palette="coolwarm",
    ax=ax,
)
ax.set_title("Babble Length Distribution by Number of Eggs Hatched")
ax.set_xlabel("Number of Eggs Hatched from Nest")
ax.set_ylabel("Total Babble Length")
plt.show()

# Perform Statistical Testing
# Use Pearson and/or Spearman correlation to check the relationship between No. eggs hatched from nest and Babble Count
# pearson_corr, pearson_p_value = pearsonr(nest_data['No. eggs hatched from nest'], nest_data['Babble Length'])
# spearman_corr, spearman_p_value = spearmanr(nest_data['No. eggs hatched from nest'], nest_data['Babble Length'])

# print(f"Pearson Correlation: {pearson_corr}, P-value: {pearson_p_value}")
# print(f"Spearman Correlation: {spearman_corr}, P-value: {spearman_p_value}")

# Fit a regression model to quantify the relationship
# X = nest_data['No. eggs hatched from nest']
# y = nest_data['Babble Length']
# X = sm.add_constant(X) 
# model = sm.OLS(y, X).fit()
# print(model.summary())

# # Control for Potential Confounding Variables
# X_full = nest_data[['No. eggs hatched from nest', 'No. birds fledged from nest']]
# X_full = sm.add_constant(X_full)
# model_full = sm.OLS(y, X_full).fit()
# print(model_full.summary())


### Conclusion

-   **R-squared = 0.037**: This means that **No. eggs hatched from nest** explains only 3.7% of the variability in **Babble Sum**. This is a very weak fit: the remaining 96.3% of the variability is left unexplained and may be driven by other factors (like treatment, sex, etc.).

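-   **Coefficient for No. eggs hatched from nest = 14.5548**: The fitted model estimates that each additional egg hatched in the nest is associated with an average increase of **14.5548 units** in **Babble Length**. Given the p-value reported below, this estimate is not statistically distinguishable from zero.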

-   **P-value for No. eggs hatched from nest = 0.551**: The p-value is **much greater** than the 0.05 significance level, so we fail to reject the Null hypothesis. A p-value of 0.551 indicates that any observed relationship between the number of eggs hatched and babble sum could easily have occurred by random chance, so the effect of the number of eggs hatched is not statistically significant. Failing to reject H₀ does not prove that there is no effect; the data simply do not provide evidence for one.


___
### Hypothesis: Effect of Treatment and Sex on Babble 
-   **Null Hypothesis (H₀):** There is no significant effect of Sex and Treatment on Babble activity (Babble Length).
-   **Alternative Hypothesis (H₁):** Sex and/or Treatment have a significant effect on Babble activity (Babble Length).


-   **p-value for Sex = 0.01**: This means that **Sex significantly affects Babble_Count**. For example, male and female parrots might exhibit different babbling behaviors.
-   **p-value for Treatment = 0.25**: This indicates that **Treatment does not significantly influence Babble_Count**, suggesting the treatment type has no clear impact on the babbling behavior.
-   **p-value for Interaction = 0.03**: This suggests that the **interaction between Sex and Treatment is significant**, meaning the effect of `Sex` on babbling might depend on the type of treatment.

___

### Hypothesis: Difference in Babble Behavior by Sex (alone):
-   **Null Hypothesis (H₀):** There is no difference in Babble Length between male and female parrots.
-   **Alternative Hypothesis (H₁):** There is a significant difference in Babble Length between male and female parrots.

In [None]:
# Aggregate data by Sex (one row per sex) for the totals plot
sex_data = data.groupby('Sex').agg({
    'Babble Sum': 'sum',
    # Total babble count per sex; 'count' would return the number of rows per sex.
    'Babble Length': 'sum',
}).reset_index()

# Numeric encoding of Sex, required by the correlation and regression calls below
sex_mapping = {'F': 0, 'M': 1}
sex_data['Sex_Manual'] = sex_data['Sex'].map(sex_mapping)

# Row-level frame for the distribution plot and the statistical tests.
# sex_data holds a single row per sex, so a correlation or regression fitted on it
# would rest on at most two points; the tests therefore use every row of `data`
# whose Sex is covered by sex_mapping.
sex_rows = data[['Sex', 'Babble Length']].copy()
sex_rows['Sex_Manual'] = sex_rows['Sex'].map(sex_mapping)
sex_rows = sex_rows.dropna(subset=['Sex_Manual', 'Babble Length'])

# Visualize the Relationship
# Scatter Plot: total Babble Length for each sex.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Sex', y='Babble Length', hue='Sex', data=sex_data, palette='tab20', s=100)
plt.title("Effect of Sex on Total Babble Activity")
plt.xlabel("Sex")
plt.ylabel("Total Babble Length")
plt.legend(title="Sex ID", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

# Boxplot: distribution of per-row Babble Length within each sex.
plt.figure(figsize=(8, 6))
sns.boxplot(x='Sex', y='Babble Length', data=sex_rows, palette="coolwarm")
plt.title("Babble Length Distribution by Sex")
plt.xlabel("Sex")
plt.ylabel("Total Babble Length")
plt.show()

# Perform Statistical Testing
# Pearson and Spearman correlation between the encoded sex and Babble Length.
# The encoded column is used because both functions need numeric input.
pearson_corr, pearson_p_value = pearsonr(sex_rows['Sex_Manual'], sex_rows['Babble Length'])
spearman_corr, spearman_p_value = spearmanr(sex_rows['Sex_Manual'], sex_rows['Babble Length'])

print(f"Pearson Correlation: {pearson_corr}, P-value: {pearson_p_value}")
print(f"Spearman Correlation: {spearman_corr}, P-value: {spearman_p_value}")

# Fit a regression model to quantify the relationship
X = sex_rows['Sex_Manual']
y = sex_rows['Babble Length']  # the frame has no 'Babble Count' column
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

# Control for Potential Confounding Variables
# NOTE(review): no additional covariate is included yet, so this fit has the same
# predictors as `model`; add the confounders to the column list below.
X_full = sex_rows[['Sex_Manual']]
X_full = sm.add_constant(X_full)
model_full = sm.OLS(y, X_full).fit()
print(model_full.summary())


___
### Hypothesis: Difference in Babble Behavior by Treatment (alone):
-   **Null Hypothesis (H₀):** There is no difference in Babble Length between parrots under different treatments.
-   **Alternative Hypothesis (H₁):** There is a significant difference in Babble Length between parrots under different treatments.