In [2]:
import pandas as pd
import numpy as np
import math

# Load the data
df = pd.read_csv('dea_data.csv') 

In [3]:
# Separate control and treatment groups

control_cols = [col for col in df.columns if'Control' in col]
treatment_cols = [col for col in df.columns if'Treatment' in col]

In [4]:
# Calculate mean expression levels

df['Control Mean'] = df[control_cols].mean(axis=1)
df['Treatment Mean'] = df[treatment_cols].mean(axis=1)

In [5]:
# Compute log2 fold change
# Add a small constant to avoid log(0) issues

epsilon = 1e-10
df['log2FC'] = np.log2((df['Treatment Mean'] + epsilon) / (df['Control Mean'] + epsilon))

In [6]:
# Deterine status based on trushold

df['Status'] = 'Not Significant'
df.loc[df['log2FC'] >= 1, 'Status'] = 'Upregulated'
df.loc[df['log2FC'] <= -1, 'Status'] = 'Downregulated'

In [7]:
# Create the final output DataFrame with required columns

result_df = df[['Gene', 'Control Mean', 'Treatment Mean', 'log2FC', 'Status']]

In [8]:
# Display all results

print("Complete Differential Expression Analysis Results:")
print("=" * 100)
print(f"{'Gene':<10} {'Control Mean':<15} {'Treatment Mean':<15} {'Log2FC':<10} {'Status':<20}")
print("-" * 100)

for _, row in result_df.iterrows():
    print(f"{row['Gene']:<10} {row['Control Mean']:<15.2f} {row['Treatment Mean']:<15.2f} {row['log2FC']:<10.2f} {row['Status']:<20}")

Complete Differential Expression Analysis Results:
Gene       Control Mean    Treatment Mean  Log2FC     Status              
----------------------------------------------------------------------------------------------------
Gene1      53.52           54.88           0.04       Not Significant     
Gene2      47.08           63.28           0.43       Not Significant     
Gene3      45.16           47.92           0.09       Not Significant     
Gene4      51.44           53.56           0.06       Not Significant     
Gene5      48.04           53.24           0.15       Not Significant     
Gene6      51.12           45.72           -0.16      Not Significant     
Gene7      48.36           45.00           -0.10      Not Significant     
Gene8      58.00           41.80           -0.47      Not Significant     
Gene9      57.80           50.56           -0.19      Not Significant     
Gene10     48.20           60.92           0.34       Not Significant     
Gene11     46.88       

In [9]:
# Summary statistics

print("\n" + "=" * 100)
print("SUMMARY STATISTICS:")
print(f"Total genes analyzed: {len(result_df)}")
print(f"Upregulated genes (Log2FC ≥ 1): {sum(result_df['Status'] == 'Upregulated')}")
print(f"Downregulated genes (Log2FC ≤ -1): {sum(result_df['Status'] == 'Downregulated')}")
print(f"Not significant genes: {sum(result_df['Status'] == 'Not significant')}")


SUMMARY STATISTICS:
Total genes analyzed: 100
Upregulated genes (Log2FC ≥ 1): 0
Downregulated genes (Log2FC ≤ -1): 0
Not significant genes: 0


In [10]:
# Show top differentially expressed genes

print("\n" + "=" * 100)
print("TOP 10 UPREGULATED GENES:")
top_upregulated = result_df[result_df['Status'] == 'Upregulated'].sort_values('log2FC', ascending=False).head(10)
print(top_upregulated.to_string(index=False))


TOP 10 UPREGULATED GENES:
Empty DataFrame
Columns: [Gene, Control Mean, Treatment Mean, log2FC, Status]
Index: []


In [11]:
print("\n" + "=" * 100)
print("TOP 10 DOWNREGULATED GENES:")
top_downregulated = result_df[result_df['Status'] == 'Downregulated'].sort_values('log2FC', ascending=True).head(10)
print(top_downregulated.to_string(index=False))


TOP 10 DOWNREGULATED GENES:
Empty DataFrame
Columns: [Gene, Control Mean, Treatment Mean, log2FC, Status]
Index: []


In [12]:
# Export to CSV for further analysis

result_df.to_csv('differential_expression_results.csv', index=False)
print("\nResults have been saved to 'differential_expression_results.csv'")


Results have been saved to 'differential_expression_results.csv'
