In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler

# Load the USEEIO dataset from 'useeio.csv' (path is relative to the working directory).
useeio_df = pd.read_csv('useeio.csv')

# Assuming the dataset has the following columns:
# 'commodity', 'emission_factor', 'reliability_score', 'technological_correlation',
# 'data_collection', 'geographical_correlation', 'temporal_correlation',
# 'emission_factor_with_margin'

# Display initial information about the dataset.
# NOTE: DataFrame.info() writes its summary to stdout itself and returns None,
# so it must not be wrapped in print() -- that would emit a stray "None" line.
print("USEEIO Dataset Info:")
useeio_df.info()

# 4.3.1 Correlation Analysis
# Calculate the Pearson correlation between the emission factor and each
# data quality score, and store the coefficients keyed by score column.
data_quality_columns = [
    'reliability_score',
    'technological_correlation',
    'data_collection',
    'geographical_correlation',
    'temporal_correlation',
]
correlations = {}

for column in data_quality_columns:
    # pearsonr has no missing-value handling: a NaN in either column would
    # yield a NaN coefficient (or an error, depending on the SciPy version).
    # Drop incomplete pairs per column so one gap does not discard the result.
    paired = useeio_df[['emission_factor', column]].dropna()
    if len(paired) < 2:
        # pearsonr requires at least two observations; record "undefined".
        correlations[column] = np.nan
        continue
    correlation, _ = pearsonr(paired['emission_factor'], paired[column])
    correlations[column] = correlation

# Plot the correlations as one bar per data quality score.
plt.figure(figsize=(10, 6))
sns.barplot(x=list(correlations.keys()), y=list(correlations.values()), palette='viridis')
plt.title('Correlation between Emission Factors and Data Quality Scores')
plt.xlabel('Data Quality Scores')
plt.ylabel('Pearson Correlation Coefficient')
plt.tight_layout()
# Save before show(): show() may leave an empty current figure behind.
plt.savefig('correlation_analysis.png')
plt.show()

# 4.3.2 Comparison between emission factors with and without margins
# Per-row difference: factor with margin minus factor without margin.
useeio_df['difference'] = useeio_df['emission_factor_with_margin'] - useeio_df['emission_factor']
margin_delta = useeio_df['difference']

# Summary statistics: number of rows whose difference is above, equal to,
# or below zero (rows with a missing difference fall into none of these).
positive_diff = int((margin_delta > 0).sum())
no_diff = int((margin_delta == 0).sum())
negative_diff = int((margin_delta < 0).sum())

print(f"Positive Difference: {positive_diff}")
print(f"No Difference: {no_diff}")
print(f"Negative Difference: {negative_diff}")

# Visualization: histogram of the differences with a KDE overlay.
plt.figure(figsize=(10, 6))
hist_ax = sns.histplot(margin_delta, bins=20, kde=True)
hist_ax.set_title('Distribution of Differences between Emission Factors with and without Margins')
hist_ax.set_xlabel('Difference in Emission Factors')
hist_ax.set_ylabel('Frequency')
hist_ax.figure.tight_layout()
hist_ax.figure.savefig('emission_factors_comparison.png')
plt.show()

# 4.3.3 Analysis of substance distribution
# Assuming we have columns 'co2_emissions', 'ch4_emissions', 'n2o_emissions', 'other_ghg_emissions'
substances = ['co2_emissions', 'ch4_emissions', 'n2o_emissions', 'other_ghg_emissions']

# Treat missing per-gas values as zero emissions (written back into useeio_df).
useeio_df[substances] = useeio_df[substances].fillna(0)

# Sum each greenhouse gas per commodity. The commodity-indexed frame feeds the
# plot; total_emissions is the same data with 'commodity' as a regular column.
emissions_by_commodity = useeio_df.groupby('commodity')[substances].sum()
total_emissions = emissions_by_commodity.reset_index()

# Visualization: one stacked bar per commodity, one segment per gas.
stack_ax = emissions_by_commodity.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 8),
    colormap='viridis',
)
stack_ax.set_title('Distribution of Greenhouse Gas Emissions across Different Commodities')
stack_ax.set_xlabel('Commodity')
stack_ax.set_ylabel('Total Emissions')
plt.xticks(rotation=90)
stack_ax.figure.tight_layout()
stack_ax.figure.savefig('substance_distribution.png')
plt.show()

# 4.3.4 Comparison between detail and summary categories
# Assuming we have a 'category' column indicating detail or summary
is_detail = useeio_df['category'] == 'detail'
is_summary = useeio_df['category'] == 'summary'
detailed_df = useeio_df.loc[is_detail]
summary_df = useeio_df.loc[is_summary]

# Column-wise means of the data quality scores plus the emission factor,
# computed separately for each category.
metric_columns = data_quality_columns + ['emission_factor']
detail_means = detailed_df[metric_columns].mean()
summary_means = summary_df[metric_columns].mean()

# Visualization: grouped bars, one group per metric, one bar per category.
mean_df = pd.DataFrame({'Detail': detail_means, 'Summary': summary_means})
means_ax = mean_df.plot(kind='bar', figsize=(10, 6), colormap='viridis')
means_ax.set_title('Comparison of Mean Emission Factors and Data Quality Scores')
means_ax.set_xlabel('Metrics')
means_ax.set_ylabel('Mean Values')
means_ax.figure.tight_layout()
means_ax.figure.savefig('detail_summary_comparison.png')
plt.show()

# 4.3.5 Outlier Detection based on Data Quality Scores
# Standardize each data quality score column; the absolute standardized
# value is then that row's |z-score| for the column.
scaler = StandardScaler()
scaled_data_quality = scaler.fit_transform(useeio_df[data_quality_columns])

# Detecting outliers using z-score method (threshold = 3):
# a row is an outlier if ANY of its data quality scores exceeds the threshold.
z_scores = np.abs(scaled_data_quality)
outliers = (z_scores > 3).any(axis=1)

# Split the rows into outliers and the remaining (normal) data.
outlier_df = useeio_df[outliers]
normal_df = useeio_df[~outliers]

# Visualization of outliers.
# Plot only the non-outlier rows under the 'Normal Data' label; plotting the
# whole frame there would draw every outlier twice and mislabel it as normal.
plt.figure(figsize=(10, 6))
plt.scatter(normal_df['reliability_score'], normal_df['emission_factor'], label='Normal Data', alpha=0.5)
plt.scatter(outlier_df['reliability_score'], outlier_df['emission_factor'], color='red', label='Outliers')
plt.title('Outliers based on Reliability Score and Emission Factors')
plt.xlabel('Reliability Score')
plt.ylabel('Emission Factor')
plt.legend()
plt.tight_layout()
plt.savefig('outlier_detection.png')
plt.show()
