<a href="https://colab.research.google.com/github/HUANGYING063/1/blob/main/Untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install libraries
!pip install python-docx reportlab matplotlib seaborn --quiet

# Imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.pagesizes import A4
from reportlab.lib.enums import TA_LEFT
from docx import Document
from google.colab import files

# Every artefact produced below (CSV, PNGs, PDF, DOCX) is written into this
# folder, so create it up front; exist_ok makes re-running the cell harmless.
os.makedirs("output", exist_ok=True)

# Generate data
# Seed NumPy's global generator so the simulated panel is repeatable.
np.random.seed(42)

# (country, welfare-regime label) pairs that make up the panel.
countries = [
    ("United States", "Liberal"),
    ("United Kingdom", "Liberal"),
    ("Canada", "Liberal"),
    ("Germany", "Continental"),
    ("France", "Continental"),
    ("Sweden", "Nordic"),
    ("Norway", "Nordic"),
    ("Spain", "Southern"),
    ("Italy", "Southern"),
    ("Poland", "Eastern"),
    ("Japan", "Continental"),
    ("Australia", "Liberal"),
]
years = range(2019, 2025)


def _simulate_country(name, regime):
    """Return one record (dict) per entry of ``years`` for a single country.

    The order of the ``np.random.normal`` calls is significant: all draws come
    from the shared, seeded global generator, so reordering them would change
    every value that follows.
    """
    # Country-specific baselines, drawn once per country.
    inflation_base = np.random.normal(1.5, 0.7)
    unemployment_base = np.random.normal(5, 1.5)
    growth_base = np.random.normal(1.8, 1)
    approval_base = np.random.normal(45, 8)

    records = []
    for year in years:
        # 2021-2023 receive an additional inflation shock; other years none.
        if year in [2021, 2022, 2023]:
            shock = np.random.normal(3.5, 1.5)
        else:
            shock = 0

        # Inflation is floored at -1 and unemployment at 2.
        inflation = max(-1, inflation_base + shock + np.random.normal(0, 0.8))
        unemployment = max(2, unemployment_base + np.random.normal(0, 0.7))
        growth = growth_base + np.random.normal(0, 1.2)

        # Approval falls with inflation and unemployment, rises with growth,
        # plus noise; the result is clipped to the range [5, 95].
        approval_raw = (
            approval_base
            - 0.9 * inflation
            + 0.8 * growth
            - 0.3 * unemployment
            + np.random.normal(0, 3)
        )
        approval = np.clip(approval_raw, 5, 95)

        records.append(
            {
                "country": name,
                "year": year,
                "welfare_regime": regime,
                "inflation_cpi": round(inflation, 2),
                "unemployment_rate": round(unemployment, 2),
                "gdp_growth": round(growth, 2),
                "gov_approval_pct": round(approval, 2),
            }
        )
    return records


# Simulate the countries in list order and stack the records into one frame.
rows = []
for country_name, regime_label in countries:
    rows.extend(_simulate_country(country_name, regime_label))
df = pd.DataFrame(rows)

# Missing and outlier
# Deliberately degrade the data so the report has something to describe:
# blank out a random 3% of inflation values and 2% of approval values.
inflation_gaps = df.sample(frac=0.03).index
df.loc[inflation_gaps, "inflation_cpi"] = np.nan

approval_gaps = df.sample(frac=0.02).index
df.loc[approval_gaps, "gov_approval_pct"] = np.nan

# Plant a single extreme inflation reading in one randomly chosen row.
outlier_row = df.sample(1).index
df.loc[outlier_row, "inflation_cpi"] = 25.0

# Save CSV
# Write the panel (including the injected NaNs and outlier) without the index.
csv_path = "output/OECD_Data_2019_2024.csv"
df.to_csv(csv_path, index=False)

# === Generate plots ===
# Each figure is built on an explicit Figure/Axes pair, written to the output
# folder at 300 dpi, and closed immediately afterwards.

# Histogram of inflation with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["inflation_cpi"], kde=True, ax=ax)
ax.set_title("Distribution of Inflation")
fig.savefig("output/plot_inflation.png", dpi=300)
plt.close(fig)

# Box plot of inflation.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=df["inflation_cpi"], ax=ax)
ax.set_title("Boxplot of Inflation")
fig.savefig("output/plot_inflation_box.png", dpi=300)
plt.close(fig)

# Scatter plot of government approval against inflation.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x="inflation_cpi", y="gov_approval_pct", data=df, ax=ax)
ax.set_title("Inflation vs Government Approval")
fig.savefig("output/plot_scatter_inflation_approval.png", dpi=300)
plt.close(fig)

# Correlation heatmap
# Pairwise correlations between the four numeric indicators, annotated.
numeric_columns = ["inflation_cpi", "unemployment_rate", "gdp_growth", "gov_approval_pct"]
corr = df[numeric_columns].corr()
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
fig.savefig("output/correlation_heatmap.png", dpi=300)
plt.close(fig)

# === PDF Report ===
# Assemble the homework report as a list of ReportLab flowables and render it
# to an A4 PDF in the output folder.

# Preformatted keeps whitespace and line breaks; it is needed for the
# summary-statistics table below.
from reportlab.platypus import Preformatted

pdf_path = "output/Homework3_Report.pdf"
styles = getSampleStyleSheet()
styles.add(ParagraphStyle(name="Body", fontSize=11, leading=14, alignment=TA_LEFT))
doc = SimpleDocTemplate(pdf_path, pagesize=A4)

elements = []
elements.append(Paragraph("Homework 3 – Exploratory Data Analysis (EDA)", styles['Body']))
elements.append(Spacer(1, 10))
elements.append(Paragraph("Student: Ying Huang", styles['Body']))
elements.append(Spacer(1, 10))

elements.append(Paragraph("1. Dataset Description", styles['Body']))
elements.append(Paragraph("The dataset contains 72 OECD country-year observations (2019–2024). Variables include inflation, unemployment, GDP growth, government approval ratings, and welfare regime types.", styles['Body']))
elements.append(Spacer(1, 10))

elements.append(Paragraph("2. Summary Statistics", styles['Body']))
elements.append(Spacer(1, 5))
# A Paragraph collapses runs of whitespace and newlines, which would turn the
# column-aligned describe() output into one unreadable run of text. Render it
# as preformatted text in the stylesheet's monospaced 'Code' style instead.
elements.append(Preformatted(df.describe().to_string(), styles['Code']))
elements.append(Spacer(1, 10))

# The images are scaled with kind="proportional": each plot is fitted inside
# the given width/height box while keeping the aspect ratio it was saved with,
# instead of being stretched to exactly that box.
elements.append(Paragraph("3. Distribution of Key Variables", styles['Body']))
elements.append(Image("output/plot_inflation.png", width=350, height=250, kind="proportional"))
elements.append(Spacer(1, 10))

elements.append(Paragraph("4. Outlier Detection", styles['Body']))
elements.append(Image("output/plot_inflation_box.png", width=350, height=250, kind="proportional"))
elements.append(Spacer(1, 10))

elements.append(Paragraph("5. Relationship: Inflation vs Approval", styles['Body']))
elements.append(Image("output/plot_scatter_inflation_approval.png", width=350, height=250, kind="proportional"))
elements.append(Spacer(1, 10))

elements.append(Paragraph("6. Correlation Analysis", styles['Body']))
elements.append(Image("output/correlation_heatmap.png", width=350, height=300, kind="proportional"))
elements.append(Spacer(1, 10))

elements.append(Paragraph("7. Cleaning Plan", styles['Body']))
# One Paragraph per bullet: newlines inside a single Paragraph are treated as
# ordinary whitespace, so a multi-line string would render as one line.
cleaning_plan = [
    "• Impute missing values using country-level interpolation",
    "• Winsorize extreme inflation outliers",
    "• Convert welfare regime to dummy variables",
    "• Prepare for fixed-effects regression with clustered SE",
]
for bullet in cleaning_plan:
    elements.append(Paragraph(bullet, styles['Body']))

elements.append(Spacer(1, 10))
elements.append(Paragraph("8. Reflection", styles['Body']))
elements.append(Paragraph("The EDA supports the expectation that inflation correlates negatively with government approval, while GDP growth correlates positively. These findings motivate the next-step regression approach.", styles['Body']))

# Lay out all flowables and write the PDF to pdf_path.
doc.build(elements)

# === Presentation Script ===
# Write the spoken presentation as a Word document: a level-1 heading followed
# by one paragraph per entry of script_paragraphs, in order.
docx_path = "output/Presentation_Script.docx"
script_paragraphs = [
    "Good afternoon everyone,",
    "My project examines how inflation influences government approval across OECD countries from 2019 to 2024.",
    "The exploratory analysis shows clear trends: higher inflation is associated with lower approval, while GDP growth has a positive association.",
    "I used interpolation for missing values, identified outliers through boxplots, and ran correlation analysis to understand relationships.",
    "Next, I will apply a fixed-effects model to test these relationships more rigorously.",
    "Thank you.",
]

docx = Document()
docx.add_heading("Presentation Script", level=1)
for paragraph_text in script_paragraphs:
    docx.add_paragraph(paragraph_text)
docx.save(docx_path)

# Download files
# Hand each generated artefact to the Colab download helper, in the order
# CSV, PDF, DOCX.
for artefact_path in (csv_path, pdf_path, docx_path):
    files.download(artefact_path)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25h

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>