<a href="https://colab.research.google.com/github/HUANGYING063/1/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Run this entire cell in Google Colab.
# It will create: output/OECD_Data_2019_2024.csv, output/Homework3_Report.pdf, output/Presentation_Script.docx
# and then download them to your computer.

# 1) Install required packages (if not present)
!pip install python-docx reportlab --quiet

# 2) Imports
import pandas as pd
import numpy as np
import os
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.enums import TA_LEFT
from docx import Document
from google.colab import files
import json

# 3) Prepare output folder
# All generated artifacts are written under ./output; exist_ok=True makes
# re-running the cell harmless when the folder is already there.
os.makedirs(name="output", exist_ok=True)

# 4) Generate simulated OECD dataset (12 countries × 2019–2024)
# Every value is drawn from NumPy's global random stream, so for a given seed
# the data depend on the exact order of the np.random.normal() calls below.
np.random.seed(42)
countries = [
    ("United States", "Liberal"),
    ("United Kingdom", "Liberal"),
    ("Canada", "Liberal"),
    ("Germany", "Continental"),
    ("France", "Continental"),
    ("Sweden", "Nordic"),
    ("Norway", "Nordic"),
    ("Spain", "Southern"),
    ("Italy", "Southern"),
    ("Poland", "Eastern"),
    ("Japan", "Continental"),
    ("Australia", "Liberal"),
]
years = range(2019, 2025)


def _simulate_country(name, welfare_type, sim_years):
    """Return one record (dict) per year in *sim_years* for a single country."""
    # Country-level baselines: drawn once, shared by all of the country's years.
    inf_level = np.random.normal(1.5, 0.7)
    unemp_level = np.random.normal(5, 1.5)
    gdp_level = np.random.normal(1.8, 1.0)
    appr_level = np.random.normal(45, 8)

    records = []
    for yr in sim_years:
        is_2020 = yr == 2020

        # An additional inflation shock is drawn only for 2021-2023; in the
        # other years no random number is consumed for it.
        if yr in (2021, 2022, 2023):
            price_shock = np.random.normal(3.5, 1.5)
        else:
            price_shock = 0
        inf_noise = np.random.normal(0, 0.8)
        inflation = max(-1.0, inf_level + price_shock + inf_noise)  # floor at -1

        unemp_noise = np.random.normal(0, 0.7)
        unemp_bump = 0.5 if is_2020 else 0
        unemp = max(2.0, unemp_level + unemp_noise + unemp_bump)  # floor at 2

        gdp_noise = np.random.normal(0, 1.2)
        gdp_dip = 0.5 if is_2020 else 0
        gdp_lift = 0.4 if yr in (2021, 2022) else 0
        gdp = gdp_level + gdp_noise - gdp_dip + gdp_lift

        # Approval falls with inflation and unemployment, rises with growth,
        # and is kept inside the 5-95 range.
        appr_noise = np.random.normal(0, 3)
        approval = np.clip(
            appr_level - 0.9 * inflation + 0.8 * gdp - 0.3 * unemp + appr_noise,
            5,
            95,
        )

        records.append(dict(
            country=name,
            year=yr,
            welfare_regime=welfare_type,
            inflation_cpi=round(inflation, 2),
            unemployment_rate=round(unemp, 2),
            gdp_growth=round(gdp, 2),
            gov_approval_pct=round(approval, 2),
        ))
    return records


# Countries are simulated strictly one after another, in list order.
rows = []
for country_name, regime_label in countries:
    rows.extend(_simulate_country(country_name, regime_label, years))
df = pd.DataFrame(rows)

# Introduce small missingness and one extreme outlier
# Each sample uses its own fixed random_state, so the affected rows are the
# same on every run and do not consume the global NumPy random stream.
missing_inflation_rows = df.sample(frac=0.03, random_state=1).index
df.loc[missing_inflation_rows, "inflation_cpi"] = np.nan

missing_approval_rows = df.sample(frac=0.02, random_state=2).index
df.loc[missing_approval_rows, "gov_approval_pct"] = np.nan

# A single row receives an implausibly high inflation value (25.0).
outlier_row = df.sample(n=1, random_state=3).index
df.loc[outlier_row, "inflation_cpi"] = 25.0

# 5) Save CSV
# The row index is omitted so the file contains only the data columns.
csv_path = "output/OECD_Data_2019_2024.csv"
df.to_csv(path_or_buf=csv_path, index=False)
print(f"Saved CSV: {csv_path}")

# 6) Create PDF report
# ReportLab parses Paragraph text as XML-like markup, so literal "<" and "&"
# (as in "(<3%)" and "Cleaning & modeling plan:") are escaped before being
# handed to Paragraph; the rendered text is unchanged.
from xml.sax.saxutils import escape

pdf_path = "output/Homework3_Report.pdf"
styles = getSampleStyleSheet()
styles.add(ParagraphStyle(name='Body', fontSize=11, leading=14, alignment=TA_LEFT))
doc = SimpleDocTemplate(
    pdf_path,
    pagesize=A4,
    rightMargin=40,
    leftMargin=40,
    topMargin=40,
    bottomMargin=40,
)

# Report content in page order: a string becomes a paragraph in the 'Body'
# style, an integer becomes a Spacer of that height.
report_layout = [
    "Homework 3 – Exploratory Data Analysis and Empirical Assumption Checks",
    8,
    "Student: Ying Huang",
    6,
    "Dataset: Simulated OECD country-year data (2019–2024), 72 observations.",
    6,
    "Main variables: inflation_cpi, unemployment_rate, gdp_growth, gov_approval_pct.",
    6,
    "Key EDA findings:",
    "- Small missingness (<3%) and one extreme inflation outlier (25).",
    "- Inflation variance increased notably during 2021–2023.",
    "- Correlations: inflation ↓ approval; GDP growth ↑ approval.",
    10,
    "Cleaning & modeling plan:",
    "1. Interpolate missing data within each country.",
    "2. Winsorize inflation outliers.",
    "3. Convert welfare_regime → dummy variables.",
    "4. Use fixed-effects MLR; cluster SE by country.",
    12,
    "Prepared by: Ying Huang",
]

elements = []
for item in report_layout:
    if isinstance(item, str):
        elements.append(Paragraph(escape(item), styles['Body']))
    else:
        elements.append(Spacer(1, item))
doc.build(elements)
print("Saved PDF:", pdf_path)

# 7) Create presentation script (DOCX)
# One level-1 heading followed by the spoken lines, one paragraph each.
docx_path = "output/Presentation_Script.docx"
docx = Document()
docx.add_heading("Presentation Script", level=1)
for spoken_line in (
    "Good afternoon everyone,",
    "My project analyzes how inflation influences government approval across OECD countries from 2019 to 2024.",
    "The exploratory analysis suggests a negative association between inflation and approval, while GDP growth shows a positive relationship.",
    "I will proceed with data cleaning, interpolation, outlier handling, and fixed-effects regression.",
):
    docx.add_paragraph(spoken_line)
docx.save(docx_path)
print(f"Saved DOCX: {docx_path}")

# 8) Show files
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the printed listing the same on every run.
print("\nFiles in output/:")
for filename in sorted(os.listdir("output")):
    print("-", filename)

# 9) Auto-download
# Hand each generated file to Colab's download helper, in the order the
# files were created (CSV, PDF, DOCX).
for generated_path in (csv_path, pdf_path, docx_path):
    files.download(generated_path)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m194.6/253.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/2.0 MB[0m [31m69.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hSaved CSV: output/OECD_Data_2019_2024.csv
Saved PDF: output/Homework3_Report.pdf
Saved DOCX: output/Presentation_Script.docx

Files in output/:
- Homework3_Report.pdf
- Presentation_Script.docx
- OECD_Data_2019_2024.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>