In [1]:
import pandas as pd
import os

# =============================
# PATH CONFIG
# =============================
PIPELINE_FILES = {
    "Raw Zone": {
        "path": r"C:\Users\USER\Desktop\BI-DM Project\data\Raw_zone\raw_data.csv",
        "required_columns": []
    },
    "Staging Zone": {
        "path": r"C:\Users\USER\Desktop\BI-DM Project\data\Staging_zone\staging_data.csv",
        "required_columns": ["country_name", "year", "value"]
    },
    "Cleansing Zone": {
        "path": r"C:\Users\USER\Desktop\BI-DM Project\data\Cleansing_zone\clean_data.csv",
        "required_columns": ["country", "year", "co2"]
    },
    "Presentation Zone": {
        "path": r"C:\Users\USER\Desktop\BI-DM Project\data\presentation_zone\presentation.csv",
        "required_columns": ["country", "year", "total_co2"]
    },
    "Prediction Zone": {
        "path": r"C:\Users\USER\Desktop\BI-DM Project\data\Prediction_zone\test_data.csv",
        "required_columns": ["country", "year", "co2"]
    }
}

# =============================
# TEST FUNCTION
# =============================
def test_pipeline_zone(zone_name, config):
    print(f"\nüîç Testing {zone_name}")
    print("-" * 40)

    path = config["path"]
    required_columns = config["required_columns"]

    # 1Ô∏è‚É£ Check file exists
    if not os.path.exists(path):
        print("‚ùå FAIL: File not found")
        return False

    print("‚úÖ File exists")

    # 2Ô∏è‚É£ Read file
    try:
        df = pd.read_csv(path)
        print("‚úÖ File loaded successfully")
    except Exception as e:
        print("‚ùå FAIL: Cannot read file:", e)
        return False

    # 3Ô∏è‚É£ Check rows
    if len(df) == 0:
        print("‚ùå FAIL: No data rows")
        return False
    else:
        print(f"‚úÖ Rows count: {len(df)}")

    # 4Ô∏è‚É£ Check required columns
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        print("‚ùå FAIL: Missing columns:", missing_cols)
        return False
    else:
        if required_columns:
            print("‚úÖ Required columns OK")

    print("üéâ RESULT: PASS")
    return True


# =============================
# RUN ALL TESTS
# =============================
print("\nüöÄ DATA PIPELINE TEST REPORT")
print("=" * 50)

passed = 0
failed = 0

for zone, config in PIPELINE_FILES.items():
    result = test_pipeline_zone(zone, config)
    if result:
        passed += 1
    else:
        failed += 1

# =============================
# SUMMARY
# =============================
print("\nüìä TEST SUMMARY")
print("=" * 50)
print(f"‚úÖ Passed Zones : {passed}")
print(f"‚ùå Failed Zones : {failed}")
print(f"üì¶ Total Zones  : {passed + failed}")

if failed == 0:
    print("\nüéâ ALL DATA PIPELINE TESTS PASSED")
else:
    print("\n‚ö†Ô∏è SOME PIPELINE TESTS FAILED ‚Äî CHECK LOG ABOVE")



üöÄ DATA PIPELINE TEST REPORT

üîç Testing Raw Zone
----------------------------------------
‚úÖ File exists
‚úÖ File loaded successfully
‚úÖ Rows count: 13953
üéâ RESULT: PASS

üîç Testing Staging Zone
----------------------------------------
‚úÖ File exists
‚úÖ File loaded successfully
‚úÖ Rows count: 13953
‚úÖ Required columns OK
üéâ RESULT: PASS

üîç Testing Cleansing Zone
----------------------------------------
‚úÖ File exists
‚úÖ File loaded successfully
‚úÖ Rows count: 13953
‚úÖ Required columns OK
üéâ RESULT: PASS

üîç Testing Presentation Zone
----------------------------------------
‚úÖ File exists
‚úÖ File loaded successfully
‚úÖ Rows count: 13953
‚úÖ Required columns OK
üéâ RESULT: PASS

üîç Testing Prediction Zone
----------------------------------------
‚úÖ File exists
‚úÖ File loaded successfully
‚úÖ Rows count: 2804
‚úÖ Required columns OK
üéâ RESULT: PASS

üìä TEST SUMMARY
‚úÖ Passed Zones : 5
‚ùå Failed Zones : 0
üì¶ Total Zones  : 5

üéâ ALL DATA PIPE

In [9]:
# prediction zone (multi-country)
import pandas as pd
from sklearn.model_selection import train_test_split

# -----------------------------
# 1. Load clean data
# -----------------------------
df = pd.read_csv(
    r"C:\Users\USER\Desktop\BI-DM Project\data\Cleansing_zone\clean_data.csv"
)

# ‡πÉ‡∏ä‡πâ‡πÄ‡∏â‡∏û‡∏≤‡∏∞ field ‡∏ó‡∏µ‡πà‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô
df = df[["country", "year", "co2"]]

train_list = []
test_list = []

# -----------------------------
# 2. Train / Test split per country
# -----------------------------
for country, group in df.groupby("country"):
    # ‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏°‡∏≤‡∏Å‡∏û‡∏≠
    if len(group) < 5:
        continue

    X = group[["year"]]
    y = group["co2"]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42
    )

    train_df = X_train.assign(
        country=country,
        co2=y_train
    )

    test_df = X_test.assign(
        country=country,
        co2=y_test
    )

    train_list.append(train_df)
    test_list.append(test_df)

# -----------------------------
# 3. Combine all countries
# -----------------------------
train_data = pd.concat(train_list).reset_index(drop=True)
test_data = pd.concat(test_list).reset_index(drop=True)

# -----------------------------
# 4. Save to Prediction Zone
# -----------------------------
train_data.to_csv(
    r"C:\Users\USER\Desktop\BI-DM Project\data\Prediction_zone\train_data.csv",
    index=False
)

test_data.to_csv(
    r"C:\Users\USER\Desktop\BI-DM Project\data\Prediction_zone\test_data.csv",
    index=False
)

print("‚úÖ Prediction data prepared for multiple countries")
print(f"üìä Train rows: {len(train_data)}")
print(f"üß™ Test rows: {len(test_data)}")


‚úÖ Prediction data prepared for multiple countries
üìä Train rows: 11146
üß™ Test rows: 2804


In [7]:
#presentation zone
import pandas as pd

df = pd.read_csv(
    r"C:\Users\USER\Desktop\BI-DM Project\data\Cleansing_zone\clean_data.csv"
)

# ‡∏™‡∏£‡∏∏‡∏õ CO‚ÇÇ ‡∏ï‡πà‡∏≠‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡∏ï‡πà‡∏≠‡∏õ‡∏µ
presentation_df = (
    df.groupby(["country", "year"])["co2"]
    .sum()
    .reset_index()
)

presentation_df.to_csv(
    r"C:\Users\USER\Desktop\BI-DM Project\data\presentation_zone\presentation.csv",
    index=False
)

print(presentation_df.head())



       country  year      co2
0  Afghanistan  1960  414.371
1  Afghanistan  1961  491.378
2  Afghanistan  1962  689.396
3  Afghanistan  1963  707.731
4  Afghanistan  1964  839.743


In [6]:
#cleansing zone
import pandas as pd

# ‡∏≠‡πà‡∏≤‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏à‡∏≤‡∏Å Staging Zone
df = pd.read_csv(
    r"C:\Users\USER\Desktop\BI-DM Project\data\Staging_zone\staging_data.csv"
)

# ‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡∏ä‡∏∑‡πà‡∏≠ column ‡πÉ‡∏´‡πâ‡πÄ‡∏Ç‡πâ‡∏≤‡πÉ‡∏à‡∏á‡πà‡∏≤‡∏¢
df = df.rename(columns={
    "country_name": "country",
    "value": "co2"
})

# ‡∏•‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ã‡πâ‡∏≥
df = df.drop_duplicates()

# ‡∏•‡∏ö‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏°‡∏µ missing
df = df.dropna()

# ‡πÅ‡∏õ‡∏•‡∏á‡∏ä‡∏ô‡∏¥‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•
df["year"] = df["year"].astype(int)
df["co2"] = df["co2"].astype(float)

# ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å Cleansing Zone
df.to_csv(
    r"C:\Users\USER\Desktop\BI-DM Project\data\Cleansing_zone\clean_data.csv",
    index=False
)

print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13953 entries, 0 to 13952
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   country  13953 non-null  object 
 1   year     13953 non-null  int64  
 2   co2      13953 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 327.2+ KB
None


In [5]:
# Staging Zone
import pandas as pd

# ‡∏≠‡πà‡∏≤‡∏ô‡πÑ‡∏ü‡∏•‡πå raw ‡∏à‡∏≤‡∏Å‡πÄ‡∏Ñ‡∏£‡∏∑‡πà‡∏≠‡∏á
raw_csv = pd.read_csv(
    r"C:\Users\USER\Desktop\BI-DM Project\data\Raw_zone\raw_data.csv"
)

# ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÄ‡∏â‡∏û‡∏≤‡∏∞ column ‡∏ó‡∏µ‡πà‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô
cols = ["country_name", "year", "value"]
csv_df = raw_csv[cols]

# ‡∏£‡∏ß‡∏°‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• (‡∏ï‡∏≠‡∏ô‡∏ô‡∏µ‡πâ‡∏°‡∏µ‡πÑ‡∏ü‡∏•‡πå‡πÄ‡∏î‡∏µ‡∏¢‡∏ß)
staging_df = pd.concat([csv_df], ignore_index=True)

# ‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å‡πÑ‡∏õ Staging Zone
staging_df.to_csv(
    r"C:\Users\USER\Desktop\BI-DM Project\data\Staging_zone\staging_data.csv",
    index=False
)

print(staging_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13953 entries, 0 to 13952
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country_name  13953 non-null  object 
 1   year          13953 non-null  int64  
 2   value         13953 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 327.2+ KB
None


In [None]:
#TEST
import pandas as pd

# ‡∏≠‡πà‡∏≤‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏î‡∏¥‡∏ö‡∏à‡∏≤‡∏Å Raw Zone
raw_csv = pd.read_csv(r"C:\Users\USER\Desktop\BI-DM Project\data\Raw_zone\raw_data.csv")


print(raw_csv.head())


  country_code country_name  year      value
0          ABW        Aruba  1960  11092.675
1          ABW        Aruba  1961  11576.719
2          ABW        Aruba  1962  12713.489
3          ABW        Aruba  1963  12178.107
4          ABW        Aruba  1964  11840.743
