In [None]:
# 📦 Step 1: Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

sns.set(style="whitegrid")
%matplotlib inline

# 📁 Step 1.5: Create reports folder
os.makedirs("reports", exist_ok=True)

# 📂 Step 2: Load dataset
data_path = Path("synthetic_transactions.csv")
df = pd.read_csv(data_path)

# 🧼 Step 3: Basic cleanup
df["event_time"] = pd.to_datetime(df["event_time"], errors="coerce")
df["amount"] = pd.to_numeric(df["amount"], errors="coerce").fillna(0)
df["is_fraud"] = df["is_fraud"].astype(int)

# 📥 Step 3.5: Save data summary
# Count nulls once and keep only the columns that actually have gaps.
null_counts = df.isnull().sum()
null_counts = null_counts[null_counts > 0]
# to_string() leaves out the "dtype: ..." footer that the Series repr would
# otherwise put into the report; an empty result is written as "None".
missing_text = "None" if null_counts.empty else null_counts.to_string()
with open("reports/data_summary.txt", "w", encoding="utf-8") as f:
    f.write("📊 Data Summary\n")
    f.write(f"Total Records: {len(df)}\n")
    f.write(f"Fraud Rate: {df['is_fraud'].mean():.2%}\n")
    f.write(f"Missing Values:\n{missing_text}\n")

# 📊 Step 4: Data overview
print("Shape:", df.shape)
display(df.head())
display(df.describe())
# df.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" line to the cell output.
df.info()

# 🔍 Step 5: Missing values
# `missing` (columns with at least one null) is reused by the final summary step.
missing = df.isnull().sum()
missing = missing[missing > 0]
fig, ax = plt.subplots(figsize=(10, 4))
if missing.empty:
    # An empty Series cannot be drawn as a bar chart; emit a placeholder
    # figure instead so the report still gets its PNG.
    ax.text(0.5, 0.5, "No missing values", ha="center", va="center",
            transform=ax.transAxes)
    ax.set_title("Missing Values by Column")
else:
    missing.plot(kind="bar", ax=ax, title="Missing Values by Column")
ax.set_ylabel("Count")
fig.savefig("reports/missing_values.png", bbox_inches="tight")
with open("reports/missing_values_desc.txt", "w", encoding="utf-8") as f:
    f.write("Missing Values by Column\n\nShows which columns have missing data and how much. Useful for data quality checks.")
plt.close(fig)

# ⚖️ Step 6: Fraud distribution
# `fraud_rate` is reused by the final summary step.
fraud_rate = df["is_fraud"].mean()
# value_counts() orders classes by frequency, so the bar order is data
# dependent and a class may be absent altogether. Pin the order to [0, 1]
# so the fixed tick labels below always match their bars.
# NOTE(review): assumes is_fraud only holds 0/1 - confirm against the data.
class_share = (
    df["is_fraud"]
    .value_counts(normalize=True)
    .reindex([0, 1], fill_value=0.0)
)
fig, ax = plt.subplots()
class_share.plot(kind="bar", ax=ax)
ax.set_title(f"Fraud vs Non-Fraud (Rate: {fraud_rate:.2%})")
ax.set_xticklabels(["Non-Fraud", "Fraud"], rotation=0)
ax.set_ylabel("Proportion")
fig.savefig("reports/fraud_distribution.png", bbox_inches="tight")
with open("reports/fraud_distribution_desc.txt", "w", encoding="utf-8") as f:
    f.write("Fraud vs Non-Fraud Distribution\n\nShows the proportion of fraudulent vs non-fraudulent transactions. Useful to understand class imbalance.")
plt.close(fig)

# 🕒 Step 7: Time-based patterns
# Derive calendar features from the parsed timestamp.
event_dt = df["event_time"].dt
df["hour"] = event_dt.hour
df["dayofweek"] = event_dt.dayofweek
# dayofweek values 5 and 6 are flagged as weekend (1), everything else as 0.
df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)

# Number of transactions per hour of day, ordered by hour.
hourly_counts = df["hour"].value_counts().sort_index()
fig, ax = plt.subplots()
hourly_counts.plot(kind="line", ax=ax, title="Transactions by Hour", figsize=(10, 4))
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Transaction Count")
fig.savefig("reports/transactions_by_hour.png", bbox_inches="tight")
Path("reports/transactions_by_hour_desc.txt").write_text(
    "Transactions by Hour\n\nShows transaction volume by hour of day. Useful for identifying peak fraud windows.",
    encoding="utf-8",
)
plt.close(fig)

# 💸 Step 8: Amount analysis
# The cleanup step replaces missing/unparseable amounts with 0, and zero (or
# negative) values have no position on a log axis, so restrict the log-scaled
# histogram to strictly positive amounts.
positive_amounts = df.loc[df["amount"] > 0, "amount"]
fig, ax = plt.subplots()
sns.histplot(positive_amounts, bins=50, log_scale=True, ax=ax)
ax.set_title("Transaction Amount Distribution")
fig.savefig("reports/amount_distribution.png", bbox_inches="tight")
with open("reports/amount_distribution_desc.txt", "w", encoding="utf-8") as f:
    f.write("Transaction Amount Distribution\n\nShows how transaction amounts are distributed. Log scale helps visualize skewed data.")
plt.close(fig)

# Box plot of the raw amounts, split by fraud label.
fig, ax = plt.subplots()
sns.boxplot(x="is_fraud", y="amount", data=df, ax=ax)
ax.set_title("Amount by Fraud Status")
fig.savefig("reports/amount_by_fraud.png", bbox_inches="tight")
with open("reports/amount_by_fraud_desc.txt", "w", encoding="utf-8") as f:
    f.write("Amount by Fraud Status\n\nCompares transaction amounts between fraud and non-fraud cases.")
plt.close(fig)

# 🧠 Step 9: Behavioral features
for feature in ["typing_speed", "nav_speed", "geo_distance_km"]:
    # Prefer an exact column match; otherwise fall back to the first column
    # whose name contains the feature name (the dataset carries prefixed /
    # suffixed variants such as "behavior_typing_speed_cps").
    if feature in df.columns:
        col = feature
    else:
        col = next((c for c in df.columns if feature in str(c)), None)
    if col is None:
        # Make the skip visible instead of silently producing no chart.
        print(f"Skipping '{feature}': no matching column in dataset.")
        continue
    fig, ax = plt.subplots()
    sns.boxplot(x="is_fraud", y=col, data=df, ax=ax)
    ax.set_title(f"{col} by Fraud Status")
    fig.savefig(f"reports/{col}_by_fraud.png", bbox_inches="tight")
    with open(f"reports/{col}_by_fraud_desc.txt", "w", encoding="utf-8") as f:
        f.write(f"{col} by Fraud Status\n\nShows how {col.replace('_', ' ')} differs between fraud and non-fraud transactions.")
    plt.close(fig)

# 🚀 Step 10: Velocity & device features
# One histogram (30 bins) per listed column; columns absent from the
# dataset are skipped.
for col in ["velocity_1h", "velocity_24h", "unique_devices_30d"]:
    if col not in df.columns:
        continue
    readable_name = col.replace("_", " ")
    fig, ax = plt.subplots()
    sns.histplot(df[col], bins=30, ax=ax)
    ax.set_title(f"{col} Distribution")
    fig.savefig(f"reports/{col}_distribution.png", bbox_inches="tight")
    Path(f"reports/{col}_distribution_desc.txt").write_text(
        f"{col} Distribution\n\nShows how {readable_name} varies across transactions.",
        encoding="utf-8",
    )
    plt.close(fig)

# 🔍 Step 11: Amount anomaly features
# Only drawn when the dataset provides the "amount_over_user_avg" column.
has_user_avg_ratio = "amount_over_user_avg" in df.columns
if has_user_avg_ratio:
    fig, ax = plt.subplots()
    sns.boxplot(data=df, x="is_fraud", y="amount_over_user_avg", ax=ax)
    ax.set_title("Amount Over User Avg by Fraud Status")
    fig.savefig("reports/amount_over_user_avg_by_fraud.png", bbox_inches="tight")
    Path("reports/amount_over_user_avg_by_fraud_desc.txt").write_text(
        "Amount Over User Avg by Fraud Status\n\nShows how far a transaction deviates from the user's average amount.",
        encoding="utf-8",
    )
    plt.close(fig)

# 🔗 Step 12: Correlation heatmap
# Numeric columns only, without the identifier columns listed below
# (errors="ignore" tolerates identifiers that are absent or non-numeric).
numeric_cols = df.select_dtypes(include=[np.number]).drop(columns=["user_id", "device_id"], errors="ignore")
# A column with a single distinct value has zero variance, so its correlation
# with every other feature is NaN and it would render as an empty row/column
# in the heatmap. Drop such columns before correlating.
numeric_cols = numeric_cols.loc[:, numeric_cols.nunique() > 1]
corr = numeric_cols.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Heatmap")
fig.savefig("reports/feature_correlation_heatmap.png", bbox_inches="tight")
with open("reports/feature_correlation_heatmap_desc.txt", "w", encoding="utf-8") as f:
    f.write("Feature Correlation Heatmap\n\nShows how numeric features correlate with each other. Useful for spotting redundancy or strong fraud signals.")
plt.close(fig)

# 🧾 Step 13: Summary insights
# Relies on `fraud_rate` (Step 6) and `missing` (Step 5) computed earlier.
summary_lines = [
    "✅ Exploratory Analysis Complete\n",
    f"Fraud Rate: {fraud_rate:.2%}\n",
    f"Missing Columns: {list(missing.index)}\n",
    "Consider engineering more behavioral features or fixing velocity merge issues.\n",
]
with open("reports/summary_insights.txt", "w", encoding="utf-8") as f:
    f.writelines(summary_lines)

print("✅ All charts and insights saved to /reports folder.")


Shape: (88091, 41)


Unnamed: 0,transaction_id,user_id,event_time,amount,currency,merchant_id,merchant_category,device_id,ip_address,country,...,is_amount_5x_user_avg,unique_devices_30d,velocity_1h_y,velocity_1h,velocity_24h_y,velocity_24h,fraud_risk_score_weighted,log_amount,model_score,selected_flag
0,ec7a3d88-e7b5-4854-b02f-d2d48c2ad8ca,4597,2025-01-01 00:00:00,255.61,INR,38014,crypto,dev-4597,90.92.121.169,MH,...,0,1,1.0,0,1.0,0,5,5.547557,0.959413,1
1,8ba11266-c8cc-4ced-977d-c1cf5709bbd0,1228,2025-01-01 00:01:00,54.04,INR,84938,utilities,dev-1228,92.154.55.191,PW,...,0,1,1.0,0,1.0,0,5,4.00806,0.985558,1
2,84f4eed9-9964-4111-9888-1e468ce9c10a,1922,2025-01-01 00:07:00,52.01,INR,60032,travel,79427b24-492a-4d36-853e-bcb90959c687,3.138.179.14,BO,...,0,1,1.0,0,1.0,0,5,3.970481,0.977797,1
3,02214f04-992f-4c5b-9849-ee01ff286ee0,4259,2025-01-01 00:07:00,519.01,INR,76060,food,dev-4259,81.90.17.198,AM,...,0,1,1.0,0,1.0,0,6,6.253848,0.943175,1
4,c5898a00-9a27-4ba5-8c88-c91c3c4c6031,1130,2025-01-01 00:12:00,616.58,INR,84602,travel,dev-1130,15.62.237.44,GE,...,0,1,1.0,0,1.0,0,5,6.425809,0.916501,1


Unnamed: 0,user_id,event_time,amount,merchant_id,lat,lon,velocity_1h_x,velocity_24h_x,geo_distance_km,behavior_typing_speed_cps,...,is_amount_5x_user_avg,unique_devices_30d,velocity_1h_y,velocity_1h,velocity_24h_y,velocity_24h,fraud_risk_score_weighted,log_amount,model_score,selected_flag
count,88091.0,88091,88091.0,88091.0,88091.0,88091.0,88091.0,88091.0,88091.0,88091.0,...,88091.0,88091.0,88091.0,88091.0,88091.0,88091.0,88091.0,88091.0,88091.0,88091.0
mean,2499.430089,2025-01-31 07:19:14.047973120,468.51628,54942.687244,27.204391,28.014174,10.074593,10.402777,841.68688,5.01063,...,0.019264,5.86464,1.030258,0.0,1.675977,0.0,5.369357,5.823635,0.95599,1.0
min,1.0,2025-01-01 00:00:00,4.73,10000.0,-0.581271,-98.003541,0.0,0.0,0.0,0.2,...,0.0,1.0,1.0,0.0,1.0,0.0,4.0,1.745716,0.850003,1.0
25%,1247.0,2025-01-16 10:46:00,222.72,32312.5,20.288616,-3.698474,2.0,3.0,18.025,4.185,...,0.0,4.0,1.0,0.0,1.0,0.0,5.0,5.410395,0.945614,1.0
50%,2498.0,2025-01-31 05:13:00,354.39,54949.0,23.38755,53.875045,7.0,8.0,27.052,5.013,...,0.0,6.0,1.0,0.0,1.0,0.0,5.0,5.873216,0.966183,1.0
75%,3747.5,2025-02-15 06:09:30,566.785,77572.0,37.323266,79.26424,15.0,16.0,59.0765,5.838,...,0.0,8.0,1.0,0.0,2.0,0.0,6.0,6.341743,0.979205,1.0
max,5000.0,2025-03-01 23:59:00,14819.66,99998.0,57.213015,105.801547,58.0,59.0,15463.435,10.234,...,1.0,20.0,5.0,0.0,7.0,0.0,8.0,9.603777,0.99788,1.0
std,1444.737415,,441.17049,26045.353256,18.036085,71.594625,9.378102,9.517162,2793.081172,1.23163,...,0.137453,2.901458,0.175772,0.0,0.821271,0.0,0.737704,0.845642,0.033373,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88091 entries, 0 to 88090
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   transaction_id              88091 non-null  object        
 1   user_id                     88091 non-null  int64         
 2   event_time                  88091 non-null  datetime64[ns]
 3   amount                      88091 non-null  float64       
 4   currency                    88091 non-null  object        
 5   merchant_id                 88091 non-null  int64         
 6   merchant_category           88091 non-null  object        
 7   device_id                   88091 non-null  object        
 8   ip_address                  88091 non-null  object        
 9   country                     87650 non-null  object        
 10  lat                         88091 non-null  float64       
 11  lon                         88091 non-null  float64   

None

✅ All charts and insights saved to /reports folder.


In [3]:
from docx import Document
from docx.shared import Inches
from pathlib import Path

# Create a new Word document
# NOTE: `docx` is provided by the python-docx package; it must be installed
# in the kernel's environment for the imports above to succeed.
doc = Document()
doc.add_heading("Fraud Alert Exploratory Analysis", 0)

# Add data summary
doc.add_heading("📊 Data Summary", level=1)
doc.add_paragraph(Path("reports/data_summary.txt").read_text(encoding="utf-8"))

# Add each chart and its description
# Charts are taken in filename order; "<stem>_desc.txt" next to a PNG, when
# present, is inserted as the paragraph above the picture.
report_files = sorted(Path("reports").glob("*.png"))
for img_path in report_files:
    section_title = img_path.stem.replace("_", " ").title()
    desc_path = img_path.with_name(img_path.stem + "_desc.txt")
    doc.add_heading(section_title, level=2)
    if desc_path.exists():
        doc.add_paragraph(desc_path.read_text(encoding="utf-8"))
    doc.add_picture(str(img_path), width=Inches(6))

# Add final insights
doc.add_heading("🧾 Summary Insights", level=1)
doc.add_paragraph(Path("reports/summary_insights.txt").read_text(encoding="utf-8"))

# Save the document
doc.save("Fraud_Analysis_Report.docx")
print("✅ Report saved as Fraud_Analysis_Report.docx")

ModuleNotFoundError: No module named 'docx'