# Data Exploration and Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the donors dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="donors_data.csv")
# Preview the first five rows (the default for head()) as a quick sanity check.
df.head(n=5)

# Column Renaming

In [None]:
# Replace the raw CSV headers with human-readable names. The order must match
# the column order of the loaded file, one label per column.
# NOTE(review): "ID Numer" looks like a typo for "ID Number"; it is kept as-is
# because other cells may already reference the column by this exact label.
df.columns = [
    "Index",
    "ID Numer",
    "Region1",
    "Region2",
    "Region3",
    "Region4",
    "Is a Homeowner",
    "Number of Children",
    "Household Income",
    "Gender",
    "Wealth Rating",
    "Average Home Value",
    "Median Family Income",
    "Average Family Income",
    "Percentage Low Income",
    "Number of Promotions",
    "All Gifts",
    "Largest Gift",
    "Most Recent Gift",
    "Num Months Last Donations",
    "Num Months betwn 1st and 2nd Gift",
    "Avg. Gift",
    "Donor",
    "Donated Money",
]

In [None]:
# Show the first five rows again to confirm the new column names were applied.
df.head(n=5)

# Analysis (Visualizing) 

## Average family income of donors

In [None]:
# Overlay the average-family-income distributions of donors and non-donors
# on a single figure so the two groups can be compared directly.
fig = plt.figure(figsize=(16, 8))
for donor_flag, group_label in ((1, "Donor"), (0, "Non-Donor")):
    group_income = df.loc[df["Donor"] == donor_flag, "Average Family Income"]
    # Semi-transparent bars keep both histograms visible where they overlap.
    group_income.hist(alpha=0.5, label=group_label)
plt.title("Histogram of Average Family Income of Donors vs Non-Donors")
plt.legend()
plt.tight_layout();

In [None]:
# Mean of "Average Family Income" for each group, selected via the "Donor" flag
# (1 = donor rows, 0 = non-donor rows).
income_donor = df.loc[df["Donor"] == 1, "Average Family Income"].mean()
income_non_donor = df.loc[df["Donor"] == 0, "Average Family Income"].mean()

# The tab and padding in the first label line the two values up in the output.
print("Mean Average Family Income for Donor: \t  ", income_donor)
print("Mean Average Family Income for Non-Donor: ", income_non_donor)

## Question 1 (q1.csv)
- Unit of Analysis: __Average Family Income__
- Comparison: For each group (donors and non-donors), we compute the mean of the average family income.
- Analysis: The mean income of donors is higher than that of non-donors, which suggests that wealthier people are more likely to donate.
- Output: Row 1 is for donors and row 2 is for non-donors. Each value is the mean of the average family income for that group.

In [None]:
# Write the answer for question 1: a header line followed by the donor mean
# and then the non-donor mean, one value per line.
q1_lines = ["Average income", str(income_donor), str(income_non_donor)]
with open("q1.csv", "w") as out_file:
    # Every line, including the last, is newline-terminated.
    out_file.write("\n".join(q1_lines) + "\n")

## Total Zonal Donations

In [None]:
# Total donated money per region, shown as a bar chart.
fig = plt.figure(figsize=(16, 8))
regions = [1, 2, 3, 4]
# For each region, sum "Donated Money" over the rows whose RegionN column is 1
# (presumably one 0/1 indicator column per region -- confirm against the data).
amount_donated = [
    df.loc[df["Region{}".format(region)] == 1, "Donated Money"].sum()
    for region in regions
]

# plt.plot() draws lines and raises on an unknown `kind` keyword; plt.bar() is
# the pyplot call that actually produces a bar chart.
plt.bar(regions, amount_donated, label="Amount Donated in Sum")
# Regions are categorical, so show only the integer region numbers as ticks.
plt.xticks(regions)
plt.legend()
plt.title("Plot of amount donated by Regions")
plt.xlabel("Region")
plt.ylabel("Amount")
plt.tight_layout();

## Question 2 (q2.csv)
- Unit of Analysis: __Amount Donated__
- Comparison: For each region (1, 2, 3, and 4), we compute the total amount of money donated.
- Analysis: We observe that region 2 donates the least money and region 4 donates the most.
- Output: Each row corresponds to one region and gives the total amount donated in that region.

In [None]:
# Write the answer for question 2: a header line, then one row per region with
# its total donated amount.
with open("q2.csv", "w") as f:
    f.write("Region, Sum of Money Donated\n")
    # Regions are numbered 1-4, so count from 1 instead of enumerate's default
    # of 0; otherwise the file would label the regions 0-3.
    for zone, money in enumerate(amount_donated, start=1):
        f.write("{}, {}\n".format(zone, money))