In [1]:
import pandas as pd
import csv
df = pd.read_csv('combined_donor_info.csv')

In [8]:
missing_vals = df.isnull().sum()
print(missing_vals)
# if there is no info on employment, there is also probably no info on unemployment/labor force

print(df.shape)

print(df['Employed'])

Masked Account ID             0
Maked Primary Campaign     1786
Stage                         0
Account Type                  0
Billing Zip/Postal Code     501
                           ... 
Labor Population           1510
Armed Forces               1510
Employed                   1510
Unemployed                 1510
Not in Labor Force         1510
Length: 87, dtype: int64
(6290, 87)
0       14080.0
1       14119.0
2       14444.0
3       14080.0
4       14119.0
         ...   
6285        NaN
6286        NaN
6287    19684.0
6288    18398.0
6289    18305.0
Name: Employed, Length: 6290, dtype: float64


In [9]:
cols = df.columns.tolist()
print(cols)

['Masked Account ID', 'Maked Primary Campaign', 'Stage', 'Account Type', 'Billing Zip/Postal Code', 'Fiscal Period', 'Close Date', 'Amount', 'ZCTA', 'Year', 'Pop', 'Households', 'HH Income', 'Education Years', 'Minority %', 'Poor %', 'Poor Family %', 'Car or Van %', 'Public Transport %', 'Health Ins %', 'No Health Ins %', 'Unemployed %', 'Employed %', 'Armed Forces %', 'Not in Labor Force %', 'Under 10 %', 'Under 35 %', 'Over 65 %', 'Over 85 %', 'High school %', 'College %', '0-50k %', '50-100k %', '100-150k %', '150-200k %', '200k+ %', 'Minority', 'Under 10', 'Under 35', 'Over 65', 'Over 85', 'Male 10-14', 'Male 15-17', 'Male 18-19', 'Male 20', 'Male 21', 'Male 22-24', 'Male 25-29', 'Male 30-34', 'Female 10-14', 'Female 15-17', 'Female 18-19', 'Female 20', 'Female 21', 'Female 22-24', 'Female 25-29', 'Female 30-34', 'Ed Pop', 'Less than High School', 'High school', 'GED', 'Some college to Associates Degree', "Bachelor's Degree or Higher", '$0-25k', '$25-50k', '$50-75k', '$75-100k', '$

In [12]:
# view dates in csv as datetime objects to compare
df['Close Date'] = pd.to_datetime(df['Close Date'], format='%m/%d/%Y')
new_df = df.sort_values('Close Date').groupby('Masked Account ID', as_index=False).first()
donation_counts = df.groupby('Masked Account ID').size().reset_index(name='Num Donations')
new_df = pd.merge(new_df, donation_counts, on='Masked Account ID')
new_df['Repeat Donor'] = new_df['Num Donations'] > 1 # boolean value for whether donor is repeat donor
# down the road, we can choose between having Num Donations OR Repeat Donor as label

print(new_df)

# essentially, we now have data based on the individual, and can see how many times each donor donated
new_df.to_csv('whether_donor_repeats_donation.csv', index=False)



      Masked Account ID Maked Primary Campaign             Stage Account Type  \
0                     1      Campaign Two 2018        Closed Won    Household   
1                     2      Campaign One 2014        Closed Won    Household   
2                     3      Campaign One 2014        Closed Won    Household   
3                     4      Campaign One 2014        Closed Won    Household   
4                     5                   None        Closed Won    Household   
...                 ...                    ...               ...          ...   
1316               1317      Campaign One 2015  In-Kind Received    Household   
1317               1318      Campaign One 2015  In-Kind Received    Corporate   
1318               1319      Campaign One 2015        Closed Won    Corporate   
1319               1320      Campaign One 2014        Closed Won    Household   
1320               1321      Campaign Two 2023        Closed Won    Household   

     Billing Zip/Postal Cod

In [22]:
new_df = new_df.select_dtypes(exclude=['object']) # string columns like Campaign type are not going to be overly helpful
new_df['Repeat Donor'] = new_df['Repeat Donor'].astype(int)

corr_matrix = new_df.corr()
repeat_donor_correlation = corr_matrix['Repeat Donor']
print(repeat_donor_correlation)

high_correlation = repeat_donor_correlation[(repeat_donor_correlation > 0.1) | (repeat_donor_correlation < -0.1)]
print("High correlations:")
print(high_correlation)

# features with high correlation with 'Repeat Donor' label: ['Close Date', 'Households', 'HH Income', 'Education Years', 'Over 65', 'Over 85',
#'Ed Pop', 'Some college to Associates Degree', 'Poverty Pop', 'Health Ins']

Masked Account ID    -0.250265
Close Date           -0.147528
Amount                0.078787
ZCTA                  0.009144
Year                 -0.061064
                        ...   
Employed              0.095271
Unemployed            0.075647
Not in Labor Force    0.094419
Num Donations         0.320464
Repeat Donor          1.000000
Name: Repeat Donor, Length: 62, dtype: float64
High correlations:
Masked Account ID                   -0.250265
Close Date                          -0.147528
Households                           0.104016
HH Income                            0.240272
Education Years                      0.387475
Over 65                              0.117201
Over 85                              0.104154
Ed Pop                               0.102008
Some college to Associates Degree    0.106064
Poverty Pop                          0.104016
Health Ins                           0.103581
Num Donations                        0.320464
Repeat Donor                         1.00

In [23]:
# rinse and repeat for Num Donations label
corr_matrix = new_df.corr()
repeat_donor_correlation = corr_matrix['Num Donations']
print(repeat_donor_correlation)

high_correlation = repeat_donor_correlation[(repeat_donor_correlation > 0.1) | (repeat_donor_correlation < -0.1)]
print("High correlations:")
print(high_correlation)

# results from this don't seem as promising lol, but features with high correlation with 'Num Donations':
# ['Close Date', 'Amount']

Masked Account ID    -0.257296
Close Date           -0.180480
Amount                0.179682
ZCTA                  0.020222
Year                 -0.037523
                        ...   
Employed             -0.039901
Unemployed           -0.064922
Not in Labor Force   -0.040596
Num Donations         1.000000
Repeat Donor          0.320464
Name: Num Donations, Length: 62, dtype: float64
High correlations:
Masked Account ID   -0.257296
Close Date          -0.180480
Amount               0.179682
Num Donations        1.000000
Repeat Donor         0.320464
Name: Num Donations, dtype: float64


In [24]:
# i have (unofficially) made the decision to move forward with 'Repeat Donor' as the label

# dropping all columns that are not (or the label): ['Close Date', 'Households', 'HH Income', 'Education Years', 'Over 65', 'Over 85',
#'Ed Pop', 'Some college to Associates Degree', 'Poverty Pop', 'Health Ins']

columns_to_keep = [
    'Close Date', 
    'Households', 
    'HH Income', 
    'Education Years', 
    'Over 65', 
    'Over 85', 
    'Ed Pop', 
    'Some college to Associates Degree', 
    'Poverty Pop', 
    'Health Ins', 
    'Repeat Donor'
]

filt_df = new_df.loc[:, columns_to_keep]

# some issues: um actually the demographics information was provided as raw data and we should find the percentage of people over 65/85 for ex
# in case it's just a very big county but uh i am tired so i won't do that

In [25]:
print(filt_df.dropna().shape[0]) # num rows with no missing vals (591 lol) (not bad considering we started with 1000)

591


In [27]:
filt_df = filt_df.apply(lambda col: col.fillna(col.mean()), axis=0)
filt_df.to_csv('filt_repeat_donor.csv', index=False)
print(filt_df.dropna().shape[0]) # all fixed!

1321
