In [None]:
# imports
import pandas as pd
import datetime
from datetime import date
import numpy as np

In [None]:
# load the customers and transactions CSV files from the working directory
customers, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('customers_tm1_e.csv', 'transactions_tm1_e.csv')
)

## 1 Brief Look at the data

In [None]:
# quick look at the customers table: dimensions, column names, then the first rows
print(customers.shape, customers.columns, sep='\n')
customers.head()

In [None]:
# quick look at the transactions table: dimensions, column names, then the first rows
print(transactions.shape, transactions.columns, sep='\n')
transactions.head()

In [None]:
# spot-check: every transaction row belonging to customer 92
transactions.loc[transactions['customer_id'].eq(92)]

## 2 Creating the Combined Dataset

#### 2.1 Dataframe of Useful Variables

In [None]:
# create new dataset containing relevant columns
df = customers[['customer_id','dob','state','start_balance','creation_date']]

# parse the transaction timestamps once; `deposit_date` is kept as a plain copy
# because it was previously added to `transactions` by this cell
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])
transactions['deposit_date'] = transactions['transaction_date']

# helper columns (on a working copy, so `transactions` gains nothing else):
#   _deposit_only_date - the transaction date on rows with a positive deposit, NaT elsewhere
#   _is_deposit / _is_withdrawal - 1 when the amount is present and non-zero
# The previous `.count()` counted every non-null row, so zero-valued rows were
# counted as deposits/withdrawals and both counts equalled the number of transactions.
deposit_amount = transactions['deposit']
withdrawal_amount = transactions['withdrawal']
activity = transactions.assign(
    _deposit_only_date=transactions['transaction_date'].where(deposit_amount > 0),
    _is_deposit=(deposit_amount.notna() & deposit_amount.ne(0)).astype(int),
    _is_withdrawal=(withdrawal_amount.notna() & withdrawal_amount.ne(0)).astype(int),
)

# one pass over the transactions: one row per customer, columns in the same
# order the individual merges used to produce
customer_summary = activity.groupby('customer_id', as_index=False).agg(
    final_transaction_date=('transaction_date', 'max'),   # last transaction of any kind
    final_deposit_date=('_deposit_only_date', 'max'),     # last positive deposit (NaT if none)
    first_transaction_date=('transaction_date', 'min'),   # first transaction of any kind
    total_deposits=('deposit', 'sum'),                    # summed deposit amounts
    total_withdrawals=('withdrawal', 'sum'),              # summed withdrawal amounts
    num_of_deposits=('_is_deposit', 'sum'),               # number of non-zero deposits
    num_of_withdrawals=('_is_withdrawal', 'sum'),         # number of non-zero withdrawals
)

# left join: customers without any transaction keep their row with NaN/NaT values
df = df.merge(customer_summary, how='left', on='customer_id')

# final balance
# NOTE(review): adding the withdrawals assumes they are stored as negative amounts - confirm
df['final_balance'] = df['start_balance'] + df['total_deposits'] + df['total_withdrawals']

# duration open: whole days between the first and the last transaction
# NOTE(review): `creation_date` is parsed here but not used for the duration - confirm this is intended
df['creation_date'] = pd.to_datetime(df['creation_date'])
df['duration_open'] = (df['final_transaction_date'] - df['first_transaction_date']).dt.days

# age on final transaction date, in completed years.
# Calendar arithmetic is used instead of floor(days / 365): dividing by 365
# ignores leap days, so the age rolled over several days before the birthday.
df['dob'] = pd.to_datetime(df['dob'])
last_seen = df['final_transaction_date']
born = df['dob']
birthday_pending = (last_seen.dt.month < born.dt.month) | (
    (last_seen.dt.month == born.dt.month) & (last_seen.dt.day < born.dt.day)
)
# float dtype, as before; rows with a missing date stay NaN
df['age'] = (last_seen.dt.year - born.dt.year - birthday_pending.astype(int)).astype(float)

# average deposit / withdrawal value per customer
# NOTE(review): the mean is taken over every transaction row of the customer,
# including rows whose amount is 0 - confirm that is the intended average
avg_values = transactions.groupby('customer_id', as_index=False).agg(
    avg_deposit_val=('deposit', 'mean'),
    avg_withdrawal_val=('withdrawal', 'mean'),
)
df = df.merge(avg_values, how='left', on='customer_id')

# number of deposits and withdrawals: zeros become NaN so that `count`
# only sees the rows with a non-zero amount
for amount_col in ('deposit', 'withdrawal'):
    transactions[amount_col + '_with_nas'] = transactions[amount_col].replace({0:np.nan})
new_df = transactions[['customer_id','deposit_with_nas','withdrawal_with_nas']]
nonzero_counts = new_df.groupby('customer_id').count().rename(
    columns={'deposit_with_nas':'num_deposits','withdrawal_with_nas':'num_withdrawals'}
)
df = df.merge(nonzero_counts, how='left', on='customer_id')

# get the regions: the four US Census Bureau regions.
# Delaware and District of Columbia were listed under 'West' and Maryland was
# missing altogether (so it fell through to 'Other'); all three belong to 'South'.
state_groups = {'Northeast': ['New York', 'Pennsylvania', 'New Jersey', 'Connecticut', 'Massachusetts', 'Rhode Island', 'Maine', 'Vermont', 'New Hampshire'],
                'Midwest': ['Illinois', 'Ohio', 'Michigan', 'Indiana', 'Wisconsin', 'Minnesota', 'Iowa', 'Missouri', 'North Dakota', 'South Dakota', 'Nebraska', 'Kansas'],
                'South': ['Texas', 'Florida', 'North Carolina', 'Georgia', 'Virginia', 'Tennessee', 'South Carolina', 'Alabama', 'Louisiana', 'Kentucky', 'Oklahoma', 'Arkansas', 'West Virginia', 'Mississippi', 'Maryland', 'Delaware', 'District of Columbia'],
                'West': ['California', 'Washington', 'Arizona', 'Colorado', 'Oregon', 'Utah', 'Nevada', 'New Mexico', 'Idaho', 'Montana', 'Wyoming', 'Alaska', 'Hawaii']}

# invert the grouping into a flat {state name: region} lookup
state_to_region = {
    state: region
    for region, states in state_groups.items()
    for state in states
}

# Apply the mapping to the 'state' column to create a new 'region' column;
# any value that is not a key of the lookup (including missing values) becomes 'Other'
df["region"] = df['state'].map(state_to_region).fillna('Other')

In [None]:
# display the combined per-customer dataframe built in the previous cell
df

In [None]:
# display the transactions table, including the helper columns added to it above
transactions

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np

# # get the time difference between each deposit
# transaction_df = transactions.sort_values(by=['customer_id', 'transaction_date'])
# transaction_df['transaction_diff'] = (transaction_df[transaction_df['deposit'] > 0].groupby('customer_id')['transaction_date'].diff().dt.days)
# transaction_df = transaction_df[transaction_df['transaction_diff'].notna()]

# transaction_df['avg_transaction_diff'] = (transaction_df[transaction_df['deposit'] > 0].groupby('customer_id')['transaction_date'].diff().dt.days.mean())
# transaction_df = transaction_df[transaction_df['avg_transaction_diff'].notna()]

# # Calculate the number of days in the dataset
# # diff = transaction_df['avg_transaction_diff']
# # print(diff.describe())
# print(transaction_df.head(100))
# # num_days = (diff.max() - diff.min())

# # # Create a histogram with 26 bins (representing 2-week periods)
# # # num_bins = int(num_days / 14)
# # plt.hist(diff, bins=100)
# # plt.scale('log')
# transaction_df

# # # Set the x-axis ticks to the start date of each bin
# # bin_starts = pd.date_range(diff.min())
# # plt.xticks(, rotation=45)

# # # Set the x-axis label to the bin start dates and the y-axis label to the count of days in each bin
# # plt.xlabel('Date (2-week bins)')
# # plt.ylabel('Number of days')
# # plt.title('Histogram of Days in 2-Week Bins')

# # Show the plot
# plt.show()




In [None]:
# get the time difference between each deposit

# get the time difference between each deposit for each customer

In [None]:
# spot-check customer 92 again, now with the helper columns added to `transactions`
transactions.loc[transactions["customer_id"].eq(92)]

#### 2.2 Adding 'Exited' Target Variable

In [None]:
# create exited target variable: 1 when the customer's final transaction is
# strictly before the cut-off date, 0 otherwise (a missing date also gives 0)
end_date = pd.to_datetime('2020-5-31')
left_before_cutoff = df['final_transaction_date'] < end_date
exited = left_before_cutoff.astype(int).tolist()
df['exited'] = exited
df['exited'].value_counts()

In [None]:
# preview the first five rows, now including the `exited` flag
df.head(n=5)

## 3 Cleaning

#### 3.1 Drop Na's

In [None]:
# report the number of missing values per column before dropping; a bare
# expression is only rendered when it is the last line of the cell, so the
# counts have to be printed explicitly
print(df.isna().sum())

# keep only the customers for which every column is populated
df = df.dropna()

#### 3.2 Fix States

In [None]:
# fixing states: map the known abbreviations / alternative spellings to the full name.
# The result is assigned back to the column: `df['state'].replace(..., inplace=True)`
# mutates a temporary Series, which warns in recent pandas and has no effect on
# `df` once Copy-on-Write is active.
STATE_ALIASES = {
    'TX': 'Texas',
    'CALIFORNIA': 'California',
    'MASS': 'Massachusetts',
    'NY': 'New York',
}
df['state'] = df['state'].replace(STATE_ALIASES)

# `region` was derived in section 2.1, before the spellings above were fixed,
# so those rows were labelled 'Other'; refresh it from the cleaned names using
# the `state_to_region` lookup built in that cell
df['region'] = df['state'].map(state_to_region).fillna('Other')

# drop unidentifiable states; .copy() detaches the result from the filtered
# frame so later cells can add or drop columns without SettingWithCopyWarning
UNIDENTIFIABLE_STATES = ['-999', 'UNK', 'Australia']
df = df[~df['state'].isin(UNIDENTIFIABLE_STATES)].copy()

#### 3.3 Drop Useless Columns

In [None]:
# columns that were only needed to derive `age` / date features
cols_to_drop = ['dob','creation_date']
# rebinding instead of dropping in place, and ignoring already-missing labels,
# lets this cell be re-run without raising KeyError
df = df.drop(columns=cols_to_drop, errors='ignore')

In [None]:
# end of the observation window, stored on every row for the date arithmetic below
df = df.assign(final_date=pd.Timestamp("2020-05-31"))

# get the time difference between each deposit

# deposits only, ordered chronologically within each customer so that `diff`
# measures the gap to the previous deposit; .copy() so the new column is written
# to an independent frame and not to a slice of `transactions`
transaction_df = (
    transactions[transactions['deposit'] > 0]
    .copy()
    .sort_values(['customer_id', 'transaction_date'])
)

# days since the customer's previous deposit (NaN on each customer's first deposit)
deposit_gaps = transaction_df.groupby('customer_id')['transaction_date'].diff().dt.days
# the row-level column keeps 0 on the first deposit, as before
transaction_df['transaction_diff'] = deposit_gaps.fillna(0)

# Mean gap per customer, averaged over the real gaps only. Averaging the
# zero-filled column counted the first deposit as a 0-day gap and so shrank
# every customer's mean by a factor of (n-1)/n. Customers with a single
# deposit have no gap at all and keep the previous value of 0.
avg_deposit_gap = (
    deposit_gaps.groupby(transaction_df['customer_id'])
    .mean()
    .fillna(0)
    .rename('avg_dep_rate')
)

# attach the average gap (in days) to the per-customer frame
df_test = df.merge(avg_deposit_gap, how='left', on='customer_id')
df_test.describe()



In [None]:
# end of the observation window; written in ISO format so that parsing does not
# rely on pandas guessing day-first order for "31/05/2020"
df_test['final_date'] = pd.to_datetime("2020-05-31")

# churned: more days have passed since the last deposit than the customer's
# average gap between deposits (a missing value on either side gives False)
days_since_last_deposit = (df_test['final_date'] - df_test['final_deposit_date']).dt.days
df_test['churned'] = days_since_last_deposit > df_test['avg_dep_rate']

print(df_test['churned'].value_counts())
print(len(df_test))

# share of customers flagged as churned. The mean of the boolean column is used
# because `value_counts().to_list()[1]` picks the *less frequent* class, whichever
# that is, and raises IndexError when only one class is present.
print(df_test['churned'].mean())




# The rationale for this churn definition is that the bank is interested in savings account users who deposit regularly.
# If a customer's average deposit rate (days between deposits) is lower than the time elapsed from their last deposit to May 2020, plus some buffer to allow for the benefit of the doubt, the customer is considered churned.
#


In [None]:
# share of the first count in each hard-coded (count, remainder) pair
for part, remainder in ((25531, 88239), (96718, 17052)):
    print(part / (part + remainder))

In [None]:
# time between the last deposit and the end of the observation window,
# expressed in years (whole days divided by 365)
elapsed = df_test['final_date'] - df_test['final_deposit_date']
df_test['time_since_last_deposit'] = elapsed.dt.days / 365

df_test.describe()




# ANY OTHER CLEANING?

#### 3.4 Save Cleaned Dataset

In [None]:
# scratch calculation - presumably converting 1941 days into years; TODO confirm
# NOTE(review): no code under this heading actually saves the cleaned dataset yet
1941/365


# frequency of people whose last transaction date was in that month


# month 