In [1]:
'''Main'''
import numpy as np
import pandas as pd

from scipy import interp
from datetime import date, timedelta
from tqdm import tqdm
from inspect import signature

'''Libraries for processing geodata'''
import geopy.distance
from geopy.geocoders import Nominatim

'''Data Visualization'''
import matplotlib.pyplot as plt
%matplotlib inline 

'''Data Prep and Model Evaluation'''
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

In [2]:
transactions = pd.read_csv('transactions.csv')
cc_info = pd.read_csv('cc_info.csv')
transactions.head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719


In [3]:
'''Converting datetime data into different formats such as month, day, hour, seconds etc.'''
transactions['date'] = pd.to_datetime(transactions['date'])
transactions['month'] = transactions['date'].dt.month
transactions['year'] = transactions['date'].dt.year
transactions['day'] = transactions['date'].dt.day
transactions['day_of_week'] = transactions['date'].dt.dayofweek
transactions['seconds'] = transactions['date'].dt.second
transactions['hour'] = transactions['date'].dt.hour
transactions['minute'] = transactions['date'].dt.minute

'''Converting to date to str format %m/%d/%Y'''
transactions['formatted_date'] = transactions['date'].dt.strftime('%m/%d/%Y')
transactions.head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat,month,year,day,day_of_week,seconds,hour,minute,formatted_date
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,9,2015,11,4,40,0,32,09/11/2015
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,10,2015,24,5,8,22,23,10/24/2015
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,10,2015,26,0,36,18,19,10/26/2015
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,10,2015,22,3,10,19,41,10/22/2015
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,10,2015,26,0,22,20,8,10/26/2015


In [4]:
cc_info.head()

Unnamed: 0,credit_card,city,state,zipcode,credit_card_limit
0,1280981422329509,Dallas,PA,18612,6000
1,9737219864179988,Houston,PA,15342,16000
2,4749889059323202,Auburn,MA,1501,14000
3,9591503562024072,Orlando,WV,26412,18000
4,2095640259001271,New York,NY,10001,20000


# Combine cc_info with transactions

In [5]:
# Merge cc_info with transactions using left merge(because we are only interested in cards with transaction info)
credit_card_total = pd.merge(transactions, cc_info, how = 'left', on = 'credit_card')
credit_card_total .head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat,month,year,day,day_of_week,seconds,hour,minute,formatted_date,city,state,zipcode,credit_card_limit
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,9,2015,11,4,40,0,32,09/11/2015,Houston,PA,15342,20000
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,10,2015,24,5,8,22,23,10/24/2015,Houston,PA,15342,20000
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,10,2015,26,0,36,18,19,10/26/2015,Houston,PA,15342,20000
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,10,2015,22,3,10,19,41,10/22/2015,Houston,PA,15342,20000
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,10,2015,26,0,22,20,8,10/26/2015,Houston,PA,15342,20000


## Problem statement : 
Your boss wants to implement the following products and needs your help.

Your boss wants to identify those users that in the your data set never went above the monthly credit card limit. The goal of this is to automatically increase their limit. Can you send him the list of Ids?

In [6]:
# groupby credit and month to obtain total transaction amounts for that month and cast it in a new column called 
#'credit_card_total'
credit_card_total['card_total_by_month']  = credit_card_total.groupby(['credit_card', 
                                                               'month'])['transaction_dollar_amount'].transform('sum')

# calculate balance remaining at the end of the month by calculating the difference between 
# card_total_by_month and credit_card_total
credit_card_total['balance'] = credit_card_total['credit_card_limit'] - credit_card_total['card_total_by_month']
# Users that went over the credit card limit
users_over_limit = credit_card_total[credit_card_total['balance'] < 0]['credit_card'].unique()
# Users that never went over their credit card limit - difference between total unique credit cards and users 
# under the limit
users_under_limit = [x for x in credit_card_total['credit_card'].unique() if x not in users_over_limit]

# Total unique cards
print('Total unique cards: {} \n'.format(len(credit_card_total['credit_card'].unique())))

print('Total users that went over their credit card limit: {}\n'.format(len(users_over_limit)))

print('Total users that never went over their credit card limit: {}\n'.format(len(users_under_limit)))

Total unique cards: 984 

Total users that went over their credit card limit: 122

Total users that never went over their credit card limit: 862



## Problem Statement 2:
On the other hand, he wants to implement an algorithm that as soon as a user goes above her monthly limit, it triggers an alert so that the user can be notified about that. Build a function that for each day, returns a list of users who went above their credit card monthly limit on that day.

In [7]:
# function that returns list of users going over the limit for every day listed 
# in the transaction dataset in ascending order
def over_limit_alert(df, date_column, balance):
    df.sort_values(by = 'date')
    for i in df['formatted_date'].unique():
        sel_df = df[(df['formatted_date'] == i)
                   &
                   (df['balance'] < 0)]
        print('{} users that went over the limit on {} : {}'.format(len(sel_df['credit_card'].unique()) , i, sel_df['credit_card'].unique()))
print(over_limit_alert(credit_card_total, 'date', 'balance'))

53 users that went over the limit on 09/11/2015 : [1106824181265726 1175962294549313 1190980117697422 1246716439259317
 1460880989446247 2097167243683055 2245942585429940 2302576486327459
 2505223645294729 2542445829224998 3264419298955673 3276369883343727
 3281814060807145 3355576223096097 3369600965634913 3936887050542904
 4052848131106690 4118286032166087 4462290711836916 4564117045739728
 4569281393242605 4572984294472212 4973517790485920 5257380962581683
 5488856737032471 5661819269445876 5784686375395380 5795626689544539
 5975270769354417 6174559182308122 6287151117146988 6766253113444560
 6984795534098127 7059627552446649 7107467078128879 7198750113791865
 7214837915436490 7280963829231048 7299183791723634 7350222978998674
 7499289351166761 7509272878525535 7556827548313098 7850942767136368
 7922818627489943 7943675133681182 8060656990279276 8210265648016159
 8522875529951473 8896425420278012 8972201384562696 9632319271199136
 9999757432802760]
72 users that went over the limit 

69 users that went over the limit on 08/24/2015 : [1106824181265726 1175962294549313 1246716439259317 1280981422329509
 1460880989446247 1833346877787047 2245942585429940 2366928097135853
 2505223645294729 2610112472096585 2756688131944353 2891791194252089
 2980539633198204 3095443081295019 3138132199016625 3264419298955673
 3276369883343727 3355576223096097 3547198874425548 3929517687134990
 3936887050542904 4052848131106690 4063875032497374 4298557099672376
 4572984294472212 4631597686439269 4973517790485920 4993234579335307
 5257380962581683 5577483128229669 5661819269445876 5671348187096692
 5784686375395380 5795626689544539 5915891114492596 5975270769354417
 5996982621454469 6049616542527821 6125797751768025 6174559182308122
 6219238634336382 6776904214455240 6984795534098127 7107467078128879
 7214837915436490 7238936669483666 7280963829231048 7324887971716592
 7338934618553557 7492940622489570 7499289351166761 7545819552904208
 7646245348474631 7707617017326022 7850942767136368 7

75 users that went over the limit on 10/29/2015 : [1106824181265726 1460880989446247 1749458277555747 2245942585429940
 2302576486327459 2366928097135853 2505223645294729 2610112472096585
 2891791194252089 2980539633198204 3138132199016625 3264419298955673
 3276369883343727 3355576223096097 3369600965634913 3370960377586437
 3546693056773873 3676109815092640 3797102737432115 3929517687134990
 3936887050542904 4052848131106690 4118286032166087 4298557099672376
 4318352196714983 4462290711836916 4973517790485920 5199442973583621
 5257380962581683 5488856737032471 5612235316109460 5723635641134781
 5795626689544539 5899644472359642 5915891114492596 5975270769354417
 5996982621454469 6174559182308122 6198761755487915 6292410823269309
 6358192544004241 6497866359354370 6766253113444560 6984795534098127
 7107467078128879 7198750113791865 7214837915436490 7238936669483666
 7280963829231048 7299183791723634 7324887971716592 7338934618553557
 7492940622489570 7499289351166761 7545819552904208 7

82 users that went over the limit on 08/02/2015 : [1106824181265726 1175962294549313 1246716439259317 1280981422329509
 1460880989446247 1833346877787047 1850995745665541 2245942585429940
 2366928097135853 2505223645294729 2542445829224998 2610112472096585
 2756688131944353 2891791194252089 2980539633198204 3095443081295019
 3138132199016625 3264419298955673 3281814060807145 3355576223096097
 3547198874425548 3676109815092640 3797102737432115 3929517687134990
 3936887050542904 4052848131106690 4118286032166087 4324769211499741
 4572984294472212 4631597686439269 4973517790485920 4993234579335307
 5167229387043743 5257380962581683 5577483128229669 5661819269445876
 5671348187096692 5723635641134781 5784686375395380 5795626689544539
 5915891114492596 5975270769354417 5996982621454469 6049616542527821
 6125797751768025 6174559182308122 6219238634336382 6292410823269309
 6368470078113844 6766253113444560 6776904214455240 6984795534098127
 7107467078128879 7198750113791865 7214837915436490 7

47 users that went over the limit on 09/28/2015 : [1106824181265726 1175962294549313 1190980117697422 1246716439259317
 1460880989446247 2097167243683055 2245942585429940 2302576486327459
 2542445829224998 3264419298955673 3276369883343727 3281814060807145
 4052848131106690 4118286032166087 4462290711836916 4564117045739728
 4569281393242605 4572984294472212 4973517790485920 5257380962581683
 5488856737032471 5661819269445876 5723635641134781 5784686375395380
 5795626689544539 5975270769354417 6174559182308122 6766253113444560
 6984795534098127 7107467078128879 7198750113791865 7214837915436490
 7280963829231048 7299183791723634 7350222978998674 7499289351166761
 7509272878525535 7556827548313098 7850942767136368 7922818627489943
 7943675133681182 8210265648016159 8522875529951473 8896425420278012
 8972201384562696 9632319271199136 9999757432802760]
50 users that went over the limit on 09/01/2015 : [1106824181265726 1175962294549313 1190980117697422 1246716439259317
 1460880989446247 2

77 users that went over the limit on 08/19/2015 : [1106824181265726 1175962294549313 1246716439259317 1280981422329509
 1460880989446247 1833346877787047 1850995745665541 2245942585429940
 2366928097135853 2505223645294729 2542445829224998 2610112472096585
 2756688131944353 2891791194252089 2980539633198204 3095443081295019
 3138132199016625 3264419298955673 3276369883343727 3281814060807145
 3355576223096097 3929517687134990 3936887050542904 4052848131106690
 4063875032497374 4118286032166087 4298557099672376 4324769211499741
 4572984294472212 4631597686439269 4973517790485920 4993234579335307
 5167229387043743 5257380962581683 5577483128229669 5612235316109460
 5661819269445876 5671348187096692 5723635641134781 5795626689544539
 5915891114492596 5975270769354417 5996982621454469 6049616542527821
 6174559182308122 6219238634336382 6766253113444560 6776904214455240
 6984795534098127 7107467078128879 7214837915436490 7238936669483666
 7266500047328736 7280963829231048 7299183791723634 7

87 users that went over the limit on 08/21/2015 : [1106824181265726 1175962294549313 1246716439259317 1280981422329509
 1460880989446247 1833346877787047 1850995745665541 2032689281683871
 2245942585429940 2366928097135853 2505223645294729 2542445829224998
 2610112472096585 2756688131944353 2850146878241916 2891791194252089
 2980539633198204 3095443081295019 3138132199016625 3264419298955673
 3276369883343727 3281814060807145 3355576223096097 3369600965634913
 3547198874425548 3676109815092640 3797102737432115 3929517687134990
 3936887050542904 4052848131106690 4063875032497374 4118286032166087
 4298557099672376 4324769211499741 4572984294472212 4631597686439269
 4973517790485920 4993234579335307 5167229387043743 5257380962581683
 5577483128229669 5612235316109460 5661819269445876 5671348187096692
 5723635641134781 5795626689544539 5915891114492596 5975270769354417
 5996982621454469 6049616542527821 6125797751768025 6174559182308122
 6219238634336382 6292410823269309 6368470078113844 6

75 users that went over the limit on 10/09/2015 : [1106824181265726 1460880989446247 1749458277555747 1934150487562155
 2245942585429940 2302576486327459 2366928097135853 2505223645294729
 2610112472096585 2891791194252089 2980539633198204 3138132199016625
 3264419298955673 3276369883343727 3355576223096097 3369600965634913
 3370960377586437 3546693056773873 3676109815092640 3797102737432115
 3929517687134990 3936887050542904 4052848131106690 4118286032166087
 4298557099672376 4318352196714983 4462290711836916 4973517790485920
 5199442973583621 5257380962581683 5488856737032471 5612235316109460
 5795626689544539 5899644472359642 5915891114492596 5975270769354417
 5996982621454469 6174559182308122 6198761755487915 6292410823269309
 6358192544004241 6497866359354370 6766253113444560 6984795534098127
 7107467078128879 7198750113791865 7214837915436490 7238936669483666
 7266500047328736 7280963829231048 7299183791723634 7338934618553557
 7492940622489570 7499289351166761 7545819552904208 7

81 users that went over the limit on 08/16/2015 : [1106824181265726 1175962294549313 1246716439259317 1280981422329509
 1460880989446247 1833346877787047 1850995745665541 2032689281683871
 2245942585429940 2366928097135853 2505223645294729 2542445829224998
 2610112472096585 2756688131944353 2850146878241916 2891791194252089
 2980539633198204 3095443081295019 3138132199016625 3264419298955673
 3276369883343727 3281814060807145 3355576223096097 3369600965634913
 3547198874425548 3797102737432115 3929517687134990 4052848131106690
 4063875032497374 4118286032166087 4298557099672376 4324769211499741
 4462290711836916 4572984294472212 4631597686439269 4973517790485920
 4993234579335307 5167229387043743 5257380962581683 5577483128229669
 5661819269445876 5671348187096692 5795626689544539 5915891114492596
 5996982621454469 6049616542527821 6125797751768025 6174559182308122
 6219238634336382 6368470078113844 6766253113444560 6776904214455240
 6984795534098127 7107467078128879 7198750113791865 7

66 users that went over the limit on 10/19/2015 : [1106824181265726 1460880989446247 1749458277555747 1934150487562155
 2245942585429940 2302576486327459 2366928097135853 2505223645294729
 2610112472096585 2891791194252089 2980539633198204 3138132199016625
 3264419298955673 3276369883343727 3355576223096097 3369600965634913
 3546693056773873 3676109815092640 3797102737432115 3929517687134990
 3936887050542904 4052848131106690 4298557099672376 4318352196714983
 4973517790485920 5199442973583621 5257380962581683 5488856737032471
 5612235316109460 5723635641134781 5795626689544539 5915891114492596
 5975270769354417 5996982621454469 6174559182308122 6198761755487915
 6292410823269309 6358192544004241 6497866359354370 6984795534098127
 7107467078128879 7198750113791865 7214837915436490 7238936669483666
 7299183791723634 7324887971716592 7338934618553557 7492940622489570
 7545819552904208 7556827548313098 7762807525339038 7850942767136368
 7922818627489943 7924297455503050 7943675133681182 8

## Problem statement 3 : Unsupervised learning algorithm to spot unusual activity

In [8]:
# One way to identify suspicious activity is by checking if a card was used in multiple locations 
# in the same time frame. To track this we can create a new column which lists the total number of locations 
# a given card was used in
credit_card_total['transactions_that_day'] = credit_card_total.groupby(
    ['credit_card','formatted_date'])['transaction_dollar_amount'].transform('count')
credit_card_total.head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat,month,year,day,day_of_week,seconds,hour,minute,formatted_date,city,state,zipcode,credit_card_limit,card_total_by_month,balance,transactions_that_day
0,1003715054175576,2015-09-11 00:32:40,43.78,-80.174132,40.26737,9,2015,11,4,40,0,32,09/11/2015,Houston,PA,15342,20000,11281.56,8718.44,7
1,1003715054175576,2015-10-24 22:23:08,103.15,-80.19424,40.180114,10,2015,24,5,8,22,23,10/24/2015,Houston,PA,15342,20000,8954.21,11045.79,7
2,1003715054175576,2015-10-26 18:19:36,48.55,-80.211033,40.313004,10,2015,26,0,36,18,19,10/26/2015,Houston,PA,15342,20000,8954.21,11045.79,3
3,1003715054175576,2015-10-22 19:41:10,136.18,-80.174138,40.290895,10,2015,22,3,10,19,41,10/22/2015,Houston,PA,15342,20000,8954.21,11045.79,3
4,1003715054175576,2015-10-26 20:08:22,71.82,-80.23872,40.166719,10,2015,26,0,22,20,8,10/26/2015,Houston,PA,15342,20000,8954.21,11045.79,3


In [9]:
#df = credit_card_total
credit_card_total.sort_values(by = ['credit_card','date'], inplace = True)
credit_card_total.head()

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat,month,year,day,day_of_week,seconds,hour,minute,formatted_date,city,state,zipcode,credit_card_limit,card_total_by_month,balance,transactions_that_day
38,1003715054175576,2015-07-31 20:03:05,45.52,-80.186336,40.168399,7,2015,31,4,5,20,3,07/31/2015,Houston,PA,15342,20000,162.56,19837.44,3
194,1003715054175576,2015-07-31 20:25:28,96.1,-80.156132,40.222907,7,2015,31,4,28,20,25,07/31/2015,Houston,PA,15342,20000,162.56,19837.44,3
107,1003715054175576,2015-07-31 23:09:32,20.94,-80.262219,40.242532,7,2015,31,4,32,23,9,07/31/2015,Houston,PA,15342,20000,162.56,19837.44,3
124,1003715054175576,2015-08-01 10:48:03,51.27,-80.176899,40.313324,8,2015,1,5,3,10,48,08/01/2015,Houston,PA,15342,20000,8441.51,11558.49,4
137,1003715054175576,2015-08-01 17:43:43,127.99,-80.226671,40.295995,8,2015,1,5,43,17,43,08/01/2015,Houston,PA,15342,20000,8441.51,11558.49,4


In [10]:
distances = []
transaction_time_delta = []
speed_of_transaction = []
for card in tqdm(credit_card_total['credit_card'].unique()):# for each unique card
    df = credit_card_total[credit_card_total['credit_card'] == card]
    distances.append(0)
    transaction_time_delta.append(0)
    speed_of_transaction.append(0)
    for j in range(1,len(df)):
        distance = geopy.distance.vincenty((df['Lat'].iloc[j-1], df['Long'].iloc[j-1]),
                                          (df['Lat'].iloc[j], df['Long'].iloc[j])).km
        transaction = (df['date'].iloc[j] - df['date'].iloc[j-1])
        transaction = transaction.days * 24 + transaction.seconds/3600
        distances.append(distance)
        transaction_time_delta.append(transaction)
        if transaction != 0:
            speed_of_transaction.append(distance/transaction)
        else:
            speed_of_transaction.append(0)
print(len(distances))

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 984/984 [00:51<00:00, 19.03it/s]

294588





In [11]:
credit_card_total['distances'] = distances
credit_card_total['transaction_time_delta'] = transaction_time_delta
credit_card_total['speed_of_transaction'] = speed_of_transaction
credit_card_total['speed_of_transaction'].describe()

count    2.945880e+05
mean     1.403927e+04
std      6.997089e+05
min      0.000000e+00
25%      7.965559e-01
50%      4.135621e+00
75%      1.365485e+01
max      6.372910e+07
Name: speed_of_transaction, dtype: float64

In [12]:
credit_card_total[credit_card_total['transaction_time_delta']!=0].nlargest(10, 'speed_of_transaction')

Unnamed: 0,credit_card,date,transaction_dollar_amount,Long,Lat,month,year,day,day_of_week,seconds,...,city,state,zipcode,credit_card_limit,card_total_by_month,balance,transactions_that_day,distances,transaction_time_delta,speed_of_transaction
105034,4171614982554774,2015-08-22 22:58:38,25.14,-74.528962,42.204345,8,2015,22,5,38,...,Denver,NY,12421,10000,8865.03,1134.97,6,17702.526771,0.000278,63729100.0
130071,4833760023644511,2015-09-20 21:06:33,27.44,-75.959121,41.3005,9,2015,20,6,33,...,Dallas,PA,18612,30000,26958.2,3041.8,8,17547.07474,0.000278,63169470.0
130070,4833760023644511,2015-09-20 21:06:32,82.07,101.719247,-19.227191,9,2015,20,6,32,...,Dallas,PA,18612,30000,26958.2,3041.8,8,17539.328735,0.000278,63141580.0
44210,2267974573364254,2015-08-06 23:09:31,117.46,-73.239916,44.294973,8,2015,6,3,31,...,Charlotte,VT,5445,8000,4383.68,3616.32,5,16497.570212,0.000278,59391250.0
15477,1409322756311484,2015-08-07 19:47:11,73.4,-80.172747,40.277325,8,2015,7,4,11,...,Houston,PA,15342,15000,12231.45,2768.55,9,15542.335615,0.000278,55952410.0
22874,1664685555838993,2015-10-14 21:00:48,138.4,-72.080818,43.123124,10,2015,14,2,48,...,Washington,NH,3280,30000,20243.13,9756.87,6,15188.653841,0.000278,54679150.0
50168,2505223645294729,2015-08-24 18:59:12,15.53,52.002136,-33.595529,8,2015,24,0,12,...,Washington,NH,3280,4000,5399.15,-1399.15,5,15113.318414,0.000278,54407950.0
155751,5659288348108211,2015-08-06 20:58:51,89.94,-75.846933,43.164756,8,2015,6,3,51,...,Cleveland,NY,13042,20000,14643.65,5356.35,11,15046.664181,0.000278,54167990.0
185368,6483234875464386,2015-08-08 02:26:43,54.56,-72.1225,43.257534,8,2015,8,5,43,...,Washington,NH,3280,18000,15734.19,2265.81,7,14970.237293,0.000278,53892850.0
30050,1916247437596108,2015-08-30 22:41:17,8.72,142.441056,-0.84768,8,2015,30,6,17,...,Washington,NH,3280,5000,1286.13,3713.87,3,14196.214196,0.000278,51106370.0


In [13]:
credit_card_total.groupby(['credit_card', 'date'])['speed_of_transaction', 'distances', 'transaction_time_delta'].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,speed_of_transaction,distances,transaction_time_delta
credit_card,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1003715054175576,2015-07-31 20:03:05,0.000000,0.000000,0.000000
1003715054175576,2015-07-31 20:25:28,17.627991,6.576220,0.373056
1003715054175576,2015-07-31 23:09:32,3.396542,9.287654,2.734444
1003715054175576,2015-08-01 10:48:03,0.918906,10.697856,11.641944
1003715054175576,2015-08-01 17:43:43,0.670964,4.648287,6.927778
...,...,...,...,...
9999757432802760,2015-10-23 20:47:23,5.082401,7.839604,1.542500
9999757432802760,2015-10-24 01:12:54,1.570122,6.948227,4.425278
9999757432802760,2015-10-25 21:53:33,0.179332,8.012111,44.677500
9999757432802760,2015-10-27 21:38:09,0.268199,12.804698,47.743333


# Classification and assigning labels based on transaction speeds

In [None]:
from sklearn.cluster import KMeans
from scipy import stats
le = preprocessing.LabelEncoder()
X = credit_card_total.drop(['date', 'formatted_date', 'Long', 'Lat', 'year', 'zipcode', 'balance', 'state', 'city', 'credit_card'], axis = 1)
for i in X.columns:
    if X[i].dtypes == 'object':
        le.fit(X[i])
        X[i] = le.transform(X[i])
df_tr_std = stats.zscore(X)
kmeans = KMeans(n_clusters=2, n_init=100, max_iter=1000,random_state=0).fit(df_tr_std)

In [None]:
len(kmeans.labels_)
df1 = credit_card_total#[credit_card_total['transaction_time_delta'] > 0]
df1['label'] = kmeans.labels_
plt.scatter(df1[df1['label'] == 0]['distances'], 
            df1[df1['label'] == 0]['transaction_time_delta'], color = 'b', label = 'Not risky')
plt.scatter(df1[df1['label'] == 1]['distances'], 
            df1[df1['label'] == 1]['transaction_time_delta'], color = 'r', label = 'Risky')


plt.xlabel('Distance between two consecutive transaction points')
plt.ylabel('Time between two consecutive transactions')
plt.legend()

In [None]:
len(df1[df1['label'] == 1])/len(df1)*100

In [None]:
print(df1['label'].unique())
plt.hist(df1['label'])