In [26]:
# In this challenge the definition of churn is “no new valid service subscription within 30 days after the current membership expires.”

import csv
import pandas as pd
import os
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline

In [209]:
# Show every column when a wide DataFrame is rendered.
pd.options.display.max_columns = None

In [27]:
# Train table contains user ID and whether or not they churned.
# The data directory can be overridden with the KKBOX_DATA_DIR environment
# variable; it defaults to the original local folder so existing runs are unchanged.

DATA_DIR = os.environ.get("KKBOX_DATA_DIR", r"C:\Users\marsh\Desktop\Springboard\Capstone2")
traindf = pd.read_csv(os.path.join(DATA_DIR, "train_v2.csv"))
traindf

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1
...,...,...
970955,OHnZbu+EVaP+vN7Z+OfT5OMcp90MWFZonmM0o3pb8FY=,0
970956,S92bDK//uI6hk3u1vuApro0qJBQOToBozZ7lL1yTC+w=,0
970957,eUa3xo16vpAjr43Cjlb6Kjf1NTILYyJIkBayJQdXWnw=,0
970958,iZE41tbAQ65rJq60olkJT4BJzuUAYgQdfbEemXe/TTk=,0


In [28]:
# Number of users in each is_churn class
traindf['is_churn'].value_counts()

0    883630
1     87330
Name: is_churn, dtype: int64

In [29]:
# Missing values per column
traindf.isna().sum()

msno        0
is_churn    0
dtype: int64

In [30]:
# True if any user id appears more than once
traindf['msno'].duplicated().any()

False

In [105]:
# Summary statistics for the numeric columns of the train table
traindf.describe()

Unnamed: 0,is_churn
count,970960.0
mean,0.089942
std,0.286099
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [168]:
# Members table includes a user's personal and registration information.
# The data directory can be overridden with the KKBOX_DATA_DIR environment
# variable; it defaults to the original local folder so existing runs are unchanged.

DATA_DIR = os.environ.get("KKBOX_DATA_DIR", r"C:\Users\marsh\Desktop\Springboard\Capstone2")
memdf = pd.read_csv(os.path.join(DATA_DIR, "members_v3.csv"))
memdf

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915
...,...,...,...,...,...,...
6769468,VSGkb3hyBRUtb/b1MQUZbvOkktS3vKLnhMHW0CF8eyU=,1,0,,7,20151020
6769469,nWjH7glPkZ7jOVaCRwwjlpmp0T1hSWdv8hMJxiWCwKc=,1,0,,7,20151020
6769470,GH+b5+1tlv7ZZXsA8upBzVXMTLyffKcsF7WoU8b5rOI=,15,26,female,4,20151020
6769471,XVlwT3fdCFGKqerEKBzUIjK+jzI6jzSke4cDMVhYyjE=,1,0,,4,20151020


In [169]:
# True if any user id appears more than once in the members table
memdf['msno'].duplicated().any()

False

In [155]:
# Missing values per column -- gender has a lot of nulls
memdf.isna().sum()

msno                            0
city                            0
bd                              0
gender                    4429505
registered_via                  0
registration_init_time          0
dtype: int64

In [165]:
# Keep only rows where age (bd) is strictly between 10 and 100 and no value
# is missing. dropna() is chained instead of being called with inplace=True:
# the boolean filter returns a derived frame, and mutating that in place
# raises pandas' SettingWithCopyWarning.

plausible_age = (memdf.bd > 10) & (memdf.bd < 100)
memdf = memdf[plausible_age].dropna(axis = 0)
memdf

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915
5,yLkV2gbZ4GLFwqTOXLVHz0VGrMYcgBGgKZ3kj9RiYu8=,4,30,male,9,20110916
7,WH5Jq4mgtfUFXh2yz+HrcTXKS4Oess4k4W3qKolAeb0=,5,34,male,9,20110916
8,tKmbR4X5VXjHmxERrckawEMZ4znVy1lAQIR1vV5rdNk=,5,19,male,9,20110917
9,I0yFvqMoNkM8ZNHb617e1RBzIS/YRKemHO7Wj13EtA0=,13,63,male,9,20110918
...,...,...,...,...,...,...
6769455,IwDZCuhB0UP7wrBwuqE1r1jQIA0yFSTqUGlGJnM+hMI=,1,52,male,4,20151020
6769462,y0rwiAeU5goDatsiUDulPdiJpohZ15hxUAkSJHccnvY=,8,32,female,9,20151020
6769464,2U0grRyv2DB3Pw3crMBhBfsWWq6VkGcK1IGVqmDmK0w=,5,23,female,4,20151020
6769467,tyoRqQ72Y2oCcqvdaIxG8T0Hs9DZvuuM8IeTwhxtdBQ=,6,26,female,7,20151020


In [166]:
# Join churn labels with member attributes; the inner join keeps only
# users present in both tables.

df1 = pd.merge(traindf, memdf, how='inner', on='msno')

In [171]:
# Missing values per column after the merge (expect none)

df1.isnull().sum()

msno                      0
is_churn                  0
city                      0
bd                        0
gender                    0
registered_via            0
registration_init_time    0
dtype: int64

In [172]:
# Check for fully duplicated rows (expect none)

df1.duplicated(keep='first').any()

False

In [177]:
# Display the merged train + members table
df1

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,5,28,male,3,20131223
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,13,20,male,3,20131223
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,13,18,male,3,20131227
3,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,13,35,female,7,20140125
4,moRTKhKIDvb+C8ZHOgmaF4dXMLk0jOn65d7a8tQ2Eds=,1,9,28,female,3,20140202
...,...,...,...,...,...,...,...
377023,SjVZDYaiKgEHpFX1PcFDS94b9CFdaHjg78rfumtm/F4=,0,15,19,male,9,20110823
377024,5NE9tsGhg7HbFNMdPT+3tytaIKG5013bBQG2vQTe+OI=,0,4,23,female,9,20110826
377025,/QlhSIWEZelYnwttYJSQL50EJJ2yRV+ThQIVQLvctp8=,0,8,25,female,7,20110830
377026,OHnZbu+EVaP+vN7Z+OfT5OMcp90MWFZonmM0o3pb8FY=,0,13,28,male,7,20110831


In [31]:
# Transactions gives us payment details for each user as well as whether the subscription is still active.
# The data directory can be overridden with the KKBOX_DATA_DIR environment
# variable; it defaults to the original local folder so existing runs are unchanged.

DATA_DIR = os.environ.get("KKBOX_DATA_DIR", r"C:\Users\marsh\Desktop\Springboard\Capstone2")
transdf = pd.read_csv(os.path.join(DATA_DIR, "transactions_v2.csv"))
transdf

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,20170131,20170504,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,20150809,20190412,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,20170303,20170422,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,20170329,20170331,1
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,20170323,20170423,0
...,...,...,...,...,...,...,...,...,...
1431004,zwF50wwaJI2TBKWhB42HRBJ6EQK0jgSo1Xmwb9Jq3SU=,32,180,536,536,0,20170215,20170817,0
1431005,zx/h5MzQQmsSat04wSfGpHp6N8aWLLwM1+7OV7ujmPY=,41,30,149,149,1,20170306,20170406,0
1431006,zxvgjIKjy18Fm+cIWUfYKr68z09+ILBxuMW0DnbeUZ8=,41,30,99,99,1,20170308,20170408,0
1431007,zzNhkExbpzmpjp9tXefiCUBtgNLgS+vZE7fFfTRDJVc=,38,30,149,149,0,20170318,20170417,0


In [173]:
# Defining churn/renewal is tricky under KKBox's subscription model: most subscriptions run for 30 days, so many
# users re-subscribe every month. The key fields to determine churn/renewal are transaction date, membership
# expiration date, and is_cancel. is_cancel only indicates that a user actively cancelled a subscription; a
# cancellation does not imply churn, because a user may cancel to change service plans or for other reasons.
# "Churn" means no new valid service subscription within 30 days after the current membership expires.

# Check for fully duplicated transaction rows.
transdf.duplicated(keep='first').any()

False

In [174]:
# Number of transaction rows whose user id already appeared in an earlier row
transdf['msno'].duplicated().sum()

233959

In [34]:
# There are no fully duplicated rows, but there are duplicate user ids. Why are there multiple transaction dates
# for the same user, all in such a small time window?
# Note: duplicated() skips the first occurrence of each user id, so only the repeat rows are selected here.

duplicate = transdf.loc[transdf['msno'].duplicated()]
duplicate

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
83,+hXL1k++YSAJWSqyZLw90gIMFQXG3zv639GSK8f5Qn0=,41,30,149,149,1,20151221,20181122,0
145,/BndJt9YSOh1kzEykXyHrQZKl943rqFrzR9efW2b7wE=,39,30,149,149,1,20170331,20170524,0
698,4CHAQfq5SJiIZSSs/q61nSjpD3r1kfO8S9R9+UJvie4=,41,30,99,99,1,20170322,20170422,0
1715,DgyEYyyhcOun8D+8Ln2Lpx8orfCrV8L6Fgvt/22a4ec=,15,360,1200,1200,0,20170120,20180117,0
1739,Du6MaYC9gJo8KEu2QVKWpzl448tbmM2iTImTAHAg8EY=,41,30,149,149,1,20151015,20191115,0
...,...,...,...,...,...,...,...,...,...
1430992,zqacrLPJEEQDwPwvmVh19LogZVcX1hxg/rmFgOACQRg=,39,30,149,149,1,20170331,20170523,0
1430993,zqsH77C0tnmJzRXyJ2EGECaaUOE0AT9rrO/BaGAD9is=,41,30,129,129,1,20170310,20170310,1
1430996,zs1Oc+7NwsWhcCeZ287H+lUakejR2LdNKVLPaQcUTJ8=,37,30,149,149,1,20170313,20170412,0
1430999,zt/SmfsXujpaCmvgdjvWa1MYYzLQLJkYFdr2Z7I0+Wo=,39,30,149,149,1,20170228,20170425,0


In [35]:
# Transactions per transaction_date for one heavily repeated user
transdf.loc[transdf['msno'] == '72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=', 'transaction_date'].value_counts()

20160720    98
20160731    55
20160721    54
20160728     1
Name: transaction_date, dtype: int64

In [36]:
# Transactions per membership_expire_date for the same user
transdf.loc[transdf['msno'] == '72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=', 'membership_expire_date'].value_counts()

20201125    1
20180103    1
20190410    1
20190619    1
20190403    1
           ..
20190717    1
20201202    1
20180523    1
20190515    1
20181107    1
Name: membership_expire_date, Length: 208, dtype: int64

In [39]:
# All transactions of another user with many rows
transdf.loc[transdf['msno'] == 'WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=']

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
10936,WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=,38,7,0,0,0,20170216,20190529,0
25214,WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=,38,7,0,0,0,20170216,20190731,0
46591,WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=,38,7,0,0,0,20170214,20180627,0
46592,WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=,38,7,0,0,0,20170216,20181010,0
53863,WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=,38,7,0,0,0,20170212,20170510,0
...,...,...,...,...,...,...,...,...,...
1413421,WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=,38,7,0,0,0,20170212,20170517,0
1413422,WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=,38,7,0,0,0,20170214,20180418,0
1413423,WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=,38,7,0,0,0,20170216,20180926,0
1427749,WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=,38,7,0,0,0,20170216,20190501,0


In [40]:
# Missing values per column in the transactions table
transdf.isna().sum()

msno                      0
payment_method_id         0
payment_plan_days         0
plan_list_price           0
actual_amount_paid        0
is_auto_renew             0
transaction_date          0
membership_expire_date    0
is_cancel                 0
dtype: int64

In [41]:
# Summary statistics for the numeric columns; the date columns are still
# stored as integers here (see transdf.dtypes), so their mean/std are not meaningful.
transdf.describe()

Unnamed: 0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
count,1431009.0,1431009.0,1431009.0,1431009.0,1431009.0,1431009.0,1431009.0,1431009.0
mean,37.91835,66.0177,281.787,281.3172,0.7853025,20168480.0,20171100.0,0.02455121
std,4.964805,102.4864,435.1861,435.42,0.4106124,4858.797,3032.367,0.1547529
min,2.0,0.0,0.0,0.0,0.0,20150100.0,20160420.0,0.0
25%,36.0,30.0,99.0,99.0,1.0,20170230.0,20170410.0,0.0
50%,40.0,30.0,149.0,149.0,1.0,20170310.0,20170420.0,0.0
75%,41.0,30.0,149.0,149.0,1.0,20170320.0,20170500.0,0.0
max,41.0,450.0,2000.0,2000.0,1.0,20170330.0,20361020.0,1.0


In [42]:
# Column dtypes of the transactions table
transdf.dtypes

msno                      object
payment_method_id          int64
payment_plan_days          int64
plan_list_price            int64
actual_amount_paid         int64
is_auto_renew              int64
transaction_date           int64
membership_expire_date     int64
is_cancel                  int64
dtype: object

In [43]:
# Number of transactions per is_cancel value
transdf['is_cancel'].value_counts()

0    1395876
1      35133
Name: is_cancel, dtype: int64

In [44]:
# Number of transactions per is_auto_renew value
transdf['is_auto_renew'].value_counts()

1    1123775
0     307234
Name: is_auto_renew, dtype: int64

In [140]:
# All transactions for one specific user
transdf.loc[transdf['msno'] == '5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=']

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
892,5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=,38,7,0,0,0,20160104,20180610,0
7955,5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=,38,7,0,0,0,20151215,20171119,0
7956,5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=,38,7,0,0,0,20160104,20190915,0
15076,5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=,38,7,0,0,0,20160107,20200301,0
15077,5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=,38,7,0,0,0,20160107,20200426,0
...,...,...,...,...,...,...,...,...,...
1381997,5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=,38,7,0,0,0,20160104,20190310,0
1396163,5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=,38,7,0,0,0,20160104,20190505,0
1403301,5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=,38,7,0,0,0,20160104,20181007,0
1424774,5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=,38,7,0,0,0,20160104,20181028,0


In [213]:
# Attach every transaction to its user's row from df1; the inner join drops
# users without transactions and transactions of users not in df1.

df2 = pd.merge(df1, transdf, how='inner', on='msno')
df2

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,13,20,male,3,20131223,36,30,180,180,0,20170311,20170411,0
1,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,13,18,male,3,20131227,17,60,0,0,0,20170311,20170314,0
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,13,18,male,3,20131227,15,90,300,300,0,20170314,20170615,0
3,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,13,35,female,7,20140125,41,30,99,99,1,20170117,20170721,0
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,13,35,female,7,20140125,41,30,99,99,1,20161201,20170605,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463696,5NE9tsGhg7HbFNMdPT+3tytaIKG5013bBQG2vQTe+OI=,0,4,23,female,9,20110826,37,30,149,149,1,20170326,20170425,0
463697,/QlhSIWEZelYnwttYJSQL50EJJ2yRV+ThQIVQLvctp8=,0,8,25,female,7,20110830,41,30,149,149,1,20170317,20170417,0
463698,OHnZbu+EVaP+vN7Z+OfT5OMcp90MWFZonmM0o3pb8FY=,0,13,28,male,7,20110831,40,30,149,149,1,20170304,20170403,0
463699,iZE41tbAQ65rJq60olkJT4BJzuUAYgQdfbEemXe/TTk=,0,5,25,female,9,20110905,39,30,149,149,1,20170331,20170513,0


In [45]:
# User table contains details of a user's listening behaviour. Num denotes the % of song completion.
# The data directory can be overridden with the KKBOX_DATA_DIR environment
# variable; it defaults to the original local folder so existing runs are unchanged.

DATA_DIR = os.environ.get("KKBOX_DATA_DIR", r"C:\Users\marsh\Desktop\Springboard\Capstone2")
userdf = pd.read_csv(os.path.join(DATA_DIR, "user_logs_v2.csv"))
userdf

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,u9E91QDTvHLq6NXjEaWv8u4QIqhrHk72kE+w31Gnhdg=,20170331,8,4,0,1,21,18,6309.273
1,nTeWW/eOZA/UHKdD5L7DEqKKFTjaAj3ALLPoAWsU8n0=,20170330,2,2,1,0,9,11,2390.699
2,2UqkWXwZbIjs03dHLU9KHJNNEvEkZVzm69f3jCS+uLI=,20170331,52,3,5,3,84,110,23203.337
3,ycwLc+m2O0a85jSLALtr941AaZt9ai8Qwlg9n0Nql5U=,20170331,176,4,2,2,19,191,7100.454
4,EGcbTofOSOkMmQyN1NMLxHEXJ1yV3t/JdhGwQ9wXjnI=,20170331,2,1,0,1,112,93,28401.558
...,...,...,...,...,...,...,...,...,...
18396357,FGpiy2mB+vXLKziYRcY/xJcJEFJfRDfUqlU+p760f7E=,20170314,0,0,0,0,1,1,248.058
18396358,iZRjKNMrw5ffEbfXODLhV/0tJLPbOH3am1WYDgqBf8Q=,20170306,0,0,0,0,1,1,311.000
18396359,yztw4Y0EggG0w2wPkbMZx7ke7saSx7dLSfMheHZG/DQ=,20170331,0,0,0,0,17,1,3973.189
18396360,swCHwkNx30/aENjq30qqaLlm7bUUytbMXdz1bH7g0Jk=,20170307,0,0,0,1,0,1,179.278


In [46]:
# Repeat log rows per user. duplicated() does not flag the first occurrence
# of a user id, so each count below is one less than that user's total number
# of log rows.
userdfdup = userdf.loc[userdf['msno'].duplicated()]
userdfdup['msno'].value_counts()

+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=    30
IsW9Z97sfb46JJ8yA6BrpxN0tphytU6PlWs2UYhsKMI=    30
FeQoey6MjhjCmqS4nL3nROl9EEj4IOOjzaLLIdnRd6M=    30
MHIQR0Nj3FQR5HhlfhzNiWmPPjtS2wSNPhQM6UylL1Q=    30
nJ7TOQEv5WZA5KHDckVtxlQbXAsJYT9s3BNexbauTTY=    30
                                                ..
4iMd0o7pKhkXKUEZq3eW4k9ObbJVua6nn+hRBm8xZY0=     1
QXwqQbsV7tF02kmQzE7OeuP2kHDkOr3zd1PcPx+4dsg=     1
UK9oQnga5ZeIMDF+mAOuqgzAHZ2VoDwVu/IXs8VWvRY=     1
lhFB1CW8cihD8CPxaYYvDp2bGWgIFedOWgreAvCycAM=     1
swCHwkNx30/aENjq30qqaLlm7bUUytbMXdz1bH7g0Jk=     1
Name: msno, Length: 1016672, dtype: int64

In [139]:
# (rows, columns) of the user-log entries for this user. The recorded output
# of this cell is a shape tuple, so .shape is restored to make the code
# reproduce it.
userdf[userdf.msno == '5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I='].shape

(24, 9)

In [49]:
# One user's listening logs in chronological order
userdf.loc[userdf.msno == '+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw='].sort_values(by='date')

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
9202275,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170301,10,4,3,5,12,21,5192.039
6388245,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170302,25,2,2,5,27,50,8516.79
17446699,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170303,13,2,5,1,10,25,3711.547
10821115,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170304,26,2,3,6,25,53,8473.527
2011891,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170305,7,5,4,2,17,28,5564.371
7308332,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170306,2,3,4,6,14,20,5671.623
11749605,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170307,77,18,10,14,15,122,10269.273
16190021,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170308,17,6,5,6,53,77,14827.783
16023814,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170309,31,4,6,3,40,79,11316.426
11213179,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170310,7,2,2,5,24,33,8057.056


In [214]:
# Join df2 with the user logs on user id and matching transaction/log date.
# This is an inner join, so only transactions that have a listening log on
# the same day are kept.

df3 = pd.merge(
    df2,
    userdf,
    how='inner',
    left_on=['msno', 'transaction_date'],
    right_on=['msno', 'date'],
)

In [215]:
# Display the merged train + members + transactions + user-log table
df3

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,13,20,male,3,20131223,36,30,180,180,0,20170311,20170411,0,20170311,0,0,0,0,7,7,1681.037
1,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,13,18,male,3,20131227,17,60,0,0,0,20170311,20170314,0,20170311,1,0,0,0,0,1,48.437
2,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,13,35,female,7,20140125,41,30,99,99,1,20170316,20170918,0,20170316,0,1,0,0,136,32,33044.856
3,moRTKhKIDvb+C8ZHOgmaF4dXMLk0jOn65d7a8tQ2Eds=,1,9,28,female,3,20140202,38,410,1788,1788,0,20170324,20180513,0,20170324,3,0,3,1,41,30,10846.695
4,dW/tPZMDh2Oz/ksduEctJbsz0MXw3kay/1AlZCq3EbI=,1,13,21,female,9,20140212,38,90,477,477,0,20170328,20170707,0,20170328,15,15,2,1,19,46,6432.444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229721,q18eydRjoRGx2rYbZZxB4jlBZa22VTmCWbgrXEWJfTA=,0,13,27,male,7,20110813,41,30,129,129,1,20170327,20170428,0,20170327,27,0,2,1,22,45,6359.268
229722,RarUowfkW6CrHjw8jMNjFURi17Ja0/docu2tyVfm5jI=,0,13,45,male,9,20110816,36,30,180,180,0,20170328,20170427,0,20170328,0,1,0,0,13,12,3457.122
229723,5dXGSlRH5LQVFbTUGQwfyyaFb8w/FVqfTS8N4aNEots=,0,5,21,male,9,20110819,38,30,149,149,0,20170309,20170408,0,20170309,13,7,10,11,40,44,13572.611
229724,/QlhSIWEZelYnwttYJSQL50EJJ2yRV+ThQIVQLvctp8=,0,8,25,female,7,20110830,41,30,149,149,1,20170317,20170417,0,20170317,16,4,6,3,108,96,26577.418


In [216]:
# Per-column flag for missing values (expect all False)

df3.isna().any()

msno                      False
is_churn                  False
city                      False
bd                        False
gender                    False
registered_via            False
registration_init_time    False
payment_method_id         False
payment_plan_days         False
plan_list_price           False
actual_amount_paid        False
is_auto_renew             False
transaction_date          False
membership_expire_date    False
is_cancel                 False
date                      False
num_25                    False
num_50                    False
num_75                    False
num_985                   False
num_100                   False
num_unq                   False
total_secs                False
dtype: bool

In [217]:
# Check for fully duplicated rows (expect none)

df3.duplicated(keep='first').any()

False

In [218]:
# Give the id and age columns readable names, then parse the integer
# YYYYMMDD date columns into datetimes.

df3 = df3.rename(columns = {'msno' : 'userid', 'bd': 'age'})
for date_col in ('registration_init_time', 'transaction_date', 'membership_expire_date'):
    df3[date_col] = pd.to_datetime(df3[date_col].astype(str), format='%Y%m%d')

In [219]:
# Parse the user-log date, then cast the id/gender columns to str and the
# two flag columns to category.

df3['date'] = pd.to_datetime(df3['date'].astype(str), format='%Y%m%d')
df3 = df3.astype({
    'userid': str,
    'is_churn': 'category',
    'gender': str,
    'is_cancel': 'category',
})

In [220]:
# The five users with the most rows in df3
df3['userid'].value_counts().head(5)

791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=    22
MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=    17
WBpgK5pHlEe6kkc73mkEIXRsPUSFuuIImZ8tD9rQ7C0=    13
qP66pCFEiIONygGX9baEJQ5JKpVyFo2ao0AmTuw5ACY=    12
TKDg6+l5f7bp86Poq2ma/XotpfIaLHipbwJjvrWBmlI=     8
Name: userid, dtype: int64

In [222]:
# For this user the repeated rows differ only in membership_expire_date, which accounts for
# the majority of the duplicates. We must keep these rows since they are important for
# determining user churn.

df3.loc[df3['userid'] == '791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=']

Unnamed: 0,userid,is_churn,city,age,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
146343,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-06-15,0,2017-03-18,0,0,0,0,9,4,1957.162
146344,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-09-14,0,2017-03-18,0,0,0,0,9,4,1957.162
146345,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-07-27,0,2017-03-18,0,0,0,0,9,4,1957.162
146346,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-06-08,0,2017-03-18,0,0,0,0,9,4,1957.162
146347,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-07-13,0,2017-03-18,0,0,0,0,9,4,1957.162
146348,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-06-01,0,2017-03-18,0,0,0,0,9,4,1957.162
146349,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-08-10,0,2017-03-18,0,0,0,0,9,4,1957.162
146350,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-08-31,0,2017-03-18,0,0,0,0,9,4,1957.162
146351,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-07-06,0,2017-03-18,0,0,0,0,9,4,1957.162
146352,791c32ArWs05ZCpF+brfdTcgsMsOIKIxMCUe1Qh4OKE=,1,4,22,female,9,2015-04-15,38,7,35,35,0,2017-03-18,2017-07-20,0,2017-03-18,0,0,0,0,9,4,1957.162


In [227]:
# Same pattern for a second user: the repeated rows differ only in membership_expire_date.
# We must keep these rows since they are important for determining user churn.

df3.loc[df3['userid'] == 'MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=']

Unnamed: 0,userid,is_churn,city,age,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,listen_date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
144617,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-04-29,0,2017-03-11,5,3,0,1,30,20,7231.145
144618,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-07-01,0,2017-03-11,5,3,0,1,30,20,7231.145
144619,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-05-06,0,2017-03-11,5,3,0,1,30,20,7231.145
144620,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-04-15,0,2017-03-11,5,3,0,1,30,20,7231.145
144621,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-05-13,0,2017-03-11,5,3,0,1,30,20,7231.145
144622,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-03-25,0,2017-03-11,5,3,0,1,30,20,7231.145
144623,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-06-03,0,2017-03-11,5,3,0,1,30,20,7231.145
144624,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-06-10,0,2017-03-11,5,3,0,1,30,20,7231.145
144625,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-04-08,0,2017-03-11,5,3,0,1,30,20,7231.145
144626,MFrEH81DqeRbBIYFssxgeyQ0J5eq0JCQSt6jkx+9/ps=,1,5,22,female,4,2017-02-28,38,7,0,0,0,2017-03-11,2017-04-22,0,2017-03-11,5,3,0,1,30,20,7231.145


In [224]:
# Rename the user-log date column to listen_date so it is not confused with
# the transaction and expiry dates.

df3 = df3.rename({'date': 'listen_date'}, axis='columns')

In [225]:
# Display the table after the renames and dtype conversions
df3

Unnamed: 0,userid,is_churn,city,age,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,listen_date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,13,20,male,3,2013-12-23,36,30,180,180,0,2017-03-11,2017-04-11,0,2017-03-11,0,0,0,0,7,7,1681.037
1,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,13,18,male,3,2013-12-27,17,60,0,0,0,2017-03-11,2017-03-14,0,2017-03-11,1,0,0,0,0,1,48.437
2,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,13,35,female,7,2014-01-25,41,30,99,99,1,2017-03-16,2017-09-18,0,2017-03-16,0,1,0,0,136,32,33044.856
3,moRTKhKIDvb+C8ZHOgmaF4dXMLk0jOn65d7a8tQ2Eds=,1,9,28,female,3,2014-02-02,38,410,1788,1788,0,2017-03-24,2018-05-13,0,2017-03-24,3,0,3,1,41,30,10846.695
4,dW/tPZMDh2Oz/ksduEctJbsz0MXw3kay/1AlZCq3EbI=,1,13,21,female,9,2014-02-12,38,90,477,477,0,2017-03-28,2017-07-07,0,2017-03-28,15,15,2,1,19,46,6432.444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229721,q18eydRjoRGx2rYbZZxB4jlBZa22VTmCWbgrXEWJfTA=,0,13,27,male,7,2011-08-13,41,30,129,129,1,2017-03-27,2017-04-28,0,2017-03-27,27,0,2,1,22,45,6359.268
229722,RarUowfkW6CrHjw8jMNjFURi17Ja0/docu2tyVfm5jI=,0,13,45,male,9,2011-08-16,36,30,180,180,0,2017-03-28,2017-04-27,0,2017-03-28,0,1,0,0,13,12,3457.122
229723,5dXGSlRH5LQVFbTUGQwfyyaFb8w/FVqfTS8N4aNEots=,0,5,21,male,9,2011-08-19,38,30,149,149,0,2017-03-09,2017-04-08,0,2017-03-09,13,7,10,11,40,44,13572.611
229724,/QlhSIWEZelYnwttYJSQL50EJJ2yRV+ThQIVQLvctp8=,0,8,25,female,7,2011-08-30,41,30,149,149,1,2017-03-17,2017-04-17,0,2017-03-17,16,4,6,3,108,96,26577.418


In [228]:
# Confirm the final column dtypes (dates as datetime64, flags as category)
df3.dtypes

userid                            object
is_churn                        category
city                               int64
age                                int64
gender                            object
registered_via                     int64
registration_init_time    datetime64[ns]
payment_method_id                  int64
payment_plan_days                  int64
plan_list_price                    int64
actual_amount_paid                 int64
is_auto_renew                      int64
transaction_date          datetime64[ns]
membership_expire_date    datetime64[ns]
is_cancel                       category
listen_date               datetime64[ns]
num_25                             int64
num_50                             int64
num_75                             int64
num_985                            int64
num_100                            int64
num_unq                            int64
total_secs                       float64
dtype: object