In [17]:
# In this challenge the definition of churn is “no new valid service subscription within 30 days after the current membership expires.”

import csv
import pandas as pd
import os
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline

In [102]:
# Train table contains user ID and whether or not they churned

traindf = pd.read_csv(r"C:\Users\marsh\Desktop\Springboard\Capstone2\train_v2.csv")
traindf

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1
...,...,...
970955,OHnZbu+EVaP+vN7Z+OfT5OMcp90MWFZonmM0o3pb8FY=,0
970956,S92bDK//uI6hk3u1vuApro0qJBQOToBozZ7lL1yTC+w=,0
970957,eUa3xo16vpAjr43Cjlb6Kjf1NTILYyJIkBayJQdXWnw=,0
970958,iZE41tbAQ65rJq60olkJT4BJzuUAYgQdfbEemXe/TTk=,0


In [33]:
traindf.is_churn.value_counts()

0    883630
1     87330
Name: is_churn, dtype: int64

In [31]:
traindf = traindf.rename(columns = {'msno' : 'userid'})

In [11]:
traindf.isnull().sum()

msno        0
is_churn    0
dtype: int64

In [37]:
traindf.userid.duplicated().any()

False

In [89]:
#Transactions gives us payment details for each user as well as whether the subscription is still active

transdf = pd.read_csv(r"C:\Users\marsh\Desktop\Springboard\Capstone2\transactions_v2.csv")
transdf

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,20170131,20170504,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,20150809,20190412,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,20170303,20170422,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,20170329,20170331,1
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,20170323,20170423,0
...,...,...,...,...,...,...,...,...,...
1431004,zwF50wwaJI2TBKWhB42HRBJ6EQK0jgSo1Xmwb9Jq3SU=,32,180,536,536,0,20170215,20170817,0
1431005,zx/h5MzQQmsSat04wSfGpHp6N8aWLLwM1+7OV7ujmPY=,41,30,149,149,1,20170306,20170406,0
1431006,zxvgjIKjy18Fm+cIWUfYKr68z09+ILBxuMW0DnbeUZ8=,41,30,99,99,1,20170308,20170408,0
1431007,zzNhkExbpzmpjp9tXefiCUBtgNLgS+vZE7fFfTRDJVc=,38,30,149,149,0,20170318,20170417,0


In [38]:
transdf = transdf.rename(columns = {'msno' : 'userid'})

In [45]:
transdf.duplicated().any()

False

In [50]:
transdf.userid.duplicated().sum()

131464

In [154]:
# there are now row duplicates but there are duplicate user ids. Why are there multiple transaction dates for the same user, all in such a small time window?

duplicate = transdf[transdf.msno.duplicated()]
duplicate

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
83,+hXL1k++YSAJWSqyZLw90gIMFQXG3zv639GSK8f5Qn0=,41,30,149,149,1,20151221,20181122,0
145,/BndJt9YSOh1kzEykXyHrQZKl943rqFrzR9efW2b7wE=,39,30,149,149,1,20170331,20170524,0
698,4CHAQfq5SJiIZSSs/q61nSjpD3r1kfO8S9R9+UJvie4=,41,30,99,99,1,20170322,20170422,0
1715,DgyEYyyhcOun8D+8Ln2Lpx8orfCrV8L6Fgvt/22a4ec=,15,360,1200,1200,0,20170120,20180117,0
1739,Du6MaYC9gJo8KEu2QVKWpzl448tbmM2iTImTAHAg8EY=,41,30,149,149,1,20151015,20191115,0
...,...,...,...,...,...,...,...,...,...
1430992,zqacrLPJEEQDwPwvmVh19LogZVcX1hxg/rmFgOACQRg=,39,30,149,149,1,20170331,20170523,0
1430993,zqsH77C0tnmJzRXyJ2EGECaaUOE0AT9rrO/BaGAD9is=,41,30,129,129,1,20170310,20170310,1
1430996,zs1Oc+7NwsWhcCeZ287H+lUakejR2LdNKVLPaQcUTJ8=,37,30,149,149,1,20170313,20170412,0
1430999,zt/SmfsXujpaCmvgdjvWa1MYYzLQLJkYFdr2Z7I0+Wo=,39,30,149,149,1,20170228,20170425,0


In [155]:
transdf[transdf.msno == '72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=']['transaction_date'].value_counts()

20160720    98
20160731    55
20160721    54
20160728     1
Name: transaction_date, dtype: int64

In [156]:
transdf[transdf.msno == '72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=']['membership_expire_date'].value_counts()

20201125    1
20180103    1
20190410    1
20190619    1
20190403    1
           ..
20190717    1
20201202    1
20180523    1
20190515    1
20181107    1
Name: membership_expire_date, Length: 208, dtype: int64

In [141]:
transdf[transdf.userid == 'WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=']

Unnamed: 0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
count,145.0,145.0,145.0,145.0,145.0,145.0,145.0,145.0
mean,38.0,7.0,0.0,0.0,0.0,20170220.0,20181870.0,0.0
std,0.0,0.0,0.0,0.0,0.0,28.77713,8138.96,0.0
min,38.0,7.0,0.0,0.0,0.0,20170210.0,20170400.0,0.0
25%,38.0,7.0,0.0,0.0,0.0,20170210.0,20171210.0,0.0
50%,38.0,7.0,0.0,0.0,0.0,20170220.0,20180820.0,0.0
75%,38.0,7.0,0.0,0.0,0.0,20170220.0,20190500.0,0.0
max,38.0,7.0,0.0,0.0,0.0,20170310.0,20200110.0,0.0


In [23]:
# check for null values
transdf.isnull().sum()

msno                      0
payment_method_id         0
payment_plan_days         0
plan_list_price           0
actual_amount_paid        0
is_auto_renew             0
transaction_date          0
membership_expire_date    0
is_cancel                 0
dtype: int64

In [24]:
# convert date column types from int to dates
transdf['transaction_date'] =  pd.to_datetime(transdf['transaction_date'].astype(str), format='%Y%m%d')
transdf['membership_expire_date'] =  pd.to_datetime(transdf['membership_expire_date'].astype(str), format='%Y%m%d')

In [27]:
transdf.describe()

Unnamed: 0,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,is_cancel
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,37.913627,65.996649,281.724649,281.243679,0.785229,0.024697
std,4.970272,102.46331,435.073503,435.307366,0.410664,0.1552
min,2.0,0.0,0.0,0.0,0.0,0.0
25%,36.0,30.0,99.0,99.0,1.0,0.0
50%,40.0,30.0,149.0,149.0,1.0,0.0
75%,41.0,30.0,149.0,149.0,1.0,0.0
max,41.0,450.0,2000.0,2000.0,1.0,1.0


In [25]:
transdf.dtypes

msno                              object
payment_method_id                  int64
payment_plan_days                  int64
plan_list_price                    int64
actual_amount_paid                 int64
is_auto_renew                      int64
transaction_date          datetime64[ns]
membership_expire_date    datetime64[ns]
is_cancel                          int64
dtype: object

In [28]:
transdf.is_cancel.value_counts()

0    975303
1     24697
Name: is_cancel, dtype: int64

In [30]:
transdf.is_auto_renew.value_counts()

1    785229
0    214771
Name: is_auto_renew, dtype: int64

In [92]:
# user table contains details of a user's listening behaviour. Num denotes the % of song completion

userdf = pd.read_csv(r"C:\Users\marsh\Desktop\Springboard\Capstone2\user_logs_v2.csv")
userdf

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,u9E91QDTvHLq6NXjEaWv8u4QIqhrHk72kE+w31Gnhdg=,20170331,8,4,0,1,21,18,6309.273
1,nTeWW/eOZA/UHKdD5L7DEqKKFTjaAj3ALLPoAWsU8n0=,20170330,2,2,1,0,9,11,2390.699
2,2UqkWXwZbIjs03dHLU9KHJNNEvEkZVzm69f3jCS+uLI=,20170331,52,3,5,3,84,110,23203.337
3,ycwLc+m2O0a85jSLALtr941AaZt9ai8Qwlg9n0Nql5U=,20170331,176,4,2,2,19,191,7100.454
4,EGcbTofOSOkMmQyN1NMLxHEXJ1yV3t/JdhGwQ9wXjnI=,20170331,2,1,0,1,112,93,28401.558
...,...,...,...,...,...,...,...,...,...
18396357,FGpiy2mB+vXLKziYRcY/xJcJEFJfRDfUqlU+p760f7E=,20170314,0,0,0,0,1,1,248.058
18396358,iZRjKNMrw5ffEbfXODLhV/0tJLPbOH3am1WYDgqBf8Q=,20170306,0,0,0,0,1,1,311.000
18396359,yztw4Y0EggG0w2wPkbMZx7ke7saSx7dLSfMheHZG/DQ=,20170331,0,0,0,0,17,1,3973.189
18396360,swCHwkNx30/aENjq30qqaLlm7bUUytbMXdz1bH7g0Jk=,20170307,0,0,0,1,0,1,179.278


In [132]:
userdfdup = userdf[userdf.msno.duplicated()]
userdfdup.msno.value_counts()

+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=    30
IsW9Z97sfb46JJ8yA6BrpxN0tphytU6PlWs2UYhsKMI=    30
FeQoey6MjhjCmqS4nL3nROl9EEj4IOOjzaLLIdnRd6M=    30
MHIQR0Nj3FQR5HhlfhzNiWmPPjtS2wSNPhQM6UylL1Q=    30
nJ7TOQEv5WZA5KHDckVtxlQbXAsJYT9s3BNexbauTTY=    30
                                                ..
4iMd0o7pKhkXKUEZq3eW4k9ObbJVua6nn+hRBm8xZY0=     1
QXwqQbsV7tF02kmQzE7OeuP2kHDkOr3zd1PcPx+4dsg=     1
UK9oQnga5ZeIMDF+mAOuqgzAHZ2VoDwVu/IXs8VWvRY=     1
lhFB1CW8cihD8CPxaYYvDp2bGWgIFedOWgreAvCycAM=     1
swCHwkNx30/aENjq30qqaLlm7bUUytbMXdz1bH7g0Jk=     1
Name: msno, Length: 1016672, dtype: int64

In [134]:
userdf[userdf['msno'] == '+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=']

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
915096,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170321,15,7,5,11,10,39,6686.607
1354436,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170324,149,40,22,17,33,164,19057.543
2011891,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170305,7,5,4,2,17,28,5564.371
2632211,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170319,13,4,2,10,34,35,11433.651
3783597,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170327,13,8,8,8,108,47,29461.527
4298502,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170323,43,5,3,10,17,65,7446.88
5318757,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170311,31,10,8,5,16,57,8113.423
5823027,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170330,15,9,4,5,63,34,14689.203
6266341,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170312,6,3,5,5,18,31,7076.2
6388245,+PCyNW+cioNHafPsAKXRfBnOZnvSGrit4DfODkV8crw=,20170302,25,2,2,5,27,50,8516.79


In [93]:
# members table includes a user's personal and registration information

memdf = pd.read_csv(r"C:\Users\marsh\Desktop\Springboard\Capstone2\members_v3.csv")

In [88]:
memdf['registration_init_time'] =  pd.to_datetime(memdf['registration_init_time'].astype(str), format='%Y%m%d')
memdf = memdf.rename(columns = {'msno' : 'userid', 'bd': 'age'})
memdf.describe()

Unnamed: 0,userid,city,age,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,2011-09-11
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,2011-09-14
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,2011-09-15
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,2011-09-15
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,2011-09-15
...,...,...,...,...,...,...
999995,pyH4hpnklAcIsoQyRsWQB2DVrkTdJG4Guo6UIrafkiI=,8,23,female,4,2016-01-18
999996,xPvlcQcuSoUdKtHonU3RQxyUtR/k9WEcYioh1WS/xoc=,1,0,,7,2016-01-18
999997,9Gp8Fn5nbSCKmZJ6OEOisXxlKYwBYr6YFjSvxZxWBUw=,1,0,,4,2016-01-18
999998,2yVbg0K4+hfNImaKz+//sgZMO7IUo6cfe/U1VUPhdAg=,1,0,,4,2016-01-18


In [74]:
# a lot of null values for gender
memdf.isnull().sum()

userid                         0
city                           0
age                            0
gender                    607944
registered_via                 0
registration_init_time         0
dtype: int64

In [76]:
# outliers for age (min and max values are way off)

memdf[memdf.age < 0]

Unnamed: 0,userid,city,age,gender,registered_via,registration_init_time
4268,eY+UDYaAlACNn21sfSJ2qWdwqkRkOwjUvrzuhFdQy74=,12,-39,female,7,2011-04-23
61151,vNkNJQDBw9InczAp8FYSMjNEOJ0U9oSOXlaXoYOFm3Q=,1,-518,male,4,2016-12-16
62806,T3AH9ITIJQR8ww0WoahoWqb3LYTtt3bs+MylUHTPG5s=,22,-51,,9,2015-03-20
135945,GPlFUWdUlP8FsTkHX/vP0DPC/gVxeUR0Ud1D+m7Ohpo=,1,-506,male,3,2014-06-14
147468,tirJpLJooXkMQEPHHm98m+uk89BX//aMQRY7yxU3ur8=,1,-510,female,9,2014-11-05
147744,bSit4QBkFqZn5jXeFBTqUK7+NmEHTf9giy26K5o7c3k=,13,-36,,9,2014-11-23
176270,HQ+YhQ0vouICUSrmnc8OeSetEv5s2YDzkZ7N+Q4V/hw=,5,-43,,9,2012-04-20
193759,nOpjmG5Mqizm7z6tApDKi29/X+My10kG/JnssGCq6hc=,13,-958,female,9,2014-04-26
254050,8lU9EDGQYba5eZo+vxG38TGEtDSDcJW78QN2JStDgEE=,13,-48,,9,2015-02-17
279949,fspqYjHPLxu+wGdC0x8wrS01W63DcjtlgwdrvfxJpbY=,15,-43,,9,2015-04-05


In [135]:
# join traindf and memdf, resulting rows = traindf total rows, so no rows were dropped during the join.

df1 = traindf.merge(memdf, on = 'msno', how = 'inner')

In [136]:
df1

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1,5,28,male,3,20131223
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1,13,20,male,3,20131223
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1,13,18,male,3,20131227
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1,1,0,,7,20140109
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1,13,35,female,7,20140125
...,...,...,...,...,...,...,...
860962,/QlhSIWEZelYnwttYJSQL50EJJ2yRV+ThQIVQLvctp8=,0,8,25,female,7,20110830
860963,OHnZbu+EVaP+vN7Z+OfT5OMcp90MWFZonmM0o3pb8FY=,0,13,28,male,7,20110831
860964,S92bDK//uI6hk3u1vuApro0qJBQOToBozZ7lL1yTC+w=,0,1,0,,7,20110903
860965,eUa3xo16vpAjr43Cjlb6Kjf1NTILYyJIkBayJQdXWnw=,0,1,0,,7,20110905


In [115]:
df1.msno.duplicated().any()

False

In [137]:
# join df1 with memdf. There are more rows because there are duplciated user ids in userdf 

df2 = df1.merge(transdf, on = 'msno', how = 'inner')

In [129]:
df2[df2['msno'] == '72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=']

Unnamed: 0,msno,is_churn,city,bd,gender,registered_via,registration_init_time,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
686065,72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=,1,4.0,42.0,male,9.0,20070417.0,38,7,0,0,0,20160731,20201125,0
686066,72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=,1,4.0,42.0,male,9.0,20070417.0,38,7,0,0,0,20160720,20180103,0
686067,72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=,1,4.0,42.0,male,9.0,20070417.0,38,7,0,0,0,20160721,20191030,0
686068,72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=,1,4.0,42.0,male,9.0,20070417.0,38,7,0,0,0,20160720,20170816,0
686069,72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=,1,4.0,42.0,male,9.0,20070417.0,38,7,0,0,0,20160720,20171213,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
686268,72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=,1,4.0,42.0,male,9.0,20070417.0,38,7,0,0,0,20160731,20210317,0
686269,72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=,1,4.0,42.0,male,9.0,20070417.0,38,7,0,0,0,20160720,20170913,0
686270,72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=,1,4.0,42.0,male,9.0,20070417.0,38,7,0,0,0,20160721,20190522,0
686271,72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=,1,4.0,42.0,male,9.0,20070417.0,38,7,0,0,0,20160721,20190508,0


In [125]:
df2duplicate = df2[df2.msno.duplicated()]
df2duplicate.msno.value_counts()

72gJqt1O31E/WoxAEYFn9LHNI6mAZFGera5Q6gvsFkA=    207
5ty4nZkq54z93wQtBN7RHVYj8rNghBDCVBH+3xmxf0I=    171
OGKDrZQDB3yewZhoSd5qqvmG5A1GcNTYMexO95NlH+g=    147
WHsCtkOVsauvqBL0ULuG38887y7aU8GXdCmJMjw6hjQ=    144
SNlFRAsmUqnXKPofSXA8WYUc5DtmLcUMy4pXSJ3Ohz0=    130
                                               ... 
+2xg76nc+yZl09yOjsfBl3UMUKO8ZCBF+e+rrO5SEW8=      1
49g/mQkTO8oyoQLidlPZ/oiBdCzW2VwAC4KE0i0VxJI=      1
O8vrhAt96I/UQupDLzEGRZfZrYiPaSMG0McyrdDH1OI=      1
ydM9PfwTeOyX+AHB+kK52o7kM0pKkvL1JF0paWUoR60=      1
iZE41tbAQ65rJq60olkJT4BJzuUAYgQdfbEemXe/TTk=      1
Name: msno, Length: 115989, dtype: int64

True