# Key points before analyzing dataset:

## https://www.kaggle.com/competitions/kkbox-churn-prediction-challenge/data

## 1.  The criteria of "churn" is no new valid service subscription within 30 days after the current membership expires.

## 2. All 'is_churn' labels provided by the competition host on Kaggle are based on users with a membership expiration date that fall in March 2017. 

## 3. Host purposely removed all transactions that occurred after March 2017, meaning only expiration dates up until Feburary are relevant (acting like we received this data at the end of March)

In [190]:
import csv
import pandas as pd
import os
import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline

In [191]:
traindf = pd.read_csv('train_v2.csv')
traindf

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1
...,...,...
970955,OHnZbu+EVaP+vN7Z+OfT5OMcp90MWFZonmM0o3pb8FY=,0
970956,S92bDK//uI6hk3u1vuApro0qJBQOToBozZ7lL1yTC+w=,0
970957,eUa3xo16vpAjr43Cjlb6Kjf1NTILYyJIkBayJQdXWnw=,0
970958,iZE41tbAQ65rJq60olkJT4BJzuUAYgQdfbEemXe/TTk=,0


In [192]:
transdf = pd.read_csv('transactions_v2.csv', parse_dates = ['transaction_date','membership_expire_date'])
transdf.shape

(1431009, 9)

In [193]:
df = traindf.merge(transdf, on = 'msno', how = 'left')
df = df.sort_values(['msno','transaction_date'])
df.dropna(inplace= True)
df.isna().sum()
df.dtypes

msno                              object
is_churn                           int64
payment_method_id                float64
payment_plan_days                float64
plan_list_price                  float64
actual_amount_paid               float64
is_auto_renew                    float64
transaction_date          datetime64[ns]
membership_expire_date    datetime64[ns]
is_cancel                        float64
dtype: object

In [194]:
dup = df[df.msno.duplicated()].reset_index()
dup = df[df.msno.isin(dup.msno)]
dup.dtypes


msno                              object
is_churn                           int64
payment_method_id                float64
payment_plan_days                float64
plan_list_price                  float64
actual_amount_paid               float64
is_auto_renew                    float64
transaction_date          datetime64[ns]
membership_expire_date    datetime64[ns]
is_cancel                        float64
dtype: object

In [195]:
dup['prev_user'] = dup.msno.shift(1)
dup.iloc[0,10] = '+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw='

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup['prev_user'] = dup.msno.shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [196]:
dup['avg_time_between_transactions'] = (dup.transaction_date -
                              dup.transaction_date.shift(1)).where(dup.prev_user == dup.msno, 0)
dup.iloc[0,11] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup['avg_time_between_transactions'] = (dup.transaction_date -


In [197]:
dup['avg_membership_duration'] = dup.membership_expire_date - dup.transaction_date
dup

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dup['avg_membership_duration'] = dup.membership_expire_date - dup.transaction_date


Unnamed: 0,msno,is_churn,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel,prev_user,avg_time_between_transactions,avg_membership_duration
969814,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,39.0,30.0,149.0,149.0,1.0,2017-02-28,2017-04-19,0.0,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,50 days
969813,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,39.0,30.0,149.0,149.0,1.0,2017-03-31,2017-05-19,0.0,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,31 days 00:00:00,49 days
250009,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,0,39.0,30.0,149.0,149.0,1.0,2017-02-28,2017-04-23,0.0,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,54 days
250010,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,0,39.0,30.0,149.0,149.0,1.0,2017-03-31,2017-05-23,0.0,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,31 days 00:00:00,53 days
915216,++/ZHqwUNa7U21Qz+zqteiXlZapxey86l6eEorrak/g=,0,14.0,30.0,149.0,149.0,1.0,2017-02-28,2017-04-04,0.0,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,0,35 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...
952456,zzwsW1MnZalh6V5BwYE+V6hFWyKJYOf+rFd+SWVXxp8=,0,41.0,30.0,99.0,99.0,1.0,2017-03-08,2017-05-06,0.0,zzwsW1MnZalh6V5BwYE+V6hFWyKJYOf+rFd+SWVXxp8=,28 days 00:00:00,59 days
785031,zzxPS8+wCuE4HE85EDp4WBHgS5l548vfwXJbQq1HZ9I=,0,39.0,30.0,149.0,149.0,1.0,2017-02-28,2017-04-10,0.0,zzwsW1MnZalh6V5BwYE+V6hFWyKJYOf+rFd+SWVXxp8=,0,41 days
785030,zzxPS8+wCuE4HE85EDp4WBHgS5l548vfwXJbQq1HZ9I=,0,39.0,30.0,149.0,149.0,1.0,2017-03-31,2017-05-10,0.0,zzxPS8+wCuE4HE85EDp4WBHgS5l548vfwXJbQq1HZ9I=,31 days 00:00:00,40 days
416202,zzz1Dc3P9s53HAowRTrm3fNsWju5yeN4YBfNDq7Z99Q=,0,39.0,30.0,149.0,149.0,1.0,2017-02-28,2017-04-24,0.0,zzxPS8+wCuE4HE85EDp4WBHgS5l548vfwXJbQq1HZ9I=,0,55 days


In [198]:
dup.avg_membership_duration = pd.to_timedelta(dup.avg_membership_duration, unit='d')
dup.avg_membership_duration = dup.avg_membership_duration.dt.days
dup.loc[dup.avg_membership_duration< 0] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [199]:
dup.avg_time_between_transactions = pd.to_timedelta(dup.avg_time_between_transactions, unit='d')
dup.avg_time_between_transactions = dup.avg_time_between_transactions.dt.days
dup.loc[dup.avg_time_between_transactions< 0] = 0

In [200]:
dup = dup.groupby('msno', as_index = False).agg({'is_churn': lambda x:x.value_counts().index[0],'payment_method_id': lambda x:x.value_counts().index[0], 'payment_plan_days': 'max', 'plan_list_price': 'max',
                                       'actual_amount_paid': 'max', 'is_auto_renew': lambda x:x.value_counts().index[0], 'transaction_date': 'max', 'membership_expire_date': 'max',
                                       'is_cancel': lambda x:x.value_counts().index[0], 'avg_membership_duration': 'mean', 'avg_time_between_transactions': 'mean'})
dup = dup.rename(columns = {'transaction_date': 'latest_transaction', 'membership_expire_date': 'latest_membership_expiration'})

In [201]:
dup.drop(index=dup.index[0], 
        axis=0, 
        inplace=True)

In [202]:
dup.is_churn.value_counts()

0    93069
1    22919
Name: is_churn, dtype: int64

In [203]:
nondup = df[~df.msno.isin(dup.msno)]
nondup.sort_values('transaction_date')
nondup['avg_membership_duration'] = nondup.membership_expire_date - nondup.transaction_date
nondup['avg_time_between_transactions'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nondup['avg_membership_duration'] = nondup.membership_expire_date - nondup.transaction_date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nondup['avg_time_between_transactions'] = 0


In [220]:
nondup.avg_membership_duration = pd.to_timedelta(nondup.avg_membership_duration, unit='d')
nondup.avg_membership_duration = nondup.avg_membership_duration.dt.days
nondup.loc[nondup.avg_membership_duration < 0, 'avg_membership_duration'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [221]:
nondup = nondup.groupby('msno', as_index = False).agg({'is_churn': lambda x:x.value_counts().index[0],'payment_method_id': lambda x:x.value_counts().index[0], 'payment_plan_days': 'max', 'plan_list_price': 'max',
                                       'actual_amount_paid': 'max', 'is_auto_renew': lambda x:x.value_counts().index[0], 'transaction_date': 'max', 'membership_expire_date': 'max',
                                       'is_cancel': lambda x:x.value_counts().index[0], 'avg_membership_duration': 'mean', 'avg_time_between_transactions': 'mean'})
nondup = nondup.rename(columns = {'transaction_date': 'latest_transaction', 'membership_expire_date': 'latest_membership_expiration'})


In [224]:
nondup.msno.value_counts()

Unnamed: 0,msno,is_churn,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,latest_transaction,latest_membership_expiration,is_cancel,avg_membership_duration,avg_time_between_transactions
0,+++hVY1rZox/33YtvDgmKA2Frg/2qhkz12B9ylCvh8o=,0,41.0,30.0,99.0,99.0,1.0,2017-03-15,2017-04-15,0.0,31.0,0.0
1,+++snpr7pmobhLKUgSHTv/mpkqgBT0tQJ0zQj6qKrqc=,0,41.0,30.0,149.0,149.0,1.0,2017-03-26,2017-04-26,0.0,31.0,0.0
2,++/9R3sX37CjxbY/AaGvbwr3QkwElKBCtSvVzhCBDOk=,0,41.0,30.0,149.0,149.0,1.0,2017-03-15,2017-04-15,0.0,31.0,0.0
3,++0/NopttBsaAn6qHZA2AWWrDg7Me7UOMs1vsyo4tSI=,0,41.0,30.0,149.0,149.0,1.0,2017-03-20,2017-04-20,0.0,31.0,0.0
4,++0BJXY8tpirgIhJR14LDM1pnaRosjD1mdO1mIKxlJA=,0,38.0,30.0,149.0,149.0,0.0,2017-03-27,2017-04-26,0.0,30.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
817585,zzx4hKiyR9XFEGAr7SAjcCPbKJCZ+IqegWL7dPjPwZk=,0,41.0,30.0,99.0,99.0,1.0,2017-03-18,2017-04-19,0.0,32.0,0.0
817586,zzxZeMFx2fjfKZigMnJa2w0EmloDbm8+8nTf/o/00GY=,0,41.0,30.0,99.0,99.0,1.0,2017-03-26,2017-04-26,0.0,31.0,0.0
817587,zzxi7n5xoTYo9Q3VTygLWvl/rBDcexwaeAry0yK7Q0E=,0,41.0,30.0,99.0,99.0,1.0,2017-03-18,2017-04-18,0.0,31.0,0.0
817588,zzy7iqSpfcRq7R4hmKKuhI+CJRs79a6pteqEggpiNO0=,1,36.0,30.0,180.0,180.0,0.0,2017-02-28,2017-04-01,0.0,32.0,0.0


In [225]:
len(dup) + len(nondup)

933578

In [248]:
df = pd.concat([dup,nondup], ignore_index = True)
df

Unnamed: 0,msno,is_churn,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,latest_transaction,latest_membership_expiration,is_cancel,avg_membership_duration,avg_time_between_transactions
0,+++l/EXNMLTijfLBa8p2TUVVVp2aFGSuUI/h7mLmthw=,0,39.0,30.0,149.0,149.0,1.0,2017-03-31 00:00:00,2017-05-19 00:00:00,0.0,49.5,15.5
1,++/UDNo9DLrxT8QVGiDi1OnWfczAdEwThaVyD0fXO50=,0,39.0,30.0,149.0,149.0,1.0,2017-03-31 00:00:00,2017-05-23 00:00:00,0.0,53.5,15.5
2,++/ZHqwUNa7U21Qz+zqteiXlZapxey86l6eEorrak/g=,0,14.0,30.0,149.0,149.0,1.0,2017-03-31 00:00:00,2017-05-04 00:00:00,0.0,34.5,15.5
3,++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=,0,39.0,30.0,149.0,149.0,1.0,2017-03-31 00:00:00,2017-05-21 00:00:00,0.0,51.5,15.5
4,++6P09mCSJSh+Ft2pvZ0FWTrtcI3v1A7h3/coh8dBKw=,0,39.0,30.0,149.0,149.0,1.0,2017-03-31 00:00:00,2017-05-13 00:00:00,0.0,43.5,15.5
...,...,...,...,...,...,...,...,...,...,...,...,...
933573,zzx4hKiyR9XFEGAr7SAjcCPbKJCZ+IqegWL7dPjPwZk=,0,41.0,30.0,99.0,99.0,1.0,2017-03-18 00:00:00,2017-04-19 00:00:00,0.0,32.0,0.0
933574,zzxZeMFx2fjfKZigMnJa2w0EmloDbm8+8nTf/o/00GY=,0,41.0,30.0,99.0,99.0,1.0,2017-03-26 00:00:00,2017-04-26 00:00:00,0.0,31.0,0.0
933575,zzxi7n5xoTYo9Q3VTygLWvl/rBDcexwaeAry0yK7Q0E=,0,41.0,30.0,99.0,99.0,1.0,2017-03-18 00:00:00,2017-04-18 00:00:00,0.0,31.0,0.0
933576,zzy7iqSpfcRq7R4hmKKuhI+CJRs79a6pteqEggpiNO0=,1,36.0,30.0,180.0,180.0,0.0,2017-02-28 00:00:00,2017-04-01 00:00:00,0.0,32.0,0.0


In [227]:
df.msno.duplicated().any()

False

In [229]:
df.latest_transaction.value_counts()

2017-03-31    181007
2017-03-16     30218
2017-03-05     29492
2017-03-06     29216
2017-03-20     28939
               ...  
2016-09-25         1
2015-10-05         1
2016-03-23         1
2016-08-19         1
2016-01-18         1
Name: latest_transaction, Length: 404, dtype: int64

In [230]:
df.is_cancel.value_counts()

0.0    920104
1.0     13474
Name: is_cancel, dtype: int64

In [250]:
userdf = pd.read_csv(r"C:\Users\marsh\Desktop\Springboard\Capstone2\user_logs_v2.csv", parse_dates = ['date'])
userdf

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,u9E91QDTvHLq6NXjEaWv8u4QIqhrHk72kE+w31Gnhdg=,2017-03-31,8,4,0,1,21,18,6309.273
1,nTeWW/eOZA/UHKdD5L7DEqKKFTjaAj3ALLPoAWsU8n0=,2017-03-30,2,2,1,0,9,11,2390.699
2,2UqkWXwZbIjs03dHLU9KHJNNEvEkZVzm69f3jCS+uLI=,2017-03-31,52,3,5,3,84,110,23203.337
3,ycwLc+m2O0a85jSLALtr941AaZt9ai8Qwlg9n0Nql5U=,2017-03-31,176,4,2,2,19,191,7100.454
4,EGcbTofOSOkMmQyN1NMLxHEXJ1yV3t/JdhGwQ9wXjnI=,2017-03-31,2,1,0,1,112,93,28401.558
...,...,...,...,...,...,...,...,...,...
18396357,FGpiy2mB+vXLKziYRcY/xJcJEFJfRDfUqlU+p760f7E=,2017-03-14,0,0,0,0,1,1,248.058
18396358,iZRjKNMrw5ffEbfXODLhV/0tJLPbOH3am1WYDgqBf8Q=,2017-03-06,0,0,0,0,1,1,311.000
18396359,yztw4Y0EggG0w2wPkbMZx7ke7saSx7dLSfMheHZG/DQ=,2017-03-31,0,0,0,0,17,1,3973.189
18396360,swCHwkNx30/aENjq30qqaLlm7bUUytbMXdz1bH7g0Jk=,2017-03-07,0,0,0,1,0,1,179.278


In [251]:
userdf = userdf.groupby('msno', as_index = False).agg({'date': 'max', 'num_25': 'sum', 'num_50': 'sum', 'num_75': 'sum',
                                 'num_985': 'sum', 'num_100': 'sum', 'num_unq': 'max', 'total_secs': 'sum'})


In [252]:
userdf['percent_25'] = userdf['num_25']/(userdf['num_25']+userdf['num_50']+userdf['num_75']+userdf['num_985']+userdf['num_100'])
userdf['percent_50'] = userdf['num_50']/(userdf['num_25']+userdf['num_50']+userdf['num_75']+userdf['num_985']+userdf['num_100'])
userdf['percent_100'] = (userdf['num_985']+userdf['num_100'])/(userdf['num_25']+userdf['num_50']+userdf['num_75']+userdf['num_985']+userdf['num_100'])

In [253]:
userdf = userdf.drop(columns = ['num_25', 'num_50', 'num_75', 'num_985', 'num_100'])

In [267]:
df1 = df.merge(userdf, how = 'left')

In [268]:
df1.isnull().sum()

msno                                  0
is_churn                              0
payment_method_id                     0
payment_plan_days                     0
plan_list_price                       0
actual_amount_paid                    0
is_auto_renew                         0
latest_transaction                    0
latest_membership_expiration          0
is_cancel                             0
avg_membership_duration               0
avg_time_between_transactions         0
latest_listen_date               207842
num_unq                          207842
total_secs                       207842
percent_25                       207842
percent_50                       207842
percent_100                      207842
latest_listen_date               207842
date                             207842
dtype: int64

In [269]:
df1 = df1.dropna()

In [271]:
df1 = df1.rename(columns = {'date' : 'latest_listen_date'})

In [272]:
df1.isnull().sum()

msno                             0
is_churn                         0
payment_method_id                0
payment_plan_days                0
plan_list_price                  0
actual_amount_paid               0
is_auto_renew                    0
latest_transaction               0
latest_membership_expiration     0
is_cancel                        0
avg_membership_duration          0
avg_time_between_transactions    0
latest_listen_date               0
num_unq                          0
total_secs                       0
percent_25                       0
percent_50                       0
percent_100                      0
latest_listen_date               0
latest_listen_date               0
dtype: int64

In [277]:
df1.to_csv('Capstone updated 1219.csv')