# Exploratory Data Analysis

In [38]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import os

os.chdir("..")


In [3]:
# Read in datasets
members_df = pd.read_csv("artifacts/members_v3.csv")
transactions_df = pd.read_csv("artifacts/transactions_v2.csv")
user_logs_df = pd.read_csv("artifacts/user_logs_v2.csv")
train_df = pd.read_csv("artifacts/train_v2.csv")

In [4]:
members_df.head()

Unnamed: 0,msno,city,bd,gender,registered_via,registration_init_time
0,Rb9UwLQTrxzBVwCB6+bCcSQWZ9JiNLC9dXtM1oEsZA8=,1,0,,11,20110911
1,+tJonkh+O1CA796Fm5X60UMOtB6POHAwPjbTRVl/EuU=,1,0,,7,20110914
2,cV358ssn7a0f7jZOwGNWS07wCKVqxyiImJUX6xcIwKw=,1,0,,11,20110915
3,9bzDeJP6sQodK73K5CBlJ6fgIQzPeLnRl0p5B77XP+g=,1,0,,11,20110915
4,WFLY3s7z4EZsieHCt63XrsdtfTEmJ+2PnnKLH5GY4Tk=,6,32,female,9,20110915


In [5]:
transactions_df.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,20170131,20170504,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,20150809,20190412,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,20170303,20170422,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,20170329,20170331,1
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,20170323,20170423,0


In [6]:
user_logs_df.head()

Unnamed: 0,msno,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,u9E91QDTvHLq6NXjEaWv8u4QIqhrHk72kE+w31Gnhdg=,20170331,8,4,0,1,21,18,6309.273
1,nTeWW/eOZA/UHKdD5L7DEqKKFTjaAj3ALLPoAWsU8n0=,20170330,2,2,1,0,9,11,2390.699
2,2UqkWXwZbIjs03dHLU9KHJNNEvEkZVzm69f3jCS+uLI=,20170331,52,3,5,3,84,110,23203.337
3,ycwLc+m2O0a85jSLALtr941AaZt9ai8Qwlg9n0Nql5U=,20170331,176,4,2,2,19,191,7100.454
4,EGcbTofOSOkMmQyN1NMLxHEXJ1yV3t/JdhGwQ9wXjnI=,20170331,2,1,0,1,112,93,28401.558


In [7]:
train_df.head()

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1


### Joining Datasets together

In [27]:
# Join datasets together
train_df_all = train_df
for dataset in [user_logs_df, members_df, transactions_df]:
    train_df_all = pd.merge(train_df_all, dataset, how="inner", on="msno" )

# Drop id column
train_df_all = train_df_all.drop("msno", axis=1)
train_df_all.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16305887 entries, 0 to 16305886
Data columns (total 22 columns):
 #   Column                  Non-Null Count     Dtype  
---  ------                  --------------     -----  
 0   is_churn                16305887 non-null  int64  
 1   date                    16305887 non-null  int64  
 2   num_25                  16305887 non-null  int64  
 3   num_50                  16305887 non-null  int64  
 4   num_75                  16305887 non-null  int64  
 5   num_985                 16305887 non-null  int64  
 6   num_100                 16305887 non-null  int64  
 7   num_unq                 16305887 non-null  int64  
 8   total_secs              16305887 non-null  float64
 9   city                    16305887 non-null  int64  
 10  bd                      16305887 non-null  int64  
 11  gender                  8762707 non-null   object 
 12  registered_via          16305887 non-null  int64  
 13  registration_init_time  16305887 non-nul

A lot of null values for gender. Will drop column.

### Memory Reduction

In [28]:
def change_datatype(df):
    int_cols = list(df.select_dtypes(include=['int']).columns)
    for col in int_cols:
        if ((np.max(df[col]) <= 127) and(np.min(df[col] >= -128))):
            df[col] = df[col].astype(np.int8)
        elif ((np.max(df[col]) <= 32767) and(np.min(df[col] >= -32768))):
            df[col] = df[col].astype(np.int16)
        elif ((np.max(df[col]) <= 2147483647) and(np.min(df[col] >= -2147483648))):
            df[col] = df[col].astype(np.int32)
        else:
            df[col] = df[col].astype(np.int64)

change_datatype(train_df_all)

def change_datatype_float(df):
    float_cols = list(df.select_dtypes(include=['float']).columns)
    for col in float_cols:
        df[col] = df[col].astype(np.float32)
        


mem = train_df_all.memory_usage(index=True).sum()
print("Previous mem")
print(mem/ 1024**2," MB")

change_datatype_float(train_df_all)

mem = train_df_all.memory_usage(index=True).sum()
print("New mem")
print(mem/ 1024**2," MB")

Previous mem
901.9294490814209  MB
New mem
839.7274265289307  MB


### Checking Proportion of churn

In [51]:
train_df_all["is_churn"].value_counts()


is_churn
0    14819540
1     1486347
Name: count, dtype: int64

Proportion of churn indicates dataset is unbalanced. Will require stratified splitting during train test splits.