In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition CSV files: the test and train rows, the per-user table,
# and the sample submission (reused at the end of the notebook as the template
# for the submission file).
test_dataset = pd.read_csv("../input/open-shopee-code-league-marketing-analytics/test.csv")
train_dataset = pd.read_csv("../input/open-shopee-code-league-marketing-analytics/train.csv")
user_dataset = pd.read_csv("../input/open-shopee-code-league-marketing-analytics/users.csv")
sample = pd.read_csv("../input/open-shopee-code-league-marketing-analytics/sample_submission_0_1.csv")

In [None]:
sample

In [None]:
# dtypes
train_dataset.dtypes

In [None]:
# Convert the three "last ... day" columns to integers in both the training and
# the test frame. Each column uses its own text placeholder for users who never
# performed the action; the placeholder is swapped for 0 before the cast.
# NOTE(review): mapping "never" to 0 makes those rows indistinguishable from a
# genuine value of 0 -- confirm this encoding is intended.
never_placeholders = {
    "last_open_day": "Never open",
    "last_login_day": "Never login",
    "last_checkout_day": "Never checkout",
}

for frame in (train_dataset, test_dataset):
    for column, placeholder in never_placeholders.items():
        frame[column] = frame[column].replace(placeholder, 0).astype(int)

In [None]:
# Attach the user attributes to the training rows (joined on user_id).
# An inner join is acceptable here: a training row without a user record is
# simply left out of training.
train_dataset_new = pd.merge(train_dataset, user_dataset, on='user_id', how='inner')

# Replace every missing value with 0
train_dataset_new = train_dataset_new.fillna(0)

# Attach the user attributes to the test rows (joined on user_id).
# A left join keeps every test row, in its original order, even when no user
# record matches. The predictions are later written into `sample` by position,
# so silently dropping test rows (as an inner join can) would break that
# alignment. Rows without a user record get NaN attributes, filled with 0 below.
test_dataset = pd.merge(test_dataset, user_dataset, on='user_id', how='left')

# Replace every missing value with 0
test_dataset = test_dataset.fillna(0)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
# Use PCA to analyse which features contribute most to the variance in the data
def do_pca(n_components, data):
    '''
    Standardize the data and project it onto its principal components.

    INPUT: n_components - int - the number of principal components to keep
           data - the data to transform (passed to StandardScaler as-is)

    OUTPUT: pca - the PCA object after it has been fitted on the scaled data
            X_pca - the scaled data expressed in the principal-component basis
    '''
    # PCA is sensitive to feature scale, so standardize every column first.
    scaler = StandardScaler()
    standardized = scaler.fit_transform(data)

    reducer = PCA(n_components)
    projected = reducer.fit_transform(standardized)
    return reducer, projected

In [None]:
def pca_results(full_dataset, pca):
    '''
    Summarize a fitted PCA as a table and a bar chart.

    INPUT: full_dataset - the data the PCA was fitted on; its keys() supply
                          the feature names used as column labels
           pca - a fitted PCA object (components_ and
                 explained_variance_ratio_ are read)

    OUTPUT: DataFrame with one row per dimension, holding the explained
            variance ratio followed by the feature weights (rounded to 4 dp)
    '''
    n_dims = len(pca.components_)

    # Row labels: "Dimension 1", "Dimension 2", ...
    dimensions = ['Dimension {}'.format(k) for k in range(1, n_dims + 1)]

    # Feature weights: one row per component, one column per feature
    components = pd.DataFrame(
        np.round(pca.components_, 4),
        columns=full_dataset.keys(),
        index=dimensions,
    )

    # Explained variance as a single-column frame sharing the same row labels
    variance_ratios = pd.DataFrame(
        np.round(pca.explained_variance_ratio_.reshape(n_dims, 1), 4),
        columns=['Explained Variance'],
        index=dimensions,
    )

    # Grouped bar chart of the feature weights for each dimension
    fig, ax = plt.subplots(figsize=(14, 8))
    components.plot.bar(ax=ax)
    ax.set_ylabel("Feature Weights")
    ax.set_xticklabels(dimensions, rotation=0)

    # Write the explained variance just above the plot area, over each group
    for position, ratio in enumerate(pca.explained_variance_ratio_):
        label_y = ax.get_ylim()[1] + 0.05
        ax.text(position - 0.40, label_y, "Explained Variance\n          %.4f" % (ratio))

    return pd.concat([variance_ratios, components], axis=1)

In [None]:
def scree_plot(pca):
    '''
    Creates a scree plot associated with the principal components:
    one bar per component for its explained variance ratio, a line for the
    cumulative ratio, and a percentage label on every bar.

    INPUT: pca - a fitted scikit-learn PCA instance
                 (only explained_variance_ratio_ is read)

    OUTPUT:
            None (the figure is shown with plt.show())
    '''
    vals = pca.explained_variance_ratio_
    num_components = len(vals)
    ind = np.arange(num_components)
    cumvals = np.cumsum(vals)

    plt.figure(figsize=(15, 20))
    ax = plt.subplot(111)
    ax.bar(ind, vals)
    ax.plot(ind, cumvals)
    for i in range(num_components):
        # Format the percentage numerically. Slicing str(value)[:4] breaks for
        # small ratios that print in scientific notation (e.g. "1e-05" -> "1e-0")
        # and gives an inconsistent number of decimals from bar to bar.
        label = "%.2f%%" % (vals[i] * 100)
        ax.annotate(label, (ind[i]+0.2, vals[i]), va="bottom", ha="center", fontsize=12)

    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)

    ax.set_xlabel("Principal Component")
    # NOTE(review): the axis values are the plotted ratios, not percentages,
    # although the label says "%" -- confirm which one is intended.
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
    plt.show()

In [None]:
# pca, X_pca = do_pca(20, train_dataset_new)
# X_pca.shape

In [None]:
# scree_plot(pca)

In [None]:
# Parse grass_date into datetimes and derive calendar features from it,
# applying the same steps to the training frame and the test frame.
# NOTE(review): errors='coerce' turns any value that does not parse into NaT,
# which then yields NaN in the derived columns -- confirm the format string
# matches the raw data.
for frame in (train_dataset_new, test_dataset):
    frame['grass_date'] = pd.to_datetime(
        frame['grass_date'],
        format='%Y-%m-%d %H:%M:%S',
        errors='coerce',
    )

    parsed = frame['grass_date'].dt
    frame['day_of_year'] = parsed.dayofyear
    frame['day_of_month'] = parsed.day
    frame['day_of_week'] = parsed.dayofweek

In [None]:
# Eliminate the Outliers
def eliminate_outliers(dataset, name_columns, n_std=2):
    '''
    Drop the rows whose value in one column lies too far from the column mean.

    A row is kept when its value is strictly between
    mean - n_std * std and mean + n_std * std, where std is the population
    standard deviation (ddof=0) of the column.

    INPUT: dataset - DataFrame to filter (it is not modified)
           name_columns - name of the column to test for outliers
           n_std - number of standard deviations that define the accepted
                   band around the mean (default 2)

    OUTPUT: a new DataFrame holding only the rows inside the band; when the
            column has no spread (std == 0) every row is returned
    '''
    values = dataset[name_columns]
    mean = np.mean(values)
    std = np.std(values, ddof=0)

    # With zero spread the strict band below is empty and would discard every
    # row, although a constant column contains no outliers at all.
    if std == 0:
        return dataset.copy()

    max_range = mean + (n_std * std)
    min_range = mean - (n_std * std)

    # Build one mask on the unfiltered column and apply it once. Chaining two
    # boolean filters would apply a mask indexed by the full frame to an
    # already filtered frame, which pandas must re-align (and warns about).
    inside_band = (values < max_range) & (values > min_range)
    return dataset.loc[inside_band]

# Remove outliers from the training data, one recency column at a time.
# Each pass filters the result of the previous one, so the mean and standard
# deviation are recomputed on the rows that are still left.
for column in ("last_open_day", "last_login_day", "last_checkout_day"):
    train_dataset_new = eliminate_outliers(train_dataset_new, column)

In [None]:
# Feature columns used as model input. The target column (open_flag) is
# selected separately and is deliberately not part of this list.
variables = [
    'country_code',
    # Calendar features derived from grass_date
    'day_of_year', 
    'day_of_month', 
    'day_of_week',
    # Recency columns, converted to integers earlier in the notebook
    'last_open_day',
    'last_login_day',
    'last_checkout_day',
    # Activity counts; judging by the names, over trailing 10/30/60-day windows
    'open_count_last_10_days',
    'open_count_last_30_days',
    'open_count_last_60_days',
    'login_count_last_10_days',
    'login_count_last_30_days',
    'login_count_last_60_days',
    'checkout_count_last_10_days',
    'checkout_count_last_30_days',
    'checkout_count_last_60_days',
    # Presumably the per-user attributes joined in from users.csv -- TODO confirm
    'attr_1',
    'attr_2',
    'attr_3',
    'age'
]

In [None]:
from sklearn.model_selection import train_test_split
# Separate the features from the target, then hold out 20% of the rows for
# validation. The fixed random_state makes the split repeatable.
x, y = train_dataset_new[variables], train_dataset_new['open_flag']
x_train, x_validation, y_train, y_validation = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# # Deploy SVM 
# from sklearn.svm import SVC
# SVC = SVC(kernel = 'rbf', C = 1000, random_state = 0)
# SVC.fit(x_train, y_train)

In [None]:
# # Evaluate the model
# from sklearn.metrics import matthews_corrcoef
# y_prediction = SVC.predict(x_validation)

# # Printing Results
# print(accuracy_score(y_validation, y_prediction))
# print(matthews_corrcoef(y_validation, y_prediction))

In [None]:
# Grid search over a decision tree, a random forest and a logistic regression,
# each scored with the Matthews correlation coefficient under 5-fold CV.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

# Grid shared by the decision tree and the random forest
hyperparameters = {
    "max_depth": [12, 9, 6, 3],
    "min_samples_leaf": [10, 50, 100,  200, 500],
    "min_samples_split": [10, 20, 30,  40, 50],
    "criterion": ["gini", "entropy"]
}

# `dual` has to be a real boolean. The strings "True" and "False" are both
# truthy (and are rejected outright by scikit-learn versions that validate
# parameters), so the search never compared the two formulations.
hyper_logistic = {
    "dual": [True, False]
}

# Define classifier
classifier = DecisionTreeClassifier(random_state = 0 )
class_rand = RandomForestClassifier(random_state = 0)
# The dual formulation is only implemented for the liblinear solver (with the
# default l2 penalty), so that solver is required for both grid values to fit.
LogReg_clf = LogisticRegression(random_state = 0, solver = "liblinear")

# Define the model selection method
mcc_scorer = make_scorer(matthews_corrcoef)
model_selection = GridSearchCV(classifier, hyperparameters, cv=5, scoring=mcc_scorer)
model_selection_rand = GridSearchCV(class_rand, hyperparameters, cv=5, scoring=mcc_scorer)
model_selection_log = GridSearchCV(LogReg_clf, hyper_logistic, cv=5, scoring=mcc_scorer)

# Fitting
model_selection.fit(x_train,y_train)
model_selection_rand.fit(x_train, y_train)
model_selection_log.fit(x_train, y_train)

# Printing Best Params and Best Score (tree, forest, logistic regression)
for search in (model_selection, model_selection_rand, model_selection_log):
    print(search.best_params_)
    print(search.best_score_)

In [None]:
# Random forest evaluated on the validation split.
# random_state is fixed, like the seeded estimators used in the grid search,
# so that the scores printed below are the same on every run.
clf = RandomForestClassifier(
    max_depth=12,
    criterion='entropy',
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=0,
)

# Fitting the Model
clf.fit(x_train, y_train)

y_prediction = clf.predict(x_validation)
# Printing Results: accuracy, then Matthews correlation coefficient
print(accuracy_score(y_validation, y_prediction))
print(matthews_corrcoef(y_validation, y_prediction))

In [None]:
# Single decision tree evaluated on the validation split
# (this cell fits a DecisionTreeClassifier, not a random forest)
from sklearn.metrics import matthews_corrcoef
from sklearn.ensemble import RandomForestClassifier
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=30,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=0,
)

# Fit on the training split, then predict the held-out rows
clf.fit(x_train, y_train)
y_prediction = clf.predict(x_validation)

# Accuracy first, Matthews correlation coefficient second
print(accuracy_score(y_validation, y_prediction))
print(matthews_corrcoef(y_validation, y_prediction))

# Convert into Test Score

In [None]:
# Refit the random forest on all remaining training rows (training and
# validation split together) and predict the test set.
# random_state is fixed so that re-running the notebook reproduces the same
# submission file.
clf = RandomForestClassifier(
    max_depth=12,
    criterion='entropy',
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=0,
)
clf.fit(x, y)

# The predictions are written into `sample` by position, so test_dataset must
# hold exactly one row per row of `sample`, in the same order.
sample["open_flag"] = clf.predict(test_dataset[variables])

# Export to a csv file
sample.loc[:, ['row_id', 'open_flag']].to_csv(
    'submission.csv', index=False, header=True
)