In [303]:
import pandas as pd
import numpy as np

In [304]:
# Load the training set from the project-relative dataset/ folder.
df = pd.read_csv('dataset/train.csv')

In [305]:
# (rows, columns) of the freshly loaded training frame.
df.shape

(36992, 25)

In [306]:
# List the column names available in the training frame.
df.columns

Index(['customer_id', 'Name', 'age', 'gender', 'security_no',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'referral_id', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

In [307]:
def _joined_through_ref_cleaner(row):
    """Resolve a '?' in a single row's ``joined_through_referral`` field.

    A '?' becomes 'Yes' when the row's ``referral_id`` differs from the
    placeholder 'xxxxxxxx' and 'No' otherwise; any other value is kept.
    This is the row-level statement of the rule that
    ``joined_through_ref_cleaner`` applies to a whole frame.

    Parameters
    ----------
    row : mapping-like (e.g. pandas.Series)
        Must expose 'joined_through_referral' and 'referral_id'.

    Returns
    -------
    The same ``row`` object, updated in place.
    """
    if row['joined_through_referral'] == '?':
        row['joined_through_referral'] = 'Yes' if row["referral_id"] != 'xxxxxxxx' else 'No'
    return row

def joined_through_ref_cleaner(df):
    """Return a copy of ``df`` with '?' in ``joined_through_referral`` resolved.

    Rows whose ``joined_through_referral`` is '?' are set to 'Yes' when
    ``referral_id`` is anything other than the placeholder 'xxxxxxxx', and to
    'No' otherwise.  All other rows and columns are returned unchanged.

    The rule is applied with boolean masks rather than a row-wise
    ``DataFrame.apply``: that avoids one Python call per row and keeps every
    column's dtype exactly as it was.  The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'joined_through_referral' and 'referral_id'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    cleaned = df.copy()
    unknown = cleaned['joined_through_referral'] == '?'
    has_referral = cleaned['referral_id'] != 'xxxxxxxx'
    cleaned.loc[unknown & has_referral, 'joined_through_referral'] = 'Yes'
    cleaned.loc[unknown & ~has_referral, 'joined_through_referral'] = 'No'
    return cleaned
    
def churn_risk_score_negative(df):
    """Replace out-of-range churn scores with the truncated column mean.

    A score is out of range when it is below 0 or above 5.  Such scores are
    overwritten, in place, with ``int(mean of the churn_risk_score column)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'churn_risk_score' column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    scores = df["churn_risk_score"]
    out_of_range = (scores < 0) | (scores > 5)
    # Skip the mean entirely when nothing needs fixing; this also keeps an
    # empty frame from failing on int(NaN).
    if out_of_range.any():
        # NOTE(review): the mean is still taken over *all* scores, including
        # the out-of-range ones being replaced - confirm that is intended.
        replacement = int(scores.mean())
        # Restrict the assignment to the score column.  A bare
        # ``df.loc[mask] = value`` would overwrite every column of the
        # matching rows with the replacement value.
        df.loc[out_of_range, "churn_risk_score"] = replacement
    return df

def joining_date_converter(df, min_date=None):
    """Convert ``joining_date`` to whole days elapsed since a reference date.

    The column is parsed with ``pd.to_datetime`` and replaced, in place, by
    the number of days between each date and the reference date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'joining_date' column parseable by ``pd.to_datetime``.
    min_date : date-like, optional
        Reference date.  Defaults to the earliest date in ``df``.  Pass the
        training set's earliest date when converting another frame so both
        share the same origin.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    joined = pd.to_datetime(df['joining_date'])
    reference = joined.min() if min_date is None else pd.Timestamp(min_date)
    # ``.dt.days`` is used instead of ``.astype('timedelta64[D]')`` because
    # pandas >= 2.0 rejects casts to day resolution.
    df["joining_date"] = (joined - reference).dt.days
    return df

def last_visit_time_converter(df):
    """Convert ``last_visit_time`` to integer seconds.

    Each value is parsed as a duration with ``pd.to_timedelta`` (so a string
    such as '01:00:00' becomes 3600) and the column is replaced, in place,
    by the total number of seconds as ``int64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'last_visit_time' column parseable by
        ``pd.to_timedelta``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    # No ``unit`` argument: pandas raises when a unit is given together with
    # string input, and 'ns' is the default for numeric input anyway.
    elapsed = pd.to_timedelta(df['last_visit_time'])
    df['last_visit_time'] = elapsed.dt.total_seconds().astype('int64')
    return df

def transform_data(df):
    """Run the three column-level conversions on ``df`` and return the result.

    Steps, in order: resolve '?' in ``joined_through_referral``, convert
    ``joining_date``, then convert ``last_visit_time``.  Each step receives
    the frame returned by the previous one.
    """
    return (
        df.pipe(joined_through_ref_cleaner)
        .pipe(joining_date_converter)
        .pipe(last_visit_time_converter)
    )

In [308]:
# Apply the column conversions defined above to the training frame.
# NOTE(review): this rebinds `df`, so re-running the cell feeds already
# converted columns back into transform_data - reload the CSV first.
df = transform_data(df)
# df = churn_risk_score_negative(df)

In [311]:
def dropAllNA(df):
    """Impute missing and placeholder values in place and return ``df``.

    Despite the name, no rows are dropped.  The following columns are
    rewritten:

    * ``avg_frequency_login_days`` - 'Error' is treated as missing; missing
      values are filled with the mean of the column computed without the
      '-999' entries; the column ends up as float64.  The '-999' entries
      themselves are kept (as -999.0).
    * ``medium_of_operation`` - '?' is treated as missing, then filled with
      the most frequent value.
    * ``region_category``, ``preferred_offer_types`` - missing values are
      filled with the most frequent value.
    * ``points_in_wallet`` - cast to float64 and filled with the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    # ``np.nan`` rather than ``np.NaN``: the latter was removed in NumPy 2.0.
    freq = df["avg_frequency_login_days"].replace("Error", np.nan)
    freq_mean = freq[freq != '-999'].astype('float64').mean()
    # Columns are reassigned instead of using ``df[col].fillna(inplace=True)``;
    # that chained form is deprecated and does nothing under Copy-on-Write.
    df["avg_frequency_login_days"] = freq.astype('float64').fillna(freq_mean)

    df["medium_of_operation"] = df["medium_of_operation"].replace("?", np.nan)
    for column in ("medium_of_operation", "region_category", "preferred_offer_types"):
        # value_counts() sorts by frequency, so index[0] is the modal value.
        most_common = df[column].value_counts().index[0]
        df[column] = df[column].fillna(most_common)

    points = df["points_in_wallet"].astype('float64')
    df["points_in_wallet"] = points.fillna(points.mean())
    return df

In [312]:
# Impute the placeholder / missing values handled by dropAllNA on the training frame.
df = dropAllNA(df)

In [313]:
# Inspect the distinct gender labels present after cleaning.
df["gender"].unique()

array(['F', 'M', 'Unknown'], dtype=object)

In [314]:
# Count missing values per column after imputation.
df.isnull().sum()

customer_id                     0
Name                            0
age                             0
gender                          0
security_no                     0
region_category                 0
membership_category             0
joining_date                    0
joined_through_referral         0
referral_id                     0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
last_visit_time                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
dtype: int64

In [None]:
# Fill any remaining numeric gaps with the column means.  numeric_only=True is
# required because the frame still holds string columns, and pandas >= 2.0
# raises on DataFrame.mean() instead of silently skipping them.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Re-check per-column missing counts after the numeric mean fill.
df.isnull().sum()

In [None]:
# Confirm the frame's dimensions after the fill steps.
df.shape

In [None]:
# Same missing-value check as two cells above (duplicate inspection cell).
df.isnull().sum()

In [None]:
from sklearn import preprocessing
# A single LabelEncoder instance; later cells call its fit_transform once per column.
label_encoder = preprocessing.LabelEncoder()

In [None]:
# Show each column's dtype to see which ones are still object-typed.
df.dtypes

In [None]:
# Keep only the object-typed columns of the training frame.
df2 = df.select_dtypes(include="object")
# Cast every value to str and integer-encode each column independently.
# NOTE(review): the one shared encoder is re-fitted for every column, so after
# this line it only remembers the classes of the last column processed.
df2 = df2.astype(str).apply(label_encoder.fit_transform)
# NOTE(review): this displays the dtypes of `df`, not of the encoded `df2` - confirm intent.
df.dtypes

In [None]:
# Preview the integer-encoded categorical columns.
df2.head()

In [None]:
from sklearn.model_selection import train_test_split

# Integer-encoded categorical features, taken from df2.
categorical_features = [
    'gender',
    'region_category', 'membership_category',
    'joined_through_referral', 'preferred_offer_types',
    'medium_of_operation', 'internet_option',
    'used_special_discount',
    'offer_application_preference', 'past_complaint', 'complaint_status',
    'feedback',
]
# Numeric features, taken unchanged from df.
numeric_features = [
    'points_in_wallet', 'age', 'avg_time_spent', 'avg_frequency_login_days',
    'last_visit_time', 'joining_date', 'days_since_last_login',
    'avg_transaction_value',
]

# Build the feature matrix in one step.  Assigning new columns onto a slice of
# df2 triggers SettingWithCopyWarning; concat aligns both parts on the shared
# index and yields the same column order (categorical first, then numeric).
X = pd.concat([df2[categorical_features], df[numeric_features]], axis=1)
y = df["churn_risk_score"]

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Preview the assembled feature matrix.
X.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
# Seed the forest so the fitted model - and the score that ends up in the
# output file name - is reproducible; 42 matches the train/test split seed.
model = RandomForestClassifier(random_state=42)
# Alternative estimators that can be swapped in:
# model = AdaBoostClassifier()
# model = KNeighborsClassifier()
# model = DecisionTreeClassifier()

In [None]:
# Count missing target values in the training split before fitting.
y_train.isna().sum()

In [None]:
# Fit the chosen classifier on the training split.
model.fit(X_train,y_train)

In [None]:
# Predict churn scores for the held-out split.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score,mean_squared_error
# Macro-averaged F1 on the held-out split; `score` is reused later in the output file name.
score = f1_score(y_test, y_pred,average="macro")
score

In [None]:
# Wrap the held-out predictions in a one-column frame for inspection.
result_df = pd.DataFrame({
    "churn_risk_score":y_pred
})

In [None]:
# Which distinct score values the model actually predicted.
result_df["churn_risk_score"].unique()

In [None]:
# Load the separate test set from dataset/test.csv and preview it.
real_test_data = pd.read_csv("dataset/test.csv")
real_test_data.head()

In [None]:
# Run the test set through the same two cleaning functions used for training.
# NOTE(review): `real_test_data` is rebound here, so re-running this cell
# re-applies the conversions to already converted columns - reload the CSV first.
real_test_data = transform_data(real_test_data)
real_test_data = dropAllNA(real_test_data)
# Keep only candidate feature columns; customer_id stays available on real_test_data.
real_test_data2 = real_test_data[['age', 'gender',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]

In [None]:
# Fill any remaining numeric gaps with the column means.  numeric_only=True is
# required because the frame still holds string columns, and pandas >= 2.0
# raises on DataFrame.mean() instead of silently skipping them.
real_test_data2 = real_test_data2.fillna(real_test_data2.mean(numeric_only=True))

In [None]:
# Show each test column's dtype to see which ones need encoding.
real_test_data2.dtypes

In [None]:
# Object-typed test columns only; despite the name they are not encoded yet.
encoded_str = real_test_data2.select_dtypes(include="object")#.apply(label_encoder.fit_transform)

In [None]:
# Encode each categorical test column with classes learned from the matching
# training column (`df`).  Re-fitting on the test values alone can assign
# different integer codes to the same category whenever the two sets of
# observed categories differ.  transform() raises ValueError if the test set
# contains a category never seen in training, which surfaces the mismatch
# instead of silently mis-encoding it.
model_ip = pd.DataFrame(
    {
        col: preprocessing.LabelEncoder()
        .fit(df[col].astype(str))
        .transform(encoded_str[col].astype(str))
        for col in encoded_str.columns
    },
    index=encoded_str.index,
)

In [None]:
# Append the numeric features in the same order used to build the training
# matrix (avg_frequency_login_days comes fourth, not last) and keep them
# numeric rather than casting to str.  scikit-learn matches DataFrame columns
# to the fit-time feature names, so a different order is rejected by recent
# versions and silently misaligned by older ones.
numeric_features = [
    'points_in_wallet', 'age', 'avg_time_spent', 'avg_frequency_login_days',
    'last_visit_time', 'joining_date', 'days_since_last_login',
    'avg_transaction_value',
]
model_ip[numeric_features] = real_test_data2[numeric_features]
model_ip.head()

In [None]:
# Verify the two converted time columns contain no missing values.
model_ip[['last_visit_time', 'joining_date']].isna().sum()

In [None]:
# Final dtype check on the model input frame.
model_ip.dtypes

In [None]:
# Select the columns by name in the exact order of the training matrix `X`, so
# the prediction does not depend on the order in which model_ip was assembled.
# A missing feature fails here with a KeyError.
real_test_pred = model.predict(model_ip[X.columns])

In [None]:
# Pair each test customer_id with its predicted score, cast to int.
final_result_df = pd.DataFrame({
    "customer_id":real_test_data["customer_id"],
    "churn_risk_score":real_test_pred.astype(int)
})

In [None]:
# Preview the submission frame.
final_result_df.head()

In [None]:
# Write only customer_id and churn_risk_score.  index=False keeps the
# DataFrame's row index from being written as an extra unnamed first column.
# The held-out macro-F1 is embedded in the file name to tell runs apart.
final_result_df.to_csv(f"try-score-{score}.csv", index=False)

In [None]:
# import seaborn as sns
# sns.pairplot(model_ip)