In [174]:
import pandas as pd
import numpy as np

In [175]:
df = pd.read_csv('dataset/train.csv')

In [176]:
df.shape

(36992, 25)

In [177]:
df.columns

Index(['customer_id', 'Name', 'age', 'gender', 'security_no',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'referral_id', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

In [178]:
def _joined_through_ref_cleaner(row):
    yesOrNo = 'Yes' if row["referral_id"] != 'xxxxxxxx' else 'No'
    row['joined_through_referral'] = yesOrNo if (row['joined_through_referral'] == '?') else row['joined_through_referral']
    return row

def joined_through_ref_cleaner(df):
    df2 = df.apply(lambda row:_joined_through_ref_cleaner(row),axis=1)
    return df2
    
def churn_risk_score_negative(df):
    df.loc[(df["churn_risk_score"] < 0) | (df["churn_risk_score"] > 5)] = int(df["churn_risk_score"].mean())
    return df

def joining_date_converter(df):
    minDate = pd.to_datetime(df['joining_date']).min()
#     print(minDate)
    df["joining_date"] = (pd.to_datetime(df["joining_date"]) - minDate).astype('timedelta64[D]')
    return df

def last_visit_time_converter(df):
    df['last_visit_time'] = pd.to_timedelta(df['last_visit_time'], unit='ns').dt.total_seconds().astype('int64')
#     df["last_visit_time"] = pd.to_datetime(df["last_visit_time"]).astype('int64') // 10**9
    return df

def transform_data(df):
    df1 = joined_through_ref_cleaner(df)
    df2 = joining_date_converter(df1)
    df3 = last_visit_time_converter(df2)
    return df3

In [179]:
df = transform_data(df)
# df = churn_risk_score_negative(df)

In [180]:
def dropAllNA(df):
    df["gender"] = df["gender"].replace("Unknown",np.NaN)
    df["avg_frequency_login_days"] = df["avg_frequency_login_days"].replace("Error",np.NaN)
    df["medium_of_operation"] = df["medium_of_operation"].replace("?",np.NaN)
    
    df['gender'].fillna(df["gender"].value_counts().index[0],inplace=True)
    df['avg_frequency_login_days'].fillna(df["avg_frequency_login_days"].value_counts().index[0],inplace=True)
    df['medium_of_operation'].fillna(df["medium_of_operation"].value_counts().index[0],inplace=True)
    df['region_category'].fillna(df["region_category"].value_counts().index[0],inplace=True)
    df['preferred_offer_types'].fillna(df["preferred_offer_types"].value_counts().index[0],inplace=True)
    
    df['points_in_wallet'].fillna(df["points_in_wallet"].astype('float64').mean(),inplace=True)
    df['points_in_wallet'] = df['points_in_wallet'].astype('float64')
    return df

In [181]:
df = dropAllNA(df)

In [182]:
df["gender"].unique()

array(['F', 'M'], dtype=object)

In [183]:
df.isnull().sum()

customer_id                     0
Name                            0
age                             0
gender                          0
security_no                     0
region_category                 0
membership_category             0
joining_date                    0
joined_through_referral         0
referral_id                     0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
last_visit_time                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
dtype: int64

In [184]:
df = df.fillna(df.mean())

In [185]:
df.isnull().sum()

customer_id                     0
Name                            0
age                             0
gender                          0
security_no                     0
region_category                 0
membership_category             0
joining_date                    0
joined_through_referral         0
referral_id                     0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
last_visit_time                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
dtype: int64

In [186]:
df.shape

(36992, 25)

In [187]:
df.isnull().sum()

customer_id                     0
Name                            0
age                             0
gender                          0
security_no                     0
region_category                 0
membership_category             0
joining_date                    0
joined_through_referral         0
referral_id                     0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
last_visit_time                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
dtype: int64

In [188]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [189]:
df.dtypes

customer_id                      object
Name                             object
age                               int64
gender                           object
security_no                      object
region_category                  object
membership_category              object
joining_date                    float64
joined_through_referral          object
referral_id                      object
preferred_offer_types            object
medium_of_operation              object
internet_option                  object
last_visit_time                   int64
days_since_last_login             int64
avg_time_spent                  float64
avg_transaction_value           float64
avg_frequency_login_days         object
points_in_wallet                float64
used_special_discount            object
offer_application_preference     object
past_complaint                   object
complaint_status                 object
feedback                         object
churn_risk_score                  int64


In [190]:
df2 = df.select_dtypes(include="object")
df2 = df2.astype(str).apply(label_encoder.fit_transform)
df2.dtypes

customer_id                     int32
Name                            int32
gender                          int32
security_no                     int32
region_category                 int32
membership_category             int32
joined_through_referral         int32
referral_id                     int32
preferred_offer_types           int32
medium_of_operation             int32
internet_option                 int32
avg_frequency_login_days        int32
used_special_discount           int32
offer_application_preference    int32
past_complaint                  int32
complaint_status                int32
feedback                        int32
dtype: object

In [191]:
df2.head()

Unnamed: 0,customer_id,Name,gender,security_no,region_category,membership_category,joined_through_referral,referral_id,preferred_offer_types,medium_of_operation,internet_option,avg_frequency_login_days,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,32243,28675,0,34811,2,3,0,11358,1,1,2,746,1,1,0,1,4
1,7055,34951,0,5804,0,4,1,2225,1,1,1,732,1,0,1,2,5
2,5910,25315,0,1534,1,2,1,471,1,1,2,779,0,1,1,3,3
3,32400,10462,1,32501,0,2,1,5506,1,1,1,1637,0,1,1,4,3
4,6070,24704,0,29819,0,2,0,11358,0,2,1,743,0,1,1,2,3


In [193]:
from sklearn.model_selection import train_test_split
X = df2[['gender',
       'region_category', 'membership_category', 
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option',
       'avg_frequency_login_days', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]
X[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login']] = df[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login']]
y = df["churn_risk_score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# print(X.shape,y.shape)

KeyError: "['avg_transaction_value'] not in index"

In [None]:
X.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

In [None]:
y_train.isna().sum()

In [None]:
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score,mean_squared_error
score = f1_score(y_test, y_pred,average="macro")
score

In [None]:
result_df = pd.DataFrame({
    "churn_risk_score":y_pred
})

In [None]:
result_df["churn_risk_score"].unique()

In [None]:
real_test_data = pd.read_csv("dataset/test.csv")
real_test_data.head()

In [None]:
real_test_data = transform_data(real_test_data)
real_test_data = dropAllNA(real_test_data)
real_test_data2 = real_test_data[['age', 'gender',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]

In [None]:
real_test_data2 = real_test_data2.fillna(real_test_data2.mean())

In [None]:
real_test_data2.dtypes

In [None]:
encoded_str = real_test_data2.select_dtypes(include="object")#.apply(label_encoder.fit_transform)

In [None]:
model_ip = encoded_str.astype(str).apply(label_encoder.fit_transform)

In [None]:
model_ip[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login']] = real_test_data2[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login']]
model_ip[['last_visit_time', 'joining_date']] = real_test_data2[['last_visit_time', 'joining_date']]
model_ip.head()

In [None]:
model_ip[['last_visit_time', 'joining_date']].isna().sum()

In [None]:
model_ip.dtypes

In [None]:
# str_needed = real_test_data.select_dtypes(exclude=["float64","int64"])
real_test_pred = model.predict(model_ip)

In [None]:
final_result_df = pd.DataFrame({
    "customer_id":real_test_data["customer_id"],
    "churn_risk_score":real_test_pred.astype(int)
})

In [None]:
final_result_df.head()

In [None]:
final_result_df.to_csv(f"try-score-{score}.csv")