In [303]:
import pandas as pd
import numpy as np

In [304]:
df = pd.read_csv('dataset/train.csv')

In [305]:
df.shape

(36992, 25)

In [306]:
df.columns

Index(['customer_id', 'Name', 'age', 'gender', 'security_no',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'referral_id', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

In [307]:
def _joined_through_ref_cleaner(row):
    yesOrNo = 'Yes' if row["referral_id"] != 'xxxxxxxx' else 'No'
    row['joined_through_referral'] = yesOrNo if (row['joined_through_referral'] == '?') else row['joined_through_referral']
    return row

def joined_through_ref_cleaner(df):
    df2 = df.apply(lambda row:_joined_through_ref_cleaner(row),axis=1)
    return df2
    
def churn_risk_score_negative(df):
    df.loc[(df["churn_risk_score"] < 0) | (df["churn_risk_score"] > 5)] = int(df["churn_risk_score"].mean())
    return df

def joining_date_converter(df):
    minDate = pd.to_datetime(df['joining_date']).min()
#     print(minDate)
    df["joining_date"] = (pd.to_datetime(df["joining_date"]) - minDate).astype('timedelta64[D]')
    return df

def last_visit_time_converter(df):
    df['last_visit_time'] = pd.to_timedelta(df['last_visit_time'], unit='ns').dt.total_seconds().astype('int64')
#     df["last_visit_time"] = pd.to_datetime(df["last_visit_time"]).astype('int64') // 10**9
    return df

def transform_data(df):
    df1 = joined_through_ref_cleaner(df)
    df2 = joining_date_converter(df1)
    df3 = last_visit_time_converter(df2)
    return df3

In [308]:
df = transform_data(df)
# df = churn_risk_score_negative(df)

In [311]:
def dropAllNA(df):
#     df["gender"] = df["gender"].replace("Unknown",np.NaN)
    df["avg_frequency_login_days"] = df["avg_frequency_login_days"].replace("Error",np.NaN)
#     df["avg_frequency_login_days"] = df["avg_frequency_login_days"].replace("-999",50)
    df["medium_of_operation"] = df["medium_of_operation"].replace("?",np.NaN)
    
#     df['gender'].fillna(df["gender"].value_counts().index[0],inplace=True)
    df['avg_frequency_login_days'].fillna(df[df["avg_frequency_login_days"] != '-999']['avg_frequency_login_days'].astype('float64').mean(),inplace=True)
    df['medium_of_operation'].fillna(df["medium_of_operation"].value_counts().index[0],inplace=True)
    df['region_category'].fillna(df["region_category"].value_counts().index[0],inplace=True)
    df['preferred_offer_types'].fillna(df["preferred_offer_types"].value_counts().index[0],inplace=True)
    
    df['points_in_wallet'].fillna(df["points_in_wallet"].astype('float64').mean(),inplace=True)
    df['points_in_wallet'] = df['points_in_wallet'].astype('float64')
    df['avg_frequency_login_days'] = df['avg_frequency_login_days'].astype("float64")
    return df

In [312]:
df = dropAllNA(df)

In [313]:
df["gender"].unique()

array(['F', 'M', 'Unknown'], dtype=object)

In [314]:
df.isnull().sum()

customer_id                     0
Name                            0
age                             0
gender                          0
security_no                     0
region_category                 0
membership_category             0
joining_date                    0
joined_through_referral         0
referral_id                     0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
last_visit_time                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
dtype: int64

In [315]:
df = df.fillna(df.mean())

In [316]:
df.isnull().sum()

customer_id                     0
Name                            0
age                             0
gender                          0
security_no                     0
region_category                 0
membership_category             0
joining_date                    0
joined_through_referral         0
referral_id                     0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
last_visit_time                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
dtype: int64

In [317]:
df.shape

(36992, 25)

In [318]:
df.isnull().sum()

customer_id                     0
Name                            0
age                             0
gender                          0
security_no                     0
region_category                 0
membership_category             0
joining_date                    0
joined_through_referral         0
referral_id                     0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
last_visit_time                 0
days_since_last_login           0
avg_time_spent                  0
avg_transaction_value           0
avg_frequency_login_days        0
points_in_wallet                0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
churn_risk_score                0
dtype: int64

In [319]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [320]:
df.dtypes

customer_id                      object
Name                             object
age                               int64
gender                           object
security_no                      object
region_category                  object
membership_category              object
joining_date                    float64
joined_through_referral          object
referral_id                      object
preferred_offer_types            object
medium_of_operation              object
internet_option                  object
last_visit_time                   int64
days_since_last_login             int64
avg_time_spent                  float64
avg_transaction_value           float64
avg_frequency_login_days        float64
points_in_wallet                float64
used_special_discount            object
offer_application_preference     object
past_complaint                   object
complaint_status                 object
feedback                         object
churn_risk_score                  int64


In [321]:
df2 = df.select_dtypes(include="object")
df2 = df2.astype(str).apply(label_encoder.fit_transform)
df.dtypes

customer_id                      object
Name                             object
age                               int64
gender                           object
security_no                      object
region_category                  object
membership_category              object
joining_date                    float64
joined_through_referral          object
referral_id                      object
preferred_offer_types            object
medium_of_operation              object
internet_option                  object
last_visit_time                   int64
days_since_last_login             int64
avg_time_spent                  float64
avg_transaction_value           float64
avg_frequency_login_days        float64
points_in_wallet                float64
used_special_discount            object
offer_application_preference     object
past_complaint                   object
complaint_status                 object
feedback                         object
churn_risk_score                  int64


In [322]:
df2.head()

Unnamed: 0,customer_id,Name,gender,security_no,region_category,membership_category,joined_through_referral,referral_id,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,32243,28675,0,34811,2,3,0,11358,1,1,2,1,1,0,1,4
1,7055,34951,0,5804,0,4,1,2225,1,1,1,1,0,1,2,5
2,5910,25315,0,1534,1,2,1,471,1,1,2,0,1,1,3,3
3,32400,10462,1,32501,0,2,1,5506,1,1,1,0,1,1,4,3
4,6070,24704,0,29819,0,2,0,11358,0,2,1,0,1,1,2,3


In [323]:
from sklearn.model_selection import train_test_split
X = df2[['gender',
       'region_category', 'membership_category', 
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option',
        'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]
X[['points_in_wallet', 'age', 'avg_time_spent', 'avg_frequency_login_days','last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value']] = df[['points_in_wallet', 'age', 'avg_time_spent','avg_frequency_login_days', 'last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value']]
y = df["churn_risk_score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# print(X.shape,y.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [324]:
X.head()

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,points_in_wallet,age,avg_time_spent,avg_frequency_login_days,last_visit_time,joining_date,days_since_last_login,avg_transaction_value
0,0,2,3,0,1,1,2,1,1,0,1,4,781.75,18,300.63,17.0,58082,959.0,17,53005.25
1,0,0,4,1,1,1,1,1,0,1,2,5,686.882199,32,306.34,10.0,45493,970.0,16,12838.38
2,0,1,2,1,1,1,2,0,1,1,3,3,500.69,44,516.16,22.0,82401,680.0,14,21027.0
3,1,0,2,1,1,1,1,0,1,1,4,3,567.66,37,53.27,6.0,57470,667.0,11,25239.56
4,0,0,2,0,0,2,1,0,1,1,2,3,663.06,31,113.13,16.0,56804,985.0,20,24483.66


In [325]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
model = RandomForestClassifier()
# model = AdaBoostClassifier()
# model = KNeighborsClassifier()
# model = DecisionTreeClassifier()

In [326]:
y_train.isna().sum()

0

In [327]:
model.fit(X_train,y_train)

RandomForestClassifier()

In [328]:
y_pred = model.predict(X_test)

In [329]:
from sklearn.metrics import f1_score,mean_squared_error
score = f1_score(y_test, y_pred,average="macro")
score

0.6182960218743067

In [330]:
result_df = pd.DataFrame({
    "churn_risk_score":y_pred
})

In [331]:
result_df["churn_risk_score"].unique()

array([1, 4, 5, 3, 2], dtype=int64)

In [332]:
real_test_data = pd.read_csv("dataset/test.csv")
real_test_data.head()

Unnamed: 0,customer_id,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,...,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,fffe43004900440031003700300030003400,Alethia Meints,50,F,OQJ1XAY,Village,Premium Membership,2015-11-02,No,xxxxxxxx,...,12,386.26,40721.44,7.0,733.83,Yes,No,No,Not Applicable,Poor Product Quality
1,fffe43004900440031003900370037003300,Ming Lopez,41,M,OUQRPKO,Village,Gold Membership,2016-03-01,No,xxxxxxxx,...,11,37.8,9644.4,9.0,726.0,Yes,No,No,Not Applicable,Poor Website
2,fffe43004900440034003800360037003000,Carina Flannigan,31,F,02J2RE7,Town,Silver Membership,2017-03-03,No,xxxxxxxx,...,18,215.36,3693.25,21.0,713.78,Yes,No,Yes,Solved in Follow-up,No reason specified
3,fffe43004900440036003200370033003400,Kyung Wanner,64,M,5YEQIF1,Town,Silver Membership,2017-08-18,Yes,CID8941,...,-999,44.57,36809.56,11.0,744.97,Yes,No,Yes,No Information Available,Too many ads
4,fffe43004900440035003000370031003900,Enola Gatto,16,F,100RYB5,Town,No Membership,2015-05-05,Yes,CID5690,...,6,349.88,40675.86,8.0,299.048351,No,Yes,Yes,Solved in Follow-up,Poor Website


In [333]:
real_test_data = transform_data(real_test_data)
real_test_data = dropAllNA(real_test_data)
real_test_data2 = real_test_data[['age', 'gender',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]

In [334]:
real_test_data2 = real_test_data2.fillna(real_test_data2.mean())

In [335]:
real_test_data2.dtypes

age                               int64
gender                           object
region_category                  object
membership_category              object
joining_date                    float64
joined_through_referral          object
preferred_offer_types            object
medium_of_operation              object
internet_option                  object
last_visit_time                   int64
days_since_last_login             int64
avg_time_spent                  float64
avg_transaction_value           float64
avg_frequency_login_days        float64
points_in_wallet                float64
used_special_discount            object
offer_application_preference     object
past_complaint                   object
complaint_status                 object
feedback                         object
dtype: object

In [336]:
encoded_str = real_test_data2.select_dtypes(include="object")#.apply(label_encoder.fit_transform)

In [337]:
model_ip = encoded_str.astype(str).apply(label_encoder.fit_transform)

In [338]:
model_ip[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value','avg_frequency_login_days']] = real_test_data2[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value','avg_frequency_login_days']].astype(str)
model_ip[['last_visit_time', 'joining_date']] = real_test_data2[['last_visit_time', 'joining_date']]
model_ip.head()

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,points_in_wallet,age,avg_time_spent,last_visit_time,joining_date,days_since_last_login,avg_transaction_value,avg_frequency_login_days
0,0,2,4,0,2,2,2,1,0,0,1,2,733.83,50,386.26,26370,305.0,12,40721.44,7.0
1,1,2,1,0,2,1,0,1,0,0,1,3,726.0,41,37.8,80476,425.0,11,9644.4,9.0
2,0,1,5,0,1,0,1,1,0,1,3,0,713.78,31,215.36,60039,792.0,18,3693.25,21.0
3,1,1,5,1,0,1,0,1,0,1,0,7,744.97,64,44.57,53777,960.0,-999,36809.56,11.0
4,0,1,2,1,2,2,1,0,1,1,3,3,299.04835123722256,16,349.88,10673,124.0,6,40675.86,8.0


In [339]:
model_ip[['last_visit_time', 'joining_date']].isna().sum()

last_visit_time    0
joining_date       0
dtype: int64

In [340]:
model_ip.dtypes

gender                            int32
region_category                   int32
membership_category               int32
joined_through_referral           int32
preferred_offer_types             int32
medium_of_operation               int32
internet_option                   int32
used_special_discount             int32
offer_application_preference      int32
past_complaint                    int32
complaint_status                  int32
feedback                          int32
points_in_wallet                 object
age                              object
avg_time_spent                   object
last_visit_time                   int64
joining_date                    float64
days_since_last_login            object
avg_transaction_value            object
avg_frequency_login_days         object
dtype: object

In [341]:
# str_needed = real_test_data.select_dtypes(exclude=["float64","int64"])
real_test_pred = model.predict(model_ip)

In [342]:
final_result_df = pd.DataFrame({
    "customer_id":real_test_data["customer_id"],
    "churn_risk_score":real_test_pred.astype(int)
})

In [343]:
final_result_df.head()

Unnamed: 0,customer_id,churn_risk_score
0,fffe43004900440031003700300030003400,3
1,fffe43004900440031003900370037003300,3
2,fffe43004900440034003800360037003000,4
3,fffe43004900440036003200370033003400,3
4,fffe43004900440035003000370031003900,5


In [344]:
final_result_df.to_csv(f"try-score-{score}.csv")

In [345]:
# import seaborn as sns
# sns.pairplot(model_ip)