In [608]:
import pandas as pd
import numpy as np

In [609]:
df = pd.read_csv('dataset/train.csv')

In [610]:
df.shape

(36992, 25)

In [611]:
df.columns

Index(['customer_id', 'Name', 'age', 'gender', 'security_no',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'referral_id', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

In [612]:
def _joined_through_ref_cleaner(row):
    yesOrNo = 'Yes' if row["referral_id"] != 'xxxxxxxx' else 'No'
    row['joined_through_referral'] = yesOrNo if (row['joined_through_referral'] == '?') else row['joined_through_referral']
    return row

def joined_through_ref_cleaner(df):
    df2 = df.apply(lambda row:_joined_through_ref_cleaner(row),axis=1)
    return df2

In [613]:
def dropAllNA(df):
    df["gender"] = df["gender"].replace("Unknown",np.NaN)
#     df["avg_frequency_login_days"] = df["avg_frequency_login_days"].replace("Error",np.NaN)
#     df["avg_frequency_login_days"] = df["avg_frequency_login_days"].replace("-999",50)
#     df["medium_of_operation"] = df["medium_of_operation"].replace("?",np.NaN)
    
    df['gender'].fillna(df["gender"].value_counts().index[0],inplace=True)
#     df['avg_frequency_login_days'].fillna(df[df["avg_frequency_login_days"] != '-999']['avg_frequency_login_days'].astype('float64').mean(),inplace=True)
#     df['medium_of_operation'].fillna(df["medium_of_operation"].value_counts().index[0],inplace=True)
    df['region_category'].fillna(df["region_category"].value_counts().index[0],inplace=True)
    df['preferred_offer_types'].fillna(df["preferred_offer_types"].value_counts().index[0],inplace=True)
    
    df['points_in_wallet'].fillna(df["points_in_wallet"].astype('float64').mean(),inplace=True)
    df['points_in_wallet'] = df['points_in_wallet'].astype('float64')
#     df['avg_frequency_login_days'] = df['avg_frequency_login_days'].astype("float64")
    return df

In [614]:
df = dropAllNA(df)

### Negative Churn Rate Fixing

In [615]:
# Cleaning the Null and Negative values
# df = df[df["churn_risk_score"]>=0]

# changinng negative values to zero
# df["churn_risk_score"] = np.where((df["churn_risk_score"] < 0),0,df["churn_risk_score"])
df[(df["churn_risk_score"]<0)]["churn_risk_score"] = int(0)
df[df["churn_risk_score"] > 5]["churn_risk_score"] = int(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[(df["churn_risk_score"]<0)]["churn_risk_score"] = int(0)


### joining_date_col convert to days to start

In [616]:
# Joining Date Columns Converted to Int
joining_date_col = pd.to_datetime(df["joining_date"])

In [617]:
df["joining_date"] = pd.to_datetime(df["joining_date"])  - joining_date_col.min()

In [618]:
# df["joining_date"].value_counts()
df["joining_date"] = df["joining_date"] / np.timedelta64(1, 'D')
df["joining_date"][0:10]

0    959.0
1    970.0
2    680.0
3    667.0
4    985.0
5    372.0
6     77.0
7    558.0
8    713.0
9    699.0
Name: joining_date, dtype: float64

In [619]:
# last_visit_time col here
pd.to_datetime(df["last_visit_time"],format="%H:%M:%S")[0:20]

0    1900-01-01 16:08:02
1    1900-01-01 12:38:13
2    1900-01-01 22:53:21
3    1900-01-01 15:57:50
4    1900-01-01 15:46:44
5    1900-01-01 06:46:07
6    1900-01-01 11:40:04
7    1900-01-01 07:52:43
8    1900-01-01 06:50:10
9    1900-01-01 19:10:16
10   1900-01-01 05:57:20
11   1900-01-01 11:56:11
12   1900-01-01 04:42:50
13   1900-01-01 08:35:05
14   1900-01-01 21:01:43
15   1900-01-01 13:48:22
16   1900-01-01 20:58:28
17   1900-01-01 03:42:26
18   1900-01-01 15:39:05
19   1900-01-01 11:46:54
Name: last_visit_time, dtype: datetime64[ns]

### last_visit_ns -> Last Visit Column to Float

In [620]:
# pd.TimeStamp(pd.to_datetime(df["last_visit_time"],format="%H:%M:%S"))
last_visit_ns = pd.to_datetime(df["last_visit_time"],format="%H:%M:%S")

In [621]:
ts = (last_visit_ns - np.datetime64('1900-01-01T00:00:00Z')) / np.timedelta64(1, 's')

  ts = (last_visit_ns - np.datetime64('1900-01-01T00:00:00Z')) / np.timedelta64(1, 's')


In [622]:
ts

0        58082.0
1        45493.0
2        82401.0
3        57470.0
4        56804.0
          ...   
36987    15245.0
36988    83911.0
36989    13825.0
36990    35403.0
36991     5992.0
Name: last_visit_time, Length: 36992, dtype: float64

In [623]:
df["last_visit_time"] = ts

In [624]:
df = df.dropna()

In [625]:
df.shape

(36992, 25)

In [626]:
df.isna().count()

customer_id                     36992
Name                            36992
age                             36992
gender                          36992
security_no                     36992
region_category                 36992
membership_category             36992
joining_date                    36992
joined_through_referral         36992
referral_id                     36992
preferred_offer_types           36992
medium_of_operation             36992
internet_option                 36992
last_visit_time                 36992
days_since_last_login           36992
avg_time_spent                  36992
avg_transaction_value           36992
avg_frequency_login_days        36992
points_in_wallet                36992
used_special_discount           36992
offer_application_preference    36992
past_complaint                  36992
complaint_status                36992
feedback                        36992
churn_risk_score                36992
dtype: int64

In [627]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [628]:
df.dtypes

customer_id                      object
Name                             object
age                               int64
gender                           object
security_no                      object
region_category                  object
membership_category              object
joining_date                    float64
joined_through_referral          object
referral_id                      object
preferred_offer_types            object
medium_of_operation              object
internet_option                  object
last_visit_time                 float64
days_since_last_login             int64
avg_time_spent                  float64
avg_transaction_value           float64
avg_frequency_login_days         object
points_in_wallet                float64
used_special_discount            object
offer_application_preference     object
past_complaint                   object
complaint_status                 object
feedback                         object
churn_risk_score                  int64


In [629]:
df2 = df.select_dtypes(include="object").astype(str).apply(label_encoder.fit_transform)

In [630]:
df2.head()

Unnamed: 0,customer_id,Name,gender,security_no,region_category,membership_category,joined_through_referral,referral_id,preferred_offer_types,medium_of_operation,internet_option,avg_frequency_login_days,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,32243,28675,0,34811,2,3,1,11358,1,0,2,746,1,1,0,1,4
1,7055,34951,0,5804,0,4,0,2225,1,2,1,732,1,0,1,2,5
2,5910,25315,0,1534,1,2,2,471,1,2,2,779,0,1,1,3,3
3,32400,10462,1,32501,0,2,2,5506,1,2,1,1637,0,1,1,4,3
4,6070,24704,0,29819,0,2,1,11358,0,3,1,743,0,1,1,2,3


In [631]:
from sklearn.model_selection import train_test_split
X = df2[['gender',
       'region_category', 'membership_category', 
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option',
       'avg_frequency_login_days', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]
X[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value']] = df[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value']]
y = df["churn_risk_score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=465)
# print(X.shape,y.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [632]:
X.head()

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,avg_frequency_login_days,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,points_in_wallet,age,avg_time_spent,last_visit_time,joining_date,days_since_last_login,avg_transaction_value
0,0,2,3,1,1,0,2,746,1,1,0,1,4,781.75,18,300.63,58082.0,959.0,17,53005.25
1,0,0,4,0,1,2,1,732,1,0,1,2,5,686.882199,32,306.34,45493.0,970.0,16,12838.38
2,0,1,2,2,1,2,2,779,0,1,1,3,3,500.69,44,516.16,82401.0,680.0,14,21027.0
3,1,0,2,2,1,2,1,1637,0,1,1,4,3,567.66,37,53.27,57470.0,667.0,11,25239.56
4,0,0,2,1,0,3,1,743,0,1,1,2,3,663.06,31,113.13,56804.0,985.0,20,24483.66


In [633]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [None]:
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score,mean_squared_error
score = f1_score(y_test, y_pred,average="macro")
score

In [None]:
result_df = pd.DataFrame({
    "churn_risk_score":y_pred.astype(int)
})

In [None]:
result_df

In [None]:
real_test_data = pd.read_csv("dataset/test.csv")
real_test_data.head()

In [None]:
real_test_data = dropAllNA(real_test_data)

In [None]:
real_test_data2 = real_test_data[['age', 'gender',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]


In [None]:
real_test_data2 = real_test_data2.fillna(real_test_data2.mean())

In [None]:
# joining_date column conversions

In [None]:
# Joining Date Columns Converted to Int
joining_date_col = pd.to_datetime(real_test_data2["joining_date"])

In [None]:
real_test_data2["joining_date"] = pd.to_datetime(real_test_data2["joining_date"])  - joining_date_col.min()

In [None]:
# df["joining_date"].value_counts()
real_test_data2["joining_date"] = real_test_data2["joining_date"] / np.timedelta64(1, 'D')
real_test_data2["joining_date"][0:10]

In [None]:
real_test_data2.dtypes

### Last Visit Time Fix for Test

In [None]:
# pd.TimeStamp(pd.to_datetime(df["last_visit_time"],format="%H:%M:%S"))
last_visit_ns = pd.to_datetime(real_test_data2["last_visit_time"],format="%H:%M:%S")

In [None]:
ts = (last_visit_ns - np.datetime64('1900-01-01T00:00:00Z')) / np.timedelta64(1, 's')

In [None]:
ts

In [None]:
real_test_data2["last_visit_time"] = ts

In [None]:
real_test_data2.dtypes

In [None]:
encoded_str = real_test_data2.select_dtypes(include="object")#.apply(label_encoder.fit_transform)

In [None]:
model_ip = encoded_str.astype(str).apply(label_encoder.fit_transform)

In [None]:
model_ip[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value']] = real_test_data2[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value']]
model_ip[['last_visit_time', 'joining_date']] = real_test_data2[['last_visit_time', 'joining_date']]
model_ip.head()

In [None]:
model_ip[['last_visit_time', 'joining_date']].isna().sum()

In [None]:
model_ip.dtypes

In [None]:
# str_needed = real_test_data.select_dtypes(exclude=["float64","int64"])
real_test_pred = model.predict(model_ip)

In [None]:
final_result_df = pd.DataFrame({
    "customer_id":real_test_data["customer_id"],
    "churn_risk_score":real_test_pred.astype(int)
})

In [None]:
final_result_df.head()

In [None]:
final_result_df.to_csv(f"Model-{score}.csv")

In [None]:
print(score)