In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('dataset/train.csv')

In [3]:
df.shape

(36992, 25)

In [4]:
df.columns

Index(['customer_id', 'Name', 'age', 'gender', 'security_no',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'referral_id', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

### Negative Churn Rate Fixing

In [5]:
# Cleaning the Null and Negative values
# df = df[df["churn_risk_score"]>=0]

# changinng negative values to zero
# df["churn_risk_score"] = np.where((df["churn_risk_score"] < 0),0,df["churn_risk_score"])
df[df["churn_risk_score"]<0]["churn_risk_score"] = int(df["churn_risk_score"].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df["churn_risk_score"]<0]["churn_risk_score"] = int(df["churn_risk_score"].mean())


### joining_date_col convert to days to start

In [6]:
# Joining Date Columns Converted to Int
joining_date_col = pd.to_datetime(df["joining_date"])

In [7]:
df["joining_date"] = pd.to_datetime(df["joining_date"])  - joining_date_col.min()

In [8]:
df["joining_date"].value_counts()

152 days    55
184 days    51
171 days    50
580 days    49
176 days    49
            ..
440 days    19
914 days    18
519 days    18
254 days    18
62 days     16
Name: joining_date, Length: 1096, dtype: int64

In [9]:
# last_visit_time col here
pd.to_datetime(df["last_visit_time"],format="%H:%M:%S")[0:20]

0    1900-01-01 16:08:02
1    1900-01-01 12:38:13
2    1900-01-01 22:53:21
3    1900-01-01 15:57:50
4    1900-01-01 15:46:44
5    1900-01-01 06:46:07
6    1900-01-01 11:40:04
7    1900-01-01 07:52:43
8    1900-01-01 06:50:10
9    1900-01-01 19:10:16
10   1900-01-01 05:57:20
11   1900-01-01 11:56:11
12   1900-01-01 04:42:50
13   1900-01-01 08:35:05
14   1900-01-01 21:01:43
15   1900-01-01 13:48:22
16   1900-01-01 20:58:28
17   1900-01-01 03:42:26
18   1900-01-01 15:39:05
19   1900-01-01 11:46:54
Name: last_visit_time, dtype: datetime64[ns]

### last_visit_ns -> Last Visit Column to Float

In [10]:
# pd.TimeStamp(pd.to_datetime(df["last_visit_time"],format="%H:%M:%S"))
last_visit_ns = pd.to_datetime(df["last_visit_time"],format="%H:%M:%S")

In [11]:
ts = (last_visit_ns - np.datetime64('1900-01-01T00:00:00Z')) / np.timedelta64(1, 's')

  ts = (last_visit_ns - np.datetime64('1900-01-01T00:00:00Z')) / np.timedelta64(1, 's')


In [12]:
ts

0        58082.0
1        45493.0
2        82401.0
3        57470.0
4        56804.0
          ...   
36987    15245.0
36988    83911.0
36989    13825.0
36990    35403.0
36991     5992.0
Name: last_visit_time, Length: 36992, dtype: float64

In [13]:
df["last_visit_time"] = ts

In [14]:
df = df.dropna()

In [15]:
df.shape

(28373, 25)

In [16]:
df.isna().count()

customer_id                     28373
Name                            28373
age                             28373
gender                          28373
security_no                     28373
region_category                 28373
membership_category             28373
joining_date                    28373
joined_through_referral         28373
referral_id                     28373
preferred_offer_types           28373
medium_of_operation             28373
internet_option                 28373
last_visit_time                 28373
days_since_last_login           28373
avg_time_spent                  28373
avg_transaction_value           28373
avg_frequency_login_days        28373
points_in_wallet                28373
used_special_discount           28373
offer_application_preference    28373
past_complaint                  28373
complaint_status                28373
feedback                        28373
churn_risk_score                28373
dtype: int64

In [17]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [18]:
df.dtypes

customer_id                              object
Name                                     object
age                                       int64
gender                                   object
security_no                              object
region_category                          object
membership_category                      object
joining_date                    timedelta64[ns]
joined_through_referral                  object
referral_id                              object
preferred_offer_types                    object
medium_of_operation                      object
internet_option                          object
last_visit_time                         float64
days_since_last_login                     int64
avg_time_spent                          float64
avg_transaction_value                   float64
avg_frequency_login_days                 object
points_in_wallet                        float64
used_special_discount                    object
offer_application_preference            

In [19]:
df2 = df.select_dtypes(include="object").apply(label_encoder.fit_transform)

In [20]:
df2.head()

Unnamed: 0,customer_id,Name,gender,security_no,region_category,membership_category,joined_through_referral,referral_id,preferred_offer_types,medium_of_operation,internet_option,avg_frequency_login_days,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,24720,22051,0,26668,2,3,1,9603,1,0,2,570,1,1,0,1,4
2,4529,19480,0,1144,1,2,2,405,1,2,2,598,0,1,1,3,3
3,24843,8084,1,24890,0,2,2,4654,1,2,1,1251,0,1,1,4,3
4,4653,19011,0,22828,0,2,1,9603,0,3,1,567,0,1,1,2,3
5,26213,15752,1,20385,0,1,1,9603,1,0,2,601,1,0,1,4,0


In [33]:
from sklearn.model_selection import train_test_split
X = df2[['gender',
       'region_category', 'membership_category', 
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option',
       'avg_frequency_login_days', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]
X[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value']] = df[['points_in_wallet', 'age', 'avg_time_spent', 'last_visit_time', 'joining_date', 'days_since_last_login', 'avg_transaction_value']]
y = df["churn_risk_score"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X.shape,y.shape)

(28373, 20) (28373,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [36]:
X.head()

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,avg_frequency_login_days,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,points_in_wallet,age,avg_time_spent,last_visit_time,joining_date,days_since_last_login,avg_transaction_value
0,0,2,3,1,1,0,2,570,1,1,0,1,4,781.75,18,300.63,58082.0,959 days,17,53005.25
2,0,1,2,2,1,2,2,598,0,1,1,3,3,500.69,44,516.16,82401.0,680 days,14,21027.0
3,1,0,2,2,1,2,1,1251,0,1,1,4,3,567.66,37,53.27,57470.0,667 days,11,25239.56
4,0,0,2,1,0,3,1,567,0,1,1,2,3,663.06,31,113.13,56804.0,985 days,20,24483.66
5,1,0,1,1,1,0,2,601,1,0,1,4,0,722.27,13,433.62,24367.0,372 days,23,13884.77


In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [35]:
model.fit(X_train,y_train)

ValueError: could not convert string to float: 'F'

In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score,mean_squared_error
score = mean_squared_error(y_test, y_pred)
score

In [None]:
result_df = pd.DataFrame({
    "churn_risk_score":y_pred.astype(int)
})

In [None]:
result_df

In [None]:
real_test_data = pd.read_csv("dataset/test.csv")
real_test_data.head()

In [None]:
real_test_data2 = real_test_data[['age', 'gender',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]

In [None]:
real_test_data2.dtypes

In [None]:
# str_needed = real_test_data.select_dtypes(exclude=["float64","int64"])
encoded = real_test_data2.astype(str).apply(label_encoder.fit_transform)
real_test_pred = model.predict(encoded)
encoded.head()

In [None]:
final_result_df = pd.DataFrame({
    "customer_id":real_test_data["customer_id"],
    "churn_risk_score":real_test_pred.astype(int)
})

In [None]:
final_result_df.head()

In [None]:
final_result_df.to_csv("try-1.csv")