In [23]:
import pandas as pd
import numpy as np

In [24]:
df = pd.read_csv('dataset/train.csv')

In [25]:
df.shape

(36992, 25)

In [26]:
df.columns

Index(['customer_id', 'Name', 'age', 'gender', 'security_no',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'referral_id', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback', 'churn_risk_score'],
      dtype='object')

In [27]:
# Cleaning the Null and Negative values
# df = df[df["churn_risk_score"]>=0]

# changinng negative values to zero
df["churn_risk_score"] = np.where((df["churn_risk_score"] < 0),0,df["churn_risk_score"])

In [28]:
df = df.dropna()

In [29]:
df.shape

(28373, 25)

In [30]:
df.isna().count()

customer_id                     28373
Name                            28373
age                             28373
gender                          28373
security_no                     28373
region_category                 28373
membership_category             28373
joining_date                    28373
joined_through_referral         28373
referral_id                     28373
preferred_offer_types           28373
medium_of_operation             28373
internet_option                 28373
last_visit_time                 28373
days_since_last_login           28373
avg_time_spent                  28373
avg_transaction_value           28373
avg_frequency_login_days        28373
points_in_wallet                28373
used_special_discount           28373
offer_application_preference    28373
past_complaint                  28373
complaint_status                28373
feedback                        28373
churn_risk_score                28373
dtype: int64

In [31]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

In [32]:
df = df.apply(label_encoder.fit_transform)

In [33]:
df.head()

Unnamed: 0,customer_id,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,...,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback,churn_risk_score
0,24720,22051,8,0,26668,2,3,959,1,9603,...,12959,26211,570,16395,1,1,0,1,4,2
2,4529,19480,34,0,1144,1,2,680,2,405,...,16721,10720,598,2370,0,1,1,3,3,5
3,24843,8084,27,1,24890,0,2,667,2,4654,...,2974,12927,1251,4376,0,1,1,4,3,5
4,4653,19011,21,0,22828,0,2,985,1,9603,...,6218,12489,567,8914,0,1,1,2,3,5
5,26213,15752,3,1,20385,0,1,372,1,9603,...,15623,6937,601,12704,1,0,1,4,0,3


In [34]:
from sklearn.model_selection import train_test_split
X = df[['age', 'gender',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]
y = df["churn_risk_score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [35]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [36]:
model.fit(X_train,y_train)

LinearRegression()

In [37]:
y_pred = model.predict(X_test)

In [38]:
from sklearn.metrics import f1_score,mean_squared_error
score = mean_squared_error(y_test, y_pred)
score

1.1406149430980415

In [39]:
result_df = pd.DataFrame({
    "churn_risk_score":y_pred.astype(int)
})

In [40]:
result_df

Unnamed: 0,churn_risk_score
0,3
1,3
2,3
3,3
4,5
...,...
9359,3
9360,2
9361,2
9362,3


In [41]:
real_test_data = pd.read_csv("dataset/test.csv")
real_test_data.head()

Unnamed: 0,customer_id,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,...,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,fffe43004900440031003700300030003400,Alethia Meints,50,F,OQJ1XAY,Village,Premium Membership,2015-11-02,No,xxxxxxxx,...,12,386.26,40721.44,7.0,733.83,Yes,No,No,Not Applicable,Poor Product Quality
1,fffe43004900440031003900370037003300,Ming Lopez,41,M,OUQRPKO,Village,Gold Membership,2016-03-01,No,xxxxxxxx,...,11,37.8,9644.4,9.0,726.0,Yes,No,No,Not Applicable,Poor Website
2,fffe43004900440034003800360037003000,Carina Flannigan,31,F,02J2RE7,Town,Silver Membership,2017-03-03,No,xxxxxxxx,...,18,215.36,3693.25,21.0,713.78,Yes,No,Yes,Solved in Follow-up,No reason specified
3,fffe43004900440036003200370033003400,Kyung Wanner,64,M,5YEQIF1,Town,Silver Membership,2017-08-18,Yes,CID8941,...,-999,44.57,36809.56,11.0,744.97,Yes,No,Yes,No Information Available,Too many ads
4,fffe43004900440035003000370031003900,Enola Gatto,16,F,100RYB5,Town,No Membership,2015-05-05,Yes,CID5690,...,6,349.88,40675.86,8.0,299.048351,No,Yes,Yes,Solved in Follow-up,Poor Website


In [42]:
real_test_data2 = real_test_data[['age', 'gender',
       'region_category', 'membership_category', 'joining_date',
       'joined_through_referral', 'preferred_offer_types',
       'medium_of_operation', 'internet_option', 'last_visit_time',
       'days_since_last_login', 'avg_time_spent', 'avg_transaction_value',
       'avg_frequency_login_days', 'points_in_wallet', 'used_special_discount',
       'offer_application_preference', 'past_complaint', 'complaint_status',
       'feedback']]

In [43]:
real_test_data2.dtypes

age                               int64
gender                           object
region_category                  object
membership_category              object
joining_date                     object
joined_through_referral          object
preferred_offer_types            object
medium_of_operation              object
internet_option                  object
last_visit_time                  object
days_since_last_login             int64
avg_time_spent                  float64
avg_transaction_value           float64
avg_frequency_login_days         object
points_in_wallet                float64
used_special_discount            object
offer_application_preference     object
past_complaint                   object
complaint_status                 object
feedback                         object
dtype: object

In [44]:
# str_needed = real_test_data.select_dtypes(exclude=["float64","int64"])
encoded = real_test_data2.astype(str).apply(label_encoder.fit_transform)
real_test_pred = model.predict(encoded)
encoded.head()

Unnamed: 0,age,gender,region_category,membership_category,joining_date,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,last_visit_time,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,40,0,2,4,305,1,2,3,2,5375,4,9699,12556,936,10445,1,0,0,1,2
1,31,1,2,1,425,1,2,2,0,16603,3,9400,19623,943,10110,1,0,0,1,3
2,21,0,1,5,792,1,1,1,1,12359,10,5667,10979,458,9476,1,0,1,3,0
3,54,1,1,5,960,2,0,0,0,11087,0,10635,10935,439,10903,1,0,1,0,7
4,6,0,1,2,124,2,2,3,1,2190,23,8955,12529,942,1763,0,1,1,3,3


In [45]:
final_result_df = pd.DataFrame({
    "customer_id":real_test_data["customer_id"],
    "churn_risk_score":real_test_pred.astype(int)
})

In [46]:
final_result_df.head()

Unnamed: 0,customer_id,churn_risk_score
0,fffe43004900440031003700300030003400,3
1,fffe43004900440031003900370037003300,3
2,fffe43004900440034003800360037003000,3
3,fffe43004900440036003200370033003400,2
4,fffe43004900440035003000370031003900,4


In [47]:
final_result_df.to_csv("try-1.csv")