## Training

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score
import random as rd 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib
import os

In [6]:
df=pd.read_csv(r'C:\Users\asus\Documents\Projects\App_Subscription_Analyser\Dataset\PreProcessedData.csv',index_col=False)
df.head(4)


Unnamed: 0,enrolled,VerifyPhone,Other,VerifyDateOfBirth,location,is_weekend_enrolleddate,VerifyCountry,Credit,numscreens,BankVerification,idscreen,VerifyMobile,Loan_all,Alerts,age,user
0,0,1,7,1,0,0,0,0,15,0,1,0,1,0,23,235136
1,0,1,5,1,1,0,1,0,13,0,0,0,1,0,24,333588
2,0,0,0,0,0,0,0,0,3,0,0,0,1,0,23,254414
3,1,1,6,1,1,0,0,3,40,0,0,0,1,0,28,234192


In [7]:
df['Alerts'].value_counts()

Alerts
0    46314
1     3686
Name: count, dtype: int64

In [8]:
df['enrolled'].value_counts()
#The data is pretty much balanced

enrolled
0    25213
1    24787
Name: count, dtype: int64

In [9]:
X=df.drop(['enrolled','user'],axis=1)
Y=df['enrolled']
X_user=df['user']

In [10]:
scale=StandardScaler()
X=scale.fit_transform(X)



In [11]:
df.shape

(50000, 16)

In [12]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.8)

#### Using random prediction

In [13]:
def random_pred(X,Y):
    res=[]
    for i in range(0,len(Y)):
        res.append(rd.choice([0,1]))
    return res


In [14]:
Y_random_pred=random_pred(X_train,Y_train)
rand_train_score=accuracy_score(Y_train,Y_random_pred)*100
rand_train_score

48.92

In [15]:
Y_random_test_pred=random_pred(X_test,Y_test)
rand_test_score=accuracy_score(Y_test,Y_random_test_pred)*100
rand_test_score

50.019999999999996

### Using logistic regression

In [16]:
log_reg=LogisticRegression()
log_reg.fit(X_train,Y_train)


In [17]:
Y_pred=log_reg.predict(X_train)
log_reg_train_score=accuracy_score(Y_train,Y_pred)*100
log_reg_train_score

78.63

In [18]:
Y_test_pred=log_reg.predict(X_test)
log_reg_test_score=accuracy_score(Y_test,Y_test_pred)*100
log_reg_test_score

78.3975

### Using SVM

In [19]:
svc=SVC(probability=True)
svc.fit(X_train,Y_train)


In [20]:
Y_train_pred_svc=svc.predict(X_train)
svc_train_score=accuracy_score(Y_train,Y_train_pred_svc)*100
svc_train_score

80.46

In [21]:
Y_test_pred_svc=svc.predict(X_test)
svc_test_score=accuracy_score(Y_test,Y_test_pred_svc)*100
svc_test_score

79.2925

### Using Decision Tree

In [22]:
dtc=DecisionTreeClassifier()
dtc.fit(X_train,Y_train)

In [23]:
Y_train_pred_dtc=dtc.predict(X_train)
dtc_train_score=accuracy_score(Y_train,Y_train_pred_dtc)*100
dtc_train_score

99.33

In [24]:
Y_test_pred_dtc=dtc.predict(X_test)
dtc_test_score=accuracy_score(Y_test,Y_test_pred_dtc)*100
dtc_test_score

70.65249999999999

### Using Random Classifier

In [25]:
rfc=RandomForestClassifier()
rfc.fit(X_train,Y_train)

In [26]:
Y_train_pred_rfc=rfc.predict(X_train)
rf_train_score=accuracy_score(Y_train,Y_train_pred_rfc)*100
rf_train_score

99.33

In [27]:
Y_test_pred_rfc=rfc.predict(X_test)
rf_test_score=accuracy_score(Y_test,Y_test_pred_rfc)*100
rf_test_score

77.3275

#### Scoreboard and observation

In [28]:
algo=["Random","Logistic Regression","SVM","Decision Tree","Random Forest"]
train_score=[rand_train_score,log_reg_train_score,svc_train_score,dtc_train_score,rf_train_score]
test_score=[rand_test_score,log_reg_test_score,svc_test_score,dtc_test_score,rf_test_score]
remark=["Random predictor","Good working","Score improved and best fit","Overfit","Overfit"]
tab={"Algorithm":algo,"Train Score":train_score,"Test Score":test_score,"Remarks":remark}
scoreboard=pd.DataFrame(tab)
scoreboard

Unnamed: 0,Algorithm,Train Score,Test Score,Remarks
0,Random,48.92,50.02,Random predictor
1,Logistic Regression,78.63,78.3975,Good working
2,SVM,80.46,79.2925,Score improved and best fit
3,Decision Tree,99.33,70.6525,Overfit
4,Random Forest,99.33,77.3275,Overfit


<p> Hence, we're going to consider the SVM model for prediction </p>

In [29]:
model_path=os.path.dirname(os.getcwd())+'\Models\svc_model.pkl'
scale_path=os.path.dirname(os.getcwd())+'\Models\scale.pkl'
joblib.dump(svc,filename=model_path)
joblib.dump(scale,filename=scale_path)

['c:\\Users\\asus\\Documents\\Projects\\App_Subscription_Analyser\\Models\\scale.pkl']

In [30]:
df.columns

Index(['enrolled', 'VerifyPhone', 'Other', 'VerifyDateOfBirth', 'location',
       'is_weekend_enrolleddate', 'VerifyCountry', 'Credit', 'numscreens',
       'BankVerification', 'idscreen', 'VerifyMobile', 'Loan_all', 'Alerts',
       'age', 'user'],
      dtype='object')

In [31]:
## Concatenating the dataset
train_preds=pd.DataFrame(svc.predict(X_train),columns=['Enrolled?'])
test_preds=pd.DataFrame(svc.predict(X_test),columns=['Enrolled?'])
train_preds_prob=pd.DataFrame(svc.predict_proba(X_train),columns=['Non Enroll score','Enroll score'])
test_preds_prob=pd.DataFrame(svc.predict_proba(X_test),columns=['Non Enroll score','Enroll score'])
df_train=pd.concat([train_preds,train_preds_prob],axis=1)
df_test=pd.concat([test_preds,test_preds_prob],axis=1)

In [32]:
df_train_test=pd.concat([df_train,df_test],axis=0).reset_index(drop=True)
df_user=pd.DataFrame(X_user.values,columns=['User']).reset_index(drop=True)
df_train_test.head(6)


Unnamed: 0,Enrolled?,Non Enroll score,Enroll score
0,1,0.138314,0.861686
1,0,0.819935,0.180065
2,0,0.825774,0.174226
3,1,0.157385,0.842615
4,1,0.303914,0.696086
5,1,0.130262,0.869738


In [33]:
df_full=df_user.join(df_train_test)
df_full.head(5)

Unnamed: 0,User,Enrolled?,Non Enroll score,Enroll score
0,235136,1,0.138314,0.861686
1,333588,0,0.819935,0.180065
2,254414,0,0.825774,0.174226
3,234192,1,0.157385,0.842615
4,51549,1,0.303914,0.696086


In [34]:
pred_path=os.path.dirname(os.getcwd())+'\Dataset\predicted_result.csv'
df_full.to_csv(pred_path,index=False)