In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

In [4]:
# Load the raw CSV into a DataFrame and print a first look at it.
db = pd.read_csv("data.csv")
print("-----Viewing dataset-----")
print("\nPrinting first 5 rows from dataset")
print(db.head())  # head() defaults to the first 5 rows

# Keep rows 0..10000 and columns 1..10 (by position). Dropping column 0
# removes the leading column, which is not needed for the analysis.
db = db.iloc[:10001, 1:11]

print("\nNo of rows and columns in dataset are -")
print(db.shape)
print("\nDatatypes of each feature -")
print(db.dtypes)

-----Viewing dataset-----

Printing first 5 rows from dataset
   Unnamed: 0  IsVIP_500  payment_7_day  dau_days  \
0           0          0          15.98         6   
1           1          0           8.48         7   
2           2          0           3.49         7   
3           3          0           3.99         7   
4           4          0          38.00         3   

   days_between_install_first_pay  total_txns_7_day  total_page_views  \
0                               8                 3               266   
1                             372                 3               484   
2                             439                 2               504   
3                             570                 2               513   
4                            1741                 4                31   

   total_product_liked  product_like_rate  total_free_coupon_got  \
0                   95           0.357143                      9   
1                  118           0.243802   

In [None]:
# Count the two target classes. IsVIP_500 encoding: 0 => Non-VIP, 1 => VIP.
# A vectorised comparison yields the same boolean Series as the previous
# row-wise apply(), without invoking a Python lambda once per row.
check = db['IsVIP_500'] == 0
nonvip = int(check.sum())
print('\n\nNumber of Rows in dataframe which are Non-VIP : ', nonvip)
totalrows = db.shape[0]
# Every row that is not counted as Non-VIP is reported as VIP.
print('\nNumber of Rows in dataframe which are VIP : ' , (totalrows-nonvip))

In [None]:
# Separate the target from the features by column position:
# column 0 of `db` is the target, columns 1..9 are the nine features.
y = db.iloc[:, 0]
X = db.iloc[:, 1:10]

In [None]:
# Data cleaning: report missing / non-finite values, then neutralise them.
print("\n\tCheck: Any values are NULL :")
null_counts = X.isnull().sum()
print(null_counts)

print("\n\tCheck: All values are FINITE :")
# This count is taken over the whole frame `db` (target included), not only X.
finite_counts = np.isfinite(db).sum()
print(finite_counts)

print("\n\tCheck: NaN values :")
nan_counts = np.isnan(X).sum()
print(nan_counts)

# The original author noted 40 infinite values in product_like_rate.
# Both +/-infinity and NaN are replaced with 0 so that every feature value
# is finite before modelling.
X = X.replace([np.inf, -np.inf], 0).replace(np.nan, 0)

In [None]:
print("\n-----Visualisation-----")

# --- Pie chart: share of VIP vs Non-VIP rows -------------------------------
labels = ['Vip','Non-Vip']
colors = ['gold', 'red']
sizes = ((totalrows-nonvip),nonvip)  # same order as `labels`
plt.pie(
    sizes,
    colors=colors,
    autopct='%1.1f%%',
    pctdistance=1.1,
    labeldistance=1.2,
    shadow=True,
    startangle=90,
)
plt.legend(labels, loc="best")
plt.title("\nPie chart -> Distribution of majority and minority classes")
plt.tight_layout()
plt.show()
print("VIP's are very less therefore,data is imbalanced")

# --- Histogram of the feature at position 2 of X ---------------------------
# NOTE(review): judging by the head() output, position 2 of X looks like
# `days_between_install_first_pay`, while the labels below say "active days"
# -- confirm which column/label was intended.
No_of_active_days = X.iloc[:,2]
No_of_active_days.plot.hist(bins=30)
plt.xlabel("No_of_active_days")
plt.title("\nHistogram -> No. of active days")
plt.tight_layout()
plt.show()

# --- Bar chart: frequency of each value of the feature at position 1 of X --
Days_of_Customer_Login = X.iloc[:,1]
login_frequency = Days_of_Customer_Login.value_counts()
print("\nFrequency of Days of Customer Login\n")
print(login_frequency)
login_frequency.plot.bar()
plt.xlabel("Days of Customer Login")
plt.title("\nBar chart -> Customer login")
plt.tight_layout()
plt.show()

# --- Scatter plot: 7-day payment amount against the target -----------------
plt.scatter(db["payment_7_day"], y)
plt.xlabel("payments")
plt.ylabel("VIP")
plt.title("\nScatter plot -> Payment made in 7 days")
plt.tight_layout()
plt.show()

In [None]:
print("\n-----Splitting dataset-----")
# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
# stratify=y keeps the VIP / Non-VIP proportion identical in both partitions;
# with a heavily imbalanced target an unstratified split can leave the test
# set with too few minority rows for the evaluation to be meaningful.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1, stratify=y
)

In [None]:
#-------Resampling-------using SMOTE Algorithm
# Only the training partition is oversampled; the test set keeps its
# original class distribution.
print('Before Resampling VIP(1): {}'.format((y_train == 1).sum()))
print('Before Resampling Non-VIP(0): {} \n'.format((y_train == 0).sum()))

# SMOTE is imported with the other dependencies at the top of the notebook.
sm = SMOTE(random_state = 2)
# fit_resample() is the current imbalanced-learn API; the older fit_sample()
# alias has been removed from recent releases. The target is passed as a
# plain NumPy array (Series.ravel() is deprecated in recent pandas), so
# y_train_res comes back as an array as well.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

print('After Resampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After Resampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print('After Resampling, counts of VIP(1): {}'.format((y_train_res == 1).sum()))
print('After Resampling, counts of Non-VIP(0): {} \n'.format((y_train_res == 0).sum()))

In [None]:
#----------------------------
print("\nKNN classification technique-\n")

# Standardise the features. The scaler statistics come from the original
# (pre-resampling) training rows and are then applied to the resampled
# training set and to the test set.
scaler = StandardScaler().fit(X_train)
# NOTE: X_train_res and X_test are overwritten with their scaled versions and
# are reused by the later model cells; re-running this cell on its own scales
# them a second time.
X_train_res = scaler.transform(X_train_res)
X_test = scaler.transform(X_test)

# Train a 5-nearest-neighbours classifier and predict the test set.
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train_res, y_train_res)
y_pred = classifier.predict(X_test)
print(y_test)
print(y_pred)

# Evaluate: confusion matrix, overall accuracy and per-class report.
knn_confusion = confusion_matrix(y_test, y_pred)
print(knn_confusion)
knn_acc = accuracy_score(y_test, y_pred)
print("Accuracy:",round(knn_acc*100,3),"%")
knn_report = classification_report(y_test, y_pred,target_names=['NON-VIP', 'VIP'])
print(knn_report)

In [None]:
print("\nLogistic regression-\n")

# Logistic regression with scikit-learn's default parameters, trained on
# X_train_res / y_train_res and evaluated on X_test as left by earlier cells.
logreg = LogisticRegression()
logreg.fit(X_train_res,y_train_res)
y_pred = logreg.predict(X_test)
print(y_test)
print(y_pred)

# Evaluate: confusion matrix, overall accuracy and per-class report.
log_confusion = confusion_matrix(y_test,y_pred)
print("Confusion matrix:\n",log_confusion)
log_acc = accuracy_score(y_test, y_pred)
print("Accuracy:",round(log_acc*100,3),"%")
log_report = classification_report(y_test, y_pred,target_names=['NON-VIP', 'VIP'])
print(log_report)

In [None]:
print("\nDecision tree-\n")
# Create the decision tree classifier. A fixed random_state makes the fitted
# tree (and therefore the reported accuracy) reproducible from run to run,
# in line with the seeded train/test split and the seeded SMOTE step.
clf = DecisionTreeClassifier(random_state=1)
# Train on the resampled training data.
clf = clf.fit(X_train_res,y_train_res)
# Predict the response for the test dataset.
y_pred = clf.predict(X_test)
print(y_test)
print(y_pred)
# Evaluate: confusion matrix, overall accuracy and per-class report.
print("Confusion matrix:\n",confusion_matrix(y_test,y_pred))
desc_acc = accuracy_score(y_test, y_pred)
print("Accuracy:",round(desc_acc*100,3),"%")
print(classification_report(y_test, y_pred,target_names=['NON-VIP', 'VIP']))

In [None]:
#Comparing accuracy: announce the classifier with the highest test accuracy.
# Ties favour KNN first, then Logistic Regression, then the Decision tree.
knn_at_least_log = knn_acc >= log_acc
if knn_at_least_log and knn_acc >= desc_acc:
    print("Accuracy of KNN classifier is maximum among all which is :", round((knn_acc*100),3),"%")
elif not knn_at_least_log and log_acc >= desc_acc:
    print("Accuracy of Logistic Regression is maximum among all which is :", round((log_acc*100),3),"%")
elif knn_at_least_log:
    # Decision tree beat KNN, which had already matched or beaten Logistic Regression.
    print("Accuracy of Decision tree is maximum among all which is :", round((desc_acc*100),3),"%")
else:
    # Decision tree beat Logistic Regression, which had beaten KNN.
    # This message ends with a trailing newline, unlike the one above.
    print("Accuracy of Decision tree is maximum among all which is :",round((desc_acc*100),3),"%\n")

In [None]:
#Comparison graph: one bar per classifier showing its test accuracy in percent.
acc = [round((knn_acc*100),3),round((log_acc*100),3),round((desc_acc*100),3)]
# Tick labels carry the accuracy value above the classifier name.
label = [
    '%.2f per\nKNN' % acc[0],
    '%.2f per\nLogistic\nRegression' % acc[1],
    '%.2f per\nDecision tree' % acc[2],
]
# Bar positions 0..n-1. `index` was previously never defined, so this cell
# raised NameError on a fresh kernel.
index = np.arange(len(label))
plt.bar(index, acc,align='center',color=(0.2, 0.4, 0.6, 0.6),width=0.45)
plt.xlabel('Classifiers', fontsize=15)
plt.ylabel('Accuracy in %', fontsize=15)
plt.xticks(index, label, fontsize=14,color='b')
plt.title('Classification technique Accuracy comparision',fontsize=17,color='g')
plt.tight_layout()
plt.show()