# Shopping Flag and Probability model

### Preprocessing Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw preprocessing extract and preview its first rows.
data=pd.read_csv('Preprocessing.csv')
data.head()

In [None]:
# Convert the categorical columns of `data` into dummy/indicator columns.
# NOTE(review): this cell appears before the cell that replaces '?' with NaN,
# so any '?' placeholders would be encoded as their own category here --
# confirm that this ordering is intended.
data_origin= pd.get_dummies(data)
data_origin.head()

In [None]:
# Flag missing values: entries stored as '?' are converted to NaN so that
# pandas can count them (and an imputer can later fill them).
# SimpleImputer is the successor of sklearn.preprocessing.Imputer, which is no
# longer available in current scikit-learn releases.
from sklearn.impute import SimpleImputer

data[data == '?'] = np.nan  # convert '?' to NaN
print(data.isnull().sum())  # number of NaN values per column
# TODO: impute the missing data (the imputer is imported but not applied yet)

### Analysis

In [None]:
# Load the customer data used for the exploratory analysis and preview it.
Customer= pd.read_csv('Logistics.csv')
Customer.head()
# Author's code legend (presumably for the GENDER column): 1-FEMALE, 2-MALE, 3-UNKNOWN
# NOTE(review): the KDE cell further down selects GENDER == 0 as its third
# group -- confirm whether UNKNOWN is coded as 0 or 3.

In [None]:
Customer.info() # column dtypes and non-null counts, which reveal missing values per attribute

In [None]:
# Descriptive statistics (count, mean, spread, quartiles) for the columns.
Customer.describe()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how often each value of next_12mo_buyer_flg occurs.
fig = plt.figure()
fig.set_alpha(0.2)  # figure transparency

Customer['next_12mo_buyer_flg'].value_counts().plot.bar()
plt.title('Buyer_Flag in 12 months')

In [None]:
# 2x3 grid of exploratory plots: two histograms on the top row and three
# scatter plots against the buyer flag on the bottom row.

# Top-left: histogram of NTH_ORDER, x-axis clipped to 0..50.
plt.subplot2grid((2, 3), (0, 0))
plt.hist(Customer['NTH_ORDER'], bins=100, color='red')
plt.title('Nth Order')
plt.ylabel('Amount')
plt.xlim(0, 50)

# Top-right (two columns wide): histogram of TENURE_IN_DAYS.
plt.subplot2grid((2, 3), (0, 1), colspan=2)
plt.hist(Customer['TENURE_IN_DAYS'], bins=20, edgecolor='None', color='green')
plt.title('Tenure')

# Bottom-left: LIFETIME_REVENUE against the buyer flag.
plt.subplot2grid((2, 3), (1, 0))
plt.scatter(Customer['next_12mo_buyer_flg'], Customer['LIFETIME_REVENUE'], color='orange')
plt.title('Lifetime Revenue')

# Bottom-middle: AVG_LAG against the buyer flag.
plt.subplot2grid((2, 3), (1, 1))
plt.scatter(Customer['next_12mo_buyer_flg'], Customer['AVG_LAG'])
plt.title('Average Lag')

# Bottom-right: DPT (dollar per transaction) against the buyer flag.
plt.subplot2grid((2, 3), (1, 2))
plt.scatter(Customer['next_12mo_buyer_flg'], Customer['DPT'], color='purple')
plt.title('DPT')

In [None]:
# Kernel density estimate of UPT for each gender code, overlaid on one axis.
# Each curve carries its own label so the legend cannot drift out of sync with
# the plotting order.
# NOTE(review): the legend noted where the data is loaded lists UNKNOWN as
# code 3, while this cell filters on 0 -- confirm which code is correct.
for gender_code, gender_label in ((1, 'FEMALE'), (2, 'MALE'), (0, 'UNKNOWN')):
    Customer.UPT[Customer.GENDER == gender_code].plot(kind='kde', label=gender_label)
plt.xlabel('UPT')
plt.ylabel('Density')
plt.legend()

### Logistic Regression Model

In [None]:
# Reload the customer file into a separate frame for the logistic regression model.
Customer_L = pd.read_csv('Logistics.csv')
Customer_L.head()

In [None]:
# Use the customer id as the row index so it is carried along as a label
# instead of being treated as a model feature.
Customer_L = Customer_L.set_index('CUSTOMER_DIM_ID')
Customer_L.head()

In [None]:
# Separate the feature matrix from the target column (next_12mo_buyer_flg).
Customer_L = pd.DataFrame(Customer_L)
X_L = Customer_L.drop(columns='next_12mo_buyer_flg')
y_L = Customer_L['next_12mo_buyer_flg']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train_L, X_test_L, y_train_L, y_test_L = train_test_split(
    X_L, y_L, test_size=0.2, random_state=42
)

# Fit on the training split, then produce both hard class predictions and
# per-class probabilities for the held-out rows.
logreg = LogisticRegression().fit(X_train_L, y_train_L)
y_pred_L = logreg.predict(X_test_L)
y_pred_proba_L = logreg.predict_proba(X_test_L)

In [None]:

# Attach the predicted class probabilities to the test features, keyed by the
# same index, and sort ascending by the probability of flag = 1.
# The probabilities are computed here rather than read from y_pred_proba_L,
# because the ROC cell rebinds that name to a 1-D array; re-running this cell
# afterwards would otherwise fail when assigning two column names.
# Assumes logreg.classes_ is ordered [0, 1] -- TODO confirm.
result_L = pd.DataFrame(
    logreg.predict_proba(X_test_L),
    columns=['flag = 0', 'flag = 1'],
    index=X_test_L.index,
)
df_out = pd.concat([X_test_L, result_L], axis=1).sort_values(by='flag = 1')

In [None]:
# Preview the first rows of the sorted prediction table.
df_out.head()

In [None]:
# Export the test features together with their predicted probabilities (index included).
df_out.to_csv("Logistics_predictions.csv")

In [None]:
# Mean accuracy of the logistic regression on the held-out test split.
logreg.score(X_test_L, y_test_L)

#### ROC Curve - Threshold

In [None]:
from sklearn.metrics import roc_curve
# Keep only the second probability column (index 1) as the score for the ROC curve.
# NOTE: this rebinds y_pred_proba_L from the two-column array produced when the
# model was fitted to a 1-D array.
y_pred_proba_L = logreg.predict_proba(X_test_L)[:,1]
# False/true positive rates at each candidate decision threshold.
fpr, tpr, thresholds = roc_curve(y_test_L, y_pred_proba_L)

In [None]:
# ROC curve of the logistic regression against the diagonal drawn as a dashed
# reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Logistic Regression')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Logistic Regression ROC Curve')
plt.legend()  # without this call the curve's label is never displayed
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve, computed from the scores in y_pred_proba_L.
roc_auc_score(y_test_L, y_pred_proba_L)
# A larger area under the ROC curve indicates a better model.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix and per-class metrics for the hard predictions on the test split.
print(confusion_matrix(y_test_L, y_pred_L))
print(classification_report(y_test_L, y_pred_L))

# precision = TP / (TP + FP)
# recall    = TP / (TP + FN)


# precision - fraction of retrieved instances that are relevant
# recall    - fraction of relevant instances that are retrieved
# F1 score  - harmonic mean of precision and recall
# support   - number of test samples in each class

### K-Nearest Neighbors Model

In [None]:
# Load the KNN data set, using the first CSV column as the row index.
Customer_K = pd.read_csv('K-Nearest Neighbors.csv', index_col=0)
Customer_K.head()

# Split into NumPy arrays: every column except the flag as features, the flag
# itself as the target.
X_K = Customer_K.drop(columns='next_12mo_buyer_flg').values
y_K = Customer_K['next_12mo_buyer_flg'].values

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so both parts keep the class proportions of y_K.
X_train_K, X_test_K, y_train_K, y_test_K = train_test_split(
    X_K, y_K, test_size=0.2, random_state=42, stratify=y_K
)

# 4-nearest-neighbours classifier; the final expression is its accuracy on the
# held-out split.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train_K, y_train_K)
knn.score(X_test_K, y_test_K)

In [None]:
# Hard class predictions of the KNN model for the held-out rows.
y_pred=knn.predict(X_test_K)
print("prediction:{}".format(y_pred))

In [None]:
# Per-class probabilities of the KNN model for the held-out rows, exported to
# CSV without the row index.
y_pred_proba = knn.predict_proba(X_test_K)
print("prediction:{}".format(y_pred_proba))
# Only `pd` is imported in this notebook, so the constructor must be qualified;
# a bare `DataFrame` raises NameError.
result_K = pd.DataFrame(y_pred_proba)
result_K.columns = ['flag = 0', 'flag = 1']
result_K.to_csv("KNeighbors_predictions_test.csv", index=False)

### Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Reuse the KNN data frame: all columns except the flag are features, the flag
# is the regression target.
X = Customer_K.drop(columns='next_12mo_buyer_flg').values
y = Customer_K['next_12mo_buyer_flg'].values

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Ordinary least squares fit; note that y_pred is rebound here and no longer
# holds the KNN predictions.
reg_all = LinearRegression().fit(X_train, y_train)
y_pred = reg_all.predict(X_test)

In [None]:
# R^2 (coefficient of determination) of the linear model on the held-out split.
reg_all.score(X_test, y_test)

#### 5-fold cross-validation

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation refits the estimator on each fold, so it is run on
# the full feature matrix and target rather than on the 20% test split, which
# would discard most of the data.
cv_scores = cross_val_score(reg_all, X, y, cv=5)
cv_scores