In [52]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


<h4>Reading dataset</h4>

In [28]:
# Read the user login anomaly dataset.
# dropna() returns a new DataFrame instead of modifying the original, so it is
# chained onto the load; calling it without keeping the result would leave
# rows with missing values in `df`.
df = pd.read_csv('user-login-anamoly.csv', encoding='unicode_escape').dropna()
# Summary of columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   prefered_device     1000 non-null   int64
 1   registered_ip       1000 non-null   int64
 2   average_spend       1000 non-null   int64
 3   multiple_sessions   1000 non-null   int64
 4   preferred_location  1000 non-null   int64
 5   preferred_category  1000 non-null   int64
 6   is_legitimate_user  1000 non-null   int64
dtypes: int64(7)
memory usage: 54.8 KB


<h4>Verifying data</h4>

In [35]:
# Show the rows where both 'prefered_device' and 'average_spend' equal 1
device_is_one = df['prefered_device'].eq(1)
spend_is_one = df['average_spend'].eq(1)
df[device_is_one & spend_is_one]

Unnamed: 0,prefered_device,registered_ip,average_spend,multiple_sessions,preferred_location,preferred_category,is_legitimate_user
6,1,0,1,0,0,1,1
8,1,0,1,0,1,0,1
24,1,0,1,0,0,0,1
25,1,1,1,0,1,1,1
27,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...
981,1,1,1,1,0,0,0
986,1,1,1,1,0,1,0
990,1,1,1,0,1,1,1
993,1,1,1,0,0,1,1


In [37]:
# Pairwise correlation between all columns (Pearson is the pandas default)
df.corr(method='pearson')

Unnamed: 0,prefered_device,registered_ip,average_spend,multiple_sessions,preferred_location,preferred_category,is_legitimate_user
prefered_device,1.0,-0.028677,0.009003,-0.003916,-0.023331,0.027286,0.151265
registered_ip,-0.028677,1.0,0.027316,-0.039767,-0.049815,-0.034571,0.02331
average_spend,0.009003,0.027316,1.0,-0.084629,0.020576,0.023873,0.463014
multiple_sessions,-0.003916,-0.039767,-0.084629,1.0,-0.028336,0.012342,-0.560683
preferred_location,-0.023331,-0.049815,0.020576,-0.028336,1.0,-0.009325,0.238162
preferred_category,0.027286,-0.034571,0.023873,0.012342,-0.009325,1.0,0.032495
is_legitimate_user,0.151265,0.02331,0.463014,-0.560683,0.238162,0.032495,1.0


<h4>Splitting Feature and Target variables</h4>

In [41]:
# Target is the 'is_legitimate_user' column; every remaining column is a feature
target_variables = df['is_legitimate_user']
feature_variables = df.drop(columns=['is_legitimate_user'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    feature_variables,
    target_variables,
    test_size=0.2,
    random_state=42,
)

<h4>Initializing LogisticRegression Model</h4>

In [43]:
## Create a logistic regression classifier with the library's default settings
model = LogisticRegression()
## Train the classifier on the training split
model.fit(X=X_train, y=y_train)

<h4>Predicting against test data</h4>

In [53]:
## Predict labels for the held-out test data
y_pred = model.predict(X_test)
## Save the trained user anomaly prediction model.
## joblib.dump does not create missing parent directories, so make sure the
## target folder exists first (no-op if it is already there).
model_path = "./models/user_anamoly_prediction_model.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, filename=model_path)

['./models/user_anamoly_prediction_model.joblib']

<h4>Model Accuracy</h4>

In [45]:
# Fraction of test-set predictions that match the true labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.995


<h4>Testing with custom Inputs</h4>

In [51]:
# Feature columns, in the order used for every hand-written row below
feature_columns = [
    "prefered_device",
    "registered_ip",
    "average_spend",
    "multiple_sessions",
    "preferred_location",
    "preferred_category",
]

# One tuple per scenario; values follow the order of feature_columns
custom_rows = [
    (1, 1, 1, 0, 1, 1),  # Checking for a legitimate user
    (1, 1, 1, 1, 1, 1),  # Checking for users with multiple sessions
    (1, 0, 0, 0, 0, 0),  # Checking for users with irregular spend and a non-preferred category
]

# Expand each tuple into a {column: value} record
custom_input = [dict(zip(feature_columns, row)) for row in custom_rows]

custom_check_df = pd.DataFrame(custom_input, columns=feature_columns)

# Predicted label for each scenario
model.predict(custom_check_df)

array([1, 0, 0])