In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("online_shoppers_intention.csv") #Reading the dataset
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


# Preprocessing

checking for null and cleaning

In [3]:
print("The total null vcalues are:",df.isna().sum().sum())

The total null vcalues are: 0


# Encoding the columns

Find the columns to encode from a dictionary

In [4]:
{column: list(df[column].unique()) for column in df.columns if df.dtypes[column] == 'object'}

{'Month': ['Feb',
  'Mar',
  'May',
  'Oct',
  'June',
  'Jul',
  'Aug',
  'Nov',
  'Sep',
  'Dec'],
 'VisitorType': ['Returning_Visitor', 'New_Visitor', 'Other']}

In [5]:
""" def encode(df,column,order):
    df = df.copy()
    df[column] = df[column].apply(lambda x : order.index(x))
    return df

def encode_visit(df,column,prefix):
    df = df.copy()
    dummiescol = pd.get_dummies(df[column],prefix=prefix)
    df = pd.concat([df,dummiescol],axis=1)
    df = df.drop(column,axis=1)
    return df """

' def encode(df,column,order):\n    df = df.copy()\n    df[column] = df[column].apply(lambda x : order.index(x))\n    return df\n\ndef encode_visit(df,column,prefix):\n    df = df.copy()\n    dummiescol = pd.get_dummies(df[column],prefix=prefix)\n    df = pd.concat([df,dummiescol],axis=1)\n    df = df.drop(column,axis=1)\n    return df '

In [6]:
months = ['Jan','Feb','Mar','Apr','May','June','Jul','Aug','Sep','Oct','Nov','Dec'] #encoding these values with numbers

""" visitor = 'V' """

" visitor = 'V' "

In [7]:
#Using the encode function to apply month encoding 

""" encode(df,'Month',months)

encode_visit(df,'VisitorType',visitor) """

" encode(df,'Month',months)\n\nencode_visit(df,'VisitorType',visitor) "

In [8]:
df['Month'] = df['Month'].apply(lambda x : months.index(x))

In [9]:
dummiescol = pd.get_dummies(df['VisitorType'],prefix='V')
df = pd.concat([df,dummiescol],axis=1)
df = df.drop('VisitorType',axis=1)

In [10]:
#Making the weekend and revenue as boolean features

df['Weekend'] = df['Weekend'].astype(int)
df['Revenue'] = df['Revenue'].astype(int)

In [11]:
df #we have converted all the data into numerical values

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,Weekend,Revenue,V_New_Visitor,V_Other,V_Returning_Visitor
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,1,1,1,1,1,0,0,0,0,1
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,1,2,2,1,2,0,0,0,0,1
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,1,4,1,9,3,0,0,0,0,1
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,1,3,2,2,4,0,0,0,0,1
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,1,3,3,1,4,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,11,4,6,1,1,1,0,0,0,1
12326,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,10,3,2,1,8,1,0,0,0,1
12327,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,10,3,2,1,13,1,0,0,0,1
12328,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,10,2,2,3,11,0,0,0,0,1


# Train test split

In [12]:
y = df['Revenue'].copy() #Value  to be predicted
X = df.drop('Revenue',axis=1)


In [13]:
scalar = StandardScaler()

X = scalar.fit_transform(X)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7,random_state=20)

# Training the model

In [15]:
models = []
inv_reg_strength = [0.01,0.1,1.0,10.0,100.0]

for i in range(len(inv_reg_strength)):
    model = LogisticRegression(C=inv_reg_strength[i])
    model.fit(X_train,y_train)
    models.append(model)

In [16]:
model_acc = [model.score(X_test,y_test) for model in models]

print(f" Model accuracy 1",model_acc[0])
print(f" Model accuracy 2",model_acc[1])
print(f" Model accuracy 3",model_acc[2])
print(f" Model accuracy 4",model_acc[3])
print(f" Model accuracy 5",model_acc[4])

 Model accuracy 1 0.8832116788321168
 Model accuracy 2 0.8859151121924844
 Model accuracy 3 0.8861854555285212
 Model accuracy 4 0.8867261422005948
 Model accuracy 5 0.8867261422005948


***This concludes the logistic regression model***
