In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#to scale the data using z-score 
from sklearn.preprocessing import StandardScaler

#importing PCA and TSNE
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


from matplotlib import style

# draw every figure in this notebook with matplotlib's dark theme
style.use("dark_background")

In [3]:
# load the gzip-compressed CSV of shopper sessions into the raw dataframe
data_raw = pd.read_csv(filepath_or_buffer="dat/online_shoppers_intention.csv.gz")


In [4]:
# review the shape, column info and first rows of the data
print(data_raw.shape)
data_raw.info()  # prints its summary itself and returns None, so it is not part of the cell result
data_raw.head()  # last expression -> rendered as the cell output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

((12330, 18),
    Administrative  Administrative_Duration  Informational  \
 0               0                      0.0              0   
 1               0                      0.0              0   
 2               0                      0.0              0   
 3               0                      0.0              0   
 4               0                      0.0              0   
 
    Informational_Duration  ProductRelated  ProductRelated_Duration  \
 0                     0.0               1                 0.000000   
 1                     0.0               2                64.000000   
 2                     0.0               1                 0.000000   
 3                     0.0               2                 2.666667   
 4                     0.0              10               627.500000   
 
    BounceRates  ExitRates  PageValues  SpecialDay Month  OperatingSystems  \
 0         0.20       0.20         0.0         0.0   Feb                 1   
 1         0.00       0.10  

In [15]:
# storing numerical and categorical feature names

col_names = data_raw.columns.tolist()
dtypes = data_raw.dtypes.tolist()

# int64/float64 columns are numeric; everything else (object, bool) is treated as categorical
numeric_features = data_raw.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = [name for name in col_names if name not in numeric_features]

numeric_features , categorical_features

(['Administrative',
  'Administrative_Duration',
  'Informational',
  'Informational_Duration',
  'ProductRelated',
  'ProductRelated_Duration',
  'BounceRates',
  'ExitRates',
  'PageValues',
  'SpecialDay',
  'OperatingSystems',
  'Browser',
  'Region',
  'TrafficType'],
 ['Month', 'VisitorType', 'Weekend', 'Revenue'])

In [6]:
# fraction of all observations that ended in a purchase (mean of the Revenue target)
data_raw.loc[:, "Revenue"].mean()

0.15474452554744525

In [7]:
# number of distinct values per feature (a NaN, if present, counts as one value)

for col_name, col_values in data_raw.items():
    print(col_name, col_values.nunique(dropna=False))



Administrative 27
Administrative_Duration 3335
Informational 17
Informational_Duration 1258
ProductRelated 311
ProductRelated_Duration 9551
BounceRates 1872
ExitRates 4777
PageValues 2704
SpecialDay 6
Month 10
OperatingSystems 8
Browser 13
Region 9
TrafficType 20
VisitorType 3
Weekend 2
Revenue 2


In [8]:
# list the distinct values of every feature that has at most 30 of them

for col_name, col_values in data_raw.items():
    distinct = col_values.unique()
    if len(distinct) > 30:
        continue  # too many values to be worth printing
    print(col_name)
    print(distinct)
    print()


Administrative
[ 0  1  2  4 12  3 10  6  5  9  8 16 13 11  7 18 14 17 19 15 24 22 21 20
 23 27 26]

Informational
[ 0  1  2  4 16  5  3 14  6 12  7  9 10  8 11 24 13]

SpecialDay
[0.  0.4 0.8 1.  0.2 0.6]

Month
['Feb' 'Mar' 'May' 'Oct' 'June' 'Jul' 'Aug' 'Nov' 'Sep' 'Dec']

OperatingSystems
[1 2 4 3 7 6 8 5]

Browser
[ 1  2  3  4  5  6  7 10  8  9 12 13 11]

Region
[1 9 2 3 4 5 6 7 8]

TrafficType
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 18 19 16 17 20]

VisitorType
['Returning_Visitor' 'New_Visitor' 'Other']

Weekend
[False  True]

Revenue
[False  True]



In [9]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [32]:
# YOUR CODE HERE (imports!)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report




def train(X, y, standardize = True, test_size = 0.2, random_state = 0) -> None:
    """Fit a class-weighted logistic regression and print its hold-out report.

    The data is split into train and test parts, a pipeline is fitted on the
    training part, and a classification report for the test part is printed.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn estimators.
    y : target labels aligned with the rows of ``X``.
    standardize : when True, a ``StandardScaler`` is fitted on the training
        split and applied before the classifier; when False the classifier
        sees the features unchanged.
    test_size : fraction of the rows held out for evaluation.
    random_state : seed used for both the split and the classifier.

    Returns
    -------
    None. The classification report is written to stdout.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Build one flat pipeline; the scaler is the only step that is optional,
    # so the classifier step is defined once and named the same in both modes.
    steps = []
    if standardize:
        steps.append(("scaler", StandardScaler()))
    steps.append(
        (
            "logistic_regression",
            # class_weight="balanced" reweights classes inversely to their frequency
            LogisticRegression(class_weight="balanced", random_state=random_state),
        )
    )
    pipeline = Pipeline(steps=steps)

    # The scaler (if any) is fitted on the training split only.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print(classification_report(y_test, y_pred))

In [34]:
# syntax to save data into csv later
#df_reduced.to_csv("dat/UNR_UDD_reduced.csv")

In [11]:
# encode the boolean target as integers (True -> 1, False -> 0)
# Revenue holds real booleans, so replacing the strings "True"/"False" would match nothing
df_temp = data_raw["Revenue"].astype(int)
(df_temp == 0).sum(), (df_temp == 1).sum(), df_temp.mean()

(10422, 1908, 0.15474452554744525)