In [5]:
import pandas as pd
import plotly as pl
import plotly.express as px
# Read the shoppers CSV (path is relative to the working directory) into the
# notebook-wide DATA frame that every later cell works from.
DATA = pd.read_csv(filepath_or_buffer="online_shoppers_intention.csv")

Task 1: Get data on screen
Goal: Make visualizations that help understand the data

In [None]:
# Pie chart of customer types (new vs. returning): one slice per distinct
# VisitorType value, sized by how many rows carry that value.
fig = px.pie(
    data_frame=DATA,
    names='VisitorType',
    title='Types of Customers',
)
fig.show()



# DURATION BOX PLOT
# ProductRelated_Duration has a handful (<10) of extreme outliers; rows where
# any of the three duration columns exceeds the threshold are left out so the
# boxes stay readable.
duration_columns = [
    'Administrative_Duration',
    'Informational_Duration',
    'ProductRelated_Duration',
]
threshold = 15000

# A row is kept only when every duration column is at or below the threshold.
within_threshold = DATA[duration_columns].le(threshold).all(axis=1)
filtered_data = DATA[within_threshold]

fig = px.box(data_frame=filtered_data, y=duration_columns)
fig.show()


# BOUNCERATES VS EXITRATES
# Scatter plot with one marker per row of DATA: bounce rate on the x axis,
# exit rate on the y axis.
fig = px.scatter(
    data_frame=DATA,
    x="BounceRates",
    y="ExitRates",
)
fig.show()


## PAGEVALUES PER MONTH
# groupby() returns the month labels sorted alphabetically (Aug, Dec, Feb, ...)
# and px.line joins the points in row order, so the rows are re-sorted into
# calendar order before plotting. Only the first three letters of each label
# are compared, so short and long spellings ("Jun" / "June") rank the same.
calendar_rank = {
    abbr: position
    for position, abbr in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    )
}
monthly_average = (
    DATA.groupby('Month')['PageValues']
    .mean()
    .reset_index()
    # Labels that are not recognised map to NaN and are therefore placed last.
    .sort_values(
        'Month',
        key=lambda months: months.astype(str).str[:3].str.title().map(calendar_rank),
    )
    .reset_index(drop=True)
)
# Create the line chart; category_orders pins the x axis to the sorted order.
fig = px.line(
    monthly_average,
    y="PageValues",
    x="Month",
    title="Average Page Values by Month",
    category_orders={'Month': monthly_average['Month'].tolist()},
)
fig.show()


# EXITRATES PER TRAFFICTYPE
# Plot the mean exit rate of each traffic type. Handing the raw rows to px.bar
# stacks one segment per row, so a bar's height becomes the *sum* of the rates
# and mainly reflects how many rows a traffic type has rather than how high
# its exit rate is.
exit_rate_by_traffic = DATA.groupby('TrafficType', as_index=False)['ExitRates'].mean()

fig = px.bar(exit_rate_by_traffic, x="TrafficType", y="ExitRates",
             color='TrafficType',
             title='Average Exit Rate by Traffic Type',
             height=400)
fig.show()

# PURCHASES PER MONTH
# Count the rows whose Revenue flag is True, per month.
purchase_counts = DATA[DATA['Revenue']].groupby('Month').size().reset_index(name='Count')

# groupby() sorts the month labels alphabetically; put them in calendar order
# so the bars read left to right through the year. Only the first three letters
# are compared, so "Jun" and "June" rank the same; unrecognised labels go last.
month_position = {
    abbr: position
    for position, abbr in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    )
}
purchase_counts = purchase_counts.sort_values(
    'Month',
    key=lambda months: months.astype(str).str[:3].str.title().map(month_position),
).reset_index(drop=True)

fig = px.bar(
    purchase_counts,
    x='Month',
    y='Count',
    title='Count of Purchases by Month',
    category_orders={'Month': purchase_counts['Month'].tolist()},
)
fig.show()

# PURCHASES CLOSE TO IMPORTANT DATES
# Keep only the rows flagged with Revenue, then count how many of them fall on
# each distinct SpecialDay value.
SD_purchase_data = DATA.loc[DATA['Revenue']]

SD_purchase_counts = (
    SD_purchase_data['SpecialDay']
    .value_counts()
    .rename_axis('SpecialDay')
    .reset_index(name='Count')
    .sort_values('SpecialDay')
)

fig = px.line(
    data_frame=SD_purchase_counts,
    x='SpecialDay',
    y='Count',
    title='Count of Purchases by Days to Special Day',
)

# Flip the x axis so the largest SpecialDay value is drawn on the left.
fig.update_layout(xaxis_autorange="reversed")

fig.show()

In [25]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
# ONE-HOT ENCODING (OHE)
# Column count and names before encoding, for comparison with the result below.
print(DATA.shape[1])
print(DATA.columns)

# candidates for one-hot-encoding (OHE)
featuresOHN = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType']

# Encode the listed columns as dense 0/1 indicator columns; every other column
# is passed through unchanged and placed after the encoded ones.
transformer = make_column_transformer(
    (OneHotEncoder(categories='auto', sparse_output=False), featuresOHN),
    remainder='passthrough',
)
transformed = transformer.fit_transform(DATA)

# Output names carry a "<transformer>__" prefix ("onehotencoder__" for encoded
# columns, "remainder__" for passed-through ones); strip it to get plain names.
flist = transformer.get_feature_names_out().tolist()
newflist = [
    col.replace('remainder__', '') if col.startswith('remainder__') else col.replace('onehotencoder__', '')
    for col in flist
]

# fit_transform hands back a single NumPy array for all columns together. When
# the passed-through columns mix dtypes (e.g. numeric columns next to the
# boolean Revenue flag) that array is object-typed, which would leave every
# column of the frame as `object`; infer_objects() restores proper
# numeric/bool dtypes.
dataOHN = pd.DataFrame(transformed, columns=newflist).infer_objects()

# Column count and names after encoding.
print(dataOHN.shape[1])
print(dataOHN.columns)

