# Data load and clean 

In [1]:
import pandas as pd 
import plotly.graph_objs as go
import plotly.express as px
import sweetviz as sv
from funcs import feature_engineer as fe

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load the raw usage log.
df_data = pd.read_csv("data/uselog.csv")

# 2. Clean the time information.
#    Parse the raw "timestamps of usage" strings into a datetime column, let the
#    project helper add its extra timestamp-based columns (presumably date /
#    time-of-day fields -- see funcs.feature_engineer), then keep only the rows
#    that actually have a timestamp.
df_data["timestamp"] = pd.to_datetime(df_data["timestamps of usage"])
df_data = fe.crt_multiple_timestamp_columns(df_data, "timestamp")
df_data = df_data[df_data["timestamp"].notna()]

# 3. Build the per-user feature table from the cleaned event log.
#    `ufe` is kept as a notebook-level name in case later cells reuse it.
df_user = (ufe := fe.UserFE(df_data)).crt_user_features()

# 4. Save the cleaned event log.
#    The raw "timestamps of usage" column is intentionally left in the frame.
df_data.to_csv("data/uselog_fe.csv")

# Data description & Basic EDA

In [4]:
# 1. Profile both tables with sweetviz to get the basic descriptive statistics.
#    Pairwise analysis is switched off, so only per-column summaries are built.

# 1-1. Function-usage events (author's note: 181,978 function calls).
report = sv.analyze(df_data, pairwise_analysis="off")
report.show_html('report/SweetvizBascInfo.html')

# 1-2. Per-user table (author's note: 237 people).
# NOTE(review): the df_user display elsewhere in this notebook ends at index 220
# -- confirm the user count.
report_user = sv.analyze(df_user, pairwise_analysis="off")
report_user.show_html('report/SweetvizUser.html')

# Count users whose "user type" string has two or more characters
# (e.g. "AS"), i.e. presumably users belonging to more than one type.
user_multitype_num = df_user["user type"].str.len().ge(2).sum()

print(f"Users with multi user type : {user_multitype_num}")
print(
    f"Timestamp Start : {df_data['timestamp'].min()}, "
    f"Timestamp End {df_data['timestamp'].max()}"
)

Feature: date                                |█████████████████████████████████████████| [100%]   00:04 -> (00:00 left)


Report report/SweetvizBascInfo.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


Feature: DayUsageRatio                       |█████████████████████████████████████████| [100%]   00:04 -> (00:00 left)

Report report/SweetvizUser.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
Users with multi user type : 42
Timestamp Start : 2023-04-01 00:00:00, Timestamp End 2023-06-30 23:58:00





In [3]:
# Show the per-user feature table built in the load/clean cell
# (pandas elides the middle rows when rendering).
df_user

Unnamed: 0,ID,place of residence,user type,TotalUsage,Morning,Afternoon,Evening,Midnight,DayUsage,DayUsageRatio
0,9343,P03,A,492,0.569106,0.302846,0.128049,0.000000,44,0.488889
1,24015,P05,AS,234,0.888889,0.089744,0.021368,0.000000,48,0.533333
2,2720,P01,AS,1549,0.191091,0.000000,0.540994,0.267915,64,0.711111
3,3891,P01,A,988,0.560729,0.260121,0.179150,0.000000,49,0.544444
4,6415,P02,AS,278,0.694245,0.223022,0.082734,0.000000,51,0.566667
...,...,...,...,...,...,...,...,...,...,...
217,18460,P01,L,14,0.000000,0.000000,0.571429,0.428571,7,0.077778
218,22504,P02,A,9,0.000000,0.000000,0.777778,0.222222,8,0.088889
219,4958,P02,S,16,0.375000,0.625000,0.000000,0.000000,15,0.166667
220,8535,P03,A,19,0.947368,0.052632,0.000000,0.000000,1,0.011111


# Plotting area

In [None]:
# Create the figure: a 10-bin histogram of the per-user 'Afternoon' column
# (presumably the share of each user's usage that happens in the afternoon).
#
# NOTE(review): this cell originally read from `df_proportions`, which is never
# defined in this notebook and would raise NameError on a fresh kernel.
# `df_user` is the frame that carries the 'Afternoon' column, so it is used
# here -- confirm this is the intended source.
# The placeholder `go.Figure()` was dropped because px.histogram returns its
# own Figure and the placeholder was overwritten immediately.
fig = px.histogram(df_user, x='Afternoon', nbins=10, title='Histogram of Values')
fig.show()