In [451]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, Series
from sklearn.preprocessing import StandardScaler

In [457]:
# Load the raw dataset. The path is relative, so the CSV must sit in the
# notebook's working directory.
phone_data = pd.read_csv("user_behavior_dataset.csv")

# Preview the first rows of the raw frame (up to 20).
phone_data.head(20)

Unnamed: 0,User ID,Device Model,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class
0,1,Google Pixel 5,Android,393,6.4,1872,67,1122,40,Male,4
1,2,OnePlus 9,Android,268,4.7,1331,42,944,47,Female,3
2,3,Xiaomi Mi 11,Android,154,4.0,761,32,322,42,Male,2
3,4,Google Pixel 5,Android,239,4.8,1676,56,871,20,Male,3
4,5,iPhone 12,iOS,187,4.3,1367,58,988,31,Female,3
5,6,Google Pixel 5,Android,99,2.0,940,35,564,31,Male,2
6,7,Samsung Galaxy S21,Android,350,7.3,1802,66,1054,21,Female,4
7,8,OnePlus 9,Android,543,11.4,2956,82,1702,31,Male,5
8,9,Samsung Galaxy S21,Android,340,7.7,2138,75,1053,42,Female,4
9,10,iPhone 12,iOS,424,6.6,1957,75,1301,42,Male,4


In [453]:
# One boolean per column: True if that column contains at least one missing value.
phone_data.isnull().any() # Check for nulls

User ID                       False
Device Model                  False
Operating System              False
App Usage Time (min/day)      False
Screen On Time (hours/day)    False
Battery Drain (mAh/day)       False
Number of Apps Installed      False
Data Usage (MB/day)           False
Age                           False
Gender                        False
User Behavior Class           False
dtype: bool

In [454]:
# Preprocessing and data cleaning
# 1. Cleaning up null values
# 2. Data cleaning (dashes, odd characters, etc)
# 3. Remove extreme outliers
# 4. One-Hot encoding
# 5. Convert categorical values to numerical (sometimes that is the same as 4)
# 6. Standardization/Normalization
# 7. Deal with multicollinearity (can be caused by 4)

# I want to manually encode the values of binary columns, such as Operating System and Gender, as 0 and 1. For string columns with more than two values, such as Device Model, however, I will use one-hot encoding to minimize potential errors.

# First, though, null values need to be handled. I will replace null values in binary or ordinal columns with the column's mode, since for a binary column the mode has at least a 50% chance of being the "right" entry, and I will use the column's median for numerical columns.

# Miscellaneous null cleaning, not needed as seen above, but might as well!
def _first_mode(series):
    """Return the most frequent non-null value of ``series``, or None if it has none."""
    modes = series.mode()
    return None if modes.empty else modes.iloc[0]


def cleanNull(df: "DataFrame | None" = None) -> DataFrame:
    """Return a copy of ``df`` with missing values imputed column by column.

    * Numeric columns (any int/float width, not only 64-bit) are filled with
      the column median.
    * Text, categorical and boolean columns are filled with the column mode
      (the first one when several values tie).
    * Columns of any other dtype (e.g. datetimes), and columns that have no
      non-null value to derive a fill from, are left untouched.

    Args:
        df: Frame to clean. When omitted, the notebook-level ``phone_data``
            is looked up at call time.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    if df is None:
        # Late lookup: a default bound in the signature would keep pointing at
        # whatever object ``phone_data`` named when this cell was executed.
        df = phone_data

    dtypes = pd.api.types
    cleaned = df.copy()  # keep the caller's frame intact so the cell is re-runnable
    for col in cleaned.columns:
        series = cleaned[col]
        if not series.isna().any():
            continue

        if dtypes.is_bool_dtype(series):
            # Booleans count as numeric in pandas, but a median is meaningless for them.
            fill = _first_mode(series)
        elif dtypes.is_numeric_dtype(series):
            fill = series.median()
            if dtypes.is_integer_dtype(series) and not pd.isna(fill):
                # Only nullable integer columns can hold nulls; they reject a
                # fractional fill value, so round the median to an integer.
                fill = int(round(fill))
        elif dtypes.is_object_dtype(series) or isinstance(
            series.dtype, (pd.StringDtype, pd.CategoricalDtype)
        ):
            fill = _first_mode(series)
        else:
            continue

        # An all-null column yields no usable median/mode; leave it as it is.
        if fill is None or pd.isna(fill):
            continue
        cleaned[col] = series.fillna(fill)
    return cleaned

# Impute missing values first, so the later text clean-up and encodings run on complete columns.
phone_data = cleanNull(phone_data)

def _normalize_text(value):
    """Strip surrounding whitespace and lowercase ``value`` if it is a string."""
    if isinstance(value, str):
        return value.strip().lower()
    return value


def dataClean(df: "DataFrame | None" = None) -> DataFrame:
    """Return a copy of ``df`` whose text columns are trimmed and lowercased.

    Every string in an object- or string-typed column has its leading and
    trailing whitespace removed and is converted to lowercase. Non-string
    entries in those columns (numbers, nulls, ...) are passed through
    unchanged, and all other columns are not touched.

    Args:
        df: Frame to clean. When omitted, the notebook-level ``phone_data``
            is looked up at call time.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    if df is None:
        # Late lookup: a default bound in the signature would keep pointing at
        # whatever object ``phone_data`` named when this cell was executed.
        df = phone_data

    cleaned = df.copy()  # keep the caller's frame intact so the cell is re-runnable
    text_columns = cleaned.select_dtypes(include=["object", "string"]).columns
    for col in text_columns:
        # Element-wise instead of the ``.str`` accessor: ``.str`` replaces
        # non-string entries of a mixed object column with NaN.
        cleaned[col] = cleaned[col].map(_normalize_text)
    return cleaned

# Normalise the text columns before the label mappings below, which match lowercase keys only.
phone_data = dataClean(phone_data)

# Manual encoding for Operating System
def std_os(col: pd.Series) -> pd.Series:
    """Encode operating-system labels as integers: 'ios' -> 0, 'android' -> 1.

    Labels are matched exactly, so they must already be lowercase; any value
    that is not one of the two keys comes back as NaN.
    """
    os_codes = dict(ios=0, android=1)
    return col.map(os_codes)

# Replace the text labels with their integer codes. Not idempotent: running this
# line a second time maps the already-encoded 0/1 values to NaN, because std_os
# only recognises 'ios' and 'android'.
phone_data['Operating System'] = std_os(phone_data['Operating System'])

# Manual encoding for Gender
def std_gender(col: pd.Series) -> pd.Series:
    """Encode gender labels as integers: 'male' -> 0, 'female' -> 1.

    Labels are matched exactly, so they must already be lowercase; any value
    that is not one of the two keys comes back as NaN.
    """
    gender_codes = dict(male=0, female=1)
    return col.map(gender_codes)

# Replace the text labels with their integer codes. Not idempotent: running this
# line a second time maps the already-encoded 0/1 values to NaN, because
# std_gender only recognises 'male' and 'female'.
phone_data['Gender'] = std_gender(phone_data['Gender'])



# One hot encode all categorical string columns
def encode(df: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """One-hot encode the text/categorical columns of ``df``.

    Each column that ``pd.get_dummies`` treats as categorical is replaced by
    one indicator column per distinct value, named ``<column>_<value>``
    (e.g. ``Device Model_iphone 12``). All other columns are passed through
    unchanged, so a frame without any such column comes back with the same
    columns and values.

    Args:
        df: Frame to encode. When omitted, the notebook-level ``phone_data``
            is looked up at call time.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    if df is None:
        # Late lookup: a default bound in the signature would keep pointing at
        # the pre-encoding frame after ``phone_data`` is rebound to the result.
        df = phone_data

    # One call handles every eligible column. No per-column loop is needed, and
    # the result is always defined, even when there is nothing to encode.
    return pd.get_dummies(df)

# Rebind phone_data to the one-hot encoded frame. After the two manual mappings
# above, Device Model is the remaining text column to be expanded.
phone_data = encode(phone_data)



In [455]:
# Persist the preprocessed frame; index=False keeps the row index out of the CSV.
phone_data.to_csv('user_behavior_preprocessed.csv', index = False)

In [456]:
# Preview the preprocessed frame to check the encoded columns.
phone_data.head(20)

Unnamed: 0,User ID,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class,Device Model_google pixel 5,Device Model_iphone 12,Device Model_oneplus 9,Device Model_samsung galaxy s21,Device Model_xiaomi mi 11
0,1,1,393,6.4,1872,67,1122,40,0,4,True,False,False,False,False
1,2,1,268,4.7,1331,42,944,47,1,3,False,False,True,False,False
2,3,1,154,4.0,761,32,322,42,0,2,False,False,False,False,True
3,4,1,239,4.8,1676,56,871,20,0,3,True,False,False,False,False
4,5,0,187,4.3,1367,58,988,31,1,3,False,True,False,False,False
5,6,1,99,2.0,940,35,564,31,0,2,True,False,False,False,False
6,7,1,350,7.3,1802,66,1054,21,1,4,False,False,False,True,False
7,8,1,543,11.4,2956,82,1702,31,0,5,False,False,True,False,False
8,9,1,340,7.7,2138,75,1053,42,1,4,False,False,False,True,False
9,10,0,424,6.6,1957,75,1301,42,0,4,False,True,False,False,False
