In [42]:
import pandas as pd
import numpy as np

In [43]:
# Read the user-behavior CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='user_behavior_dataset.csv')

In [44]:
# Print the DataFrame as plain text. In the recorded output pandas abbreviates
# the middle rows ("..") and wraps the columns across several blocks.
print(df)

     User ID        Device Model Operating System  App Usage Time (min/day)  \
0          1      Google Pixel 5          Android                       393   
1          2           OnePlus 9          Android                       268   
2          3        Xiaomi Mi 11          Android                       154   
3          4      Google Pixel 5          Android                       239   
4          5           iPhone 12              iOS                       187   
..       ...                 ...              ...                       ...   
695      696           iPhone 12              iOS                        92   
696      697        Xiaomi Mi 11          Android                       316   
697      698      Google Pixel 5          Android                        99   
698      699  Samsung Galaxy S21          Android                        62   
699      700           OnePlus 9          Android                       212   

     Screen On Time (hours/day)  Battery Drain (mAh

In [45]:
# Preview the first 20 rows of the dataset.
print(df.head(n=20))

    User ID        Device Model Operating System  App Usage Time (min/day)  \
0         1      Google Pixel 5          Android                       393   
1         2           OnePlus 9          Android                       268   
2         3        Xiaomi Mi 11          Android                       154   
3         4      Google Pixel 5          Android                       239   
4         5           iPhone 12              iOS                       187   
5         6      Google Pixel 5          Android                        99   
6         7  Samsung Galaxy S21          Android                       350   
7         8           OnePlus 9          Android                       543   
8         9  Samsung Galaxy S21          Android                       340   
9        10           iPhone 12              iOS                       424   
10       11      Google Pixel 5          Android                        53   
11       12           OnePlus 9          Android                

In [46]:
# Subset of column names to work with: device, usage metrics and demographics.
selected_cols = [
    'Device Model',
    'App Usage Time (min/day)',
    'Screen On Time (hours/day)',
    'Number of Apps Installed',
    'Age',
    'Gender',
]

In [47]:
# Convert the selected columns to a NumPy array; text and numeric columns are
# mixed, so the elements are stored as generic Python objects.
numpy_array = df[selected_cols].to_numpy()

In [48]:
# Show the converted array; in the recorded output NumPy elides the middle rows with "...".
print(numpy_array)

[['Google Pixel 5' 393 6.4 67 40 'Male']
 ['OnePlus 9' 268 4.7 42 47 'Female']
 ['Xiaomi Mi 11' 154 4.0 32 42 'Male']
 ...
 ['Google Pixel 5' 99 3.1 22 50 'Female']
 ['Samsung Galaxy S21' 62 1.7 13 44 'Male']
 ['OnePlus 9' 212 5.4 49 23 'Female']]


In [49]:
# Summary statistics (mean, median, standard deviation) for the numeric usage columns.
cols_to_analyze = [
    'App Usage Time (min/day)',
    'Screen On Time (hours/day)',
    'Number of Apps Installed',
]
stats = df[cols_to_analyze].agg(func=['mean', 'median', 'std'])
print(stats)

        App Usage Time (min/day)  Screen On Time (hours/day)  \
mean                  271.128571                    5.272714   
median                227.500000                    4.900000   
std                   177.199484                    3.068584   

        Number of Apps Installed  
mean                   50.681429  
median                 49.000000  
std                    26.943324  


In [50]:
# NOTE(review): bare string literal. Evaluating it only echoes the column name
# back as the cell output; it does not read or modify any variable.
'App Usage Time (min/day)'

'App Usage Time (min/day)'

In [51]:
# Print the NumPy array again.
# NOTE(review): no cell shown between the array's creation and this point
# modifies numpy_array, so this output matches the earlier print.
print(numpy_array)

[['Google Pixel 5' 393 6.4 67 40 'Male']
 ['OnePlus 9' 268 4.7 42 47 'Female']
 ['Xiaomi Mi 11' 154 4.0 32 42 'Male']
 ...
 ['Google Pixel 5' 99 3.1 22 50 'Female']
 ['Samsung Galaxy S21' 62 1.7 13 44 'Male']
 ['OnePlus 9' 212 5.4 49 23 'Female']]


In [52]:
# Tally the rows for each device model, most frequent first.
device_model_counts = df['Device Model'].value_counts(sort=True, ascending=False)
print(device_model_counts)

Device Model
Xiaomi Mi 11          146
iPhone 12             146
Google Pixel 5        142
OnePlus 9             133
Samsung Galaxy S21    133
Name: count, dtype: int64


# 4. Data Cleaning with Pandas

#### Handle Missing Data using dropna() or fillna() depending on the nature of the missing data

In [53]:
# Count the missing entries in every column.
missing_data = df.isna().sum()
print(missing_data)


User ID                       0
Device Model                  0
Operating System              0
App Usage Time (min/day)      0
Screen On Time (hours/day)    0
Battery Drain (mAh/day)       0
Number of Apps Installed      0
Data Usage (MB/day)           0
Age                           0
Gender                        0
User Behavior Class           0
dtype: int64


#### There are no missing values in this DataFrame! 

#### Check for and remove duplicate entries using drop_duplicates(). 

In [54]:
# Flag every row that exactly repeats an earlier row, then report how many were flagged.
duplicates = df.duplicated(keep='first')
print(duplicates.sum())  # total number of duplicate rows


0


#### There are 0 duplicate rows! 

# 5. Data Filtering and Selection

### Filter the rows where a certain column's value is greater than a threshold (df[df['column']>value]). 

In [55]:
# Example: keep only the rows whose 'App Usage Time (min/day)' exceeds 300.
filtered_df = df.loc[df['App Usage Time (min/day)'].gt(300)]
print(filtered_df)


     User ID        Device Model Operating System  App Usage Time (min/day)  \
0          1      Google Pixel 5          Android                       393   
6          7  Samsung Galaxy S21          Android                       350   
7          8           OnePlus 9          Android                       543   
8          9  Samsung Galaxy S21          Android                       340   
9         10           iPhone 12              iOS                       424   
..       ...                 ...              ...                       ...   
689      690  Samsung Galaxy S21          Android                       541   
692      693        Xiaomi Mi 11          Android                       378   
693      694        Xiaomi Mi 11          Android                       505   
694      695  Samsung Galaxy S21          Android                       564   
696      697        Xiaomi Mi 11          Android                       316   

     Screen On Time (hours/day)  Battery Drain (mAh

### Use .loc[] and .iloc[] to select specific rows and columns

In [56]:
# Example: label-based selection of rows 0 through 3 (the end label is included)
# for the 'Device Model' and 'App Usage Time (min/day)' columns.
selected_data = df.loc[
    0:3,
    ['Device Model', 'App Usage Time (min/day)'],
]
print(selected_data)


     Device Model  App Usage Time (min/day)
0  Google Pixel 5                       393
1       OnePlus 9                       268
2    Xiaomi Mi 11                       154
3  Google Pixel 5                       239


In [57]:
# Example: position-based selection of the first 5 rows and the first 3 columns.
selected_data = df.iloc[:5, :3]  # rows 0-4, columns 0-2 (end positions are excluded)
print(selected_data)


   User ID    Device Model Operating System
0        1  Google Pixel 5          Android
1        2       OnePlus 9          Android
2        3    Xiaomi Mi 11          Android
3        4  Google Pixel 5          Android
4        5       iPhone 12              iOS


# 6. Sorting and Ranking

### Sort the dataset by specific columns using sort_values() 

In [58]:
# Sort the DataFrame by 'App Usage Time (min/day)' in ascending order.
# This is the only sort key: rows with equal usage time are not ordered by any
# second column, and the default sort algorithm does not guarantee that such
# ties keep their original relative order.
sorted_df = df.sort_values(by='App Usage Time (min/day)', ascending=True)
print(sorted_df)



     User ID        Device Model Operating System  App Usage Time (min/day)  \
355      356  Samsung Galaxy S21          Android                        30   
337      338  Samsung Galaxy S21          Android                        30   
244      245           OnePlus 9          Android                        30   
73        74        Xiaomi Mi 11          Android                        31   
163      164           iPhone 12              iOS                        32   
..       ...                 ...              ...                       ...   
654      655      Google Pixel 5          Android                       594   
166      167      Google Pixel 5          Android                       595   
184      185        Xiaomi Mi 11          Android                       597   
341      342           iPhone 12              iOS                       597   
367      368           OnePlus 9          Android                       598   

     Screen On Time (hours/day)  Battery Drain (mAh

#### I decided to sort the DataFrame by the App Usage column and put it in ascending order. 

### Rank columns using rank() to assign relative ranks to data entries within a column

### TODO: the ranking step with rank() has not been implemented yet.

# 7. Grouping and Aggregation

### Use groupby() to group data by a categorical column and perform an aggregate operation like sum(), mean(), count(). 

In [59]:
# Number of 'User ID' entries recorded for each operating system.
user_count = (
    df.groupby(by='Operating System')['User ID']
    .count()
)

print("User Count by Operating System:")
print(user_count)


User Count by Operating System:
Operating System
Android    554
iOS        146
Name: User ID, dtype: int64


#### This code will allow us to see a count of how many users are under each operating system. 

#### Explore various aggregations like min(),max(), mean(), std(). 

In [60]:
# Smallest 'Battery Drain (mAh/day)' value within each operating system group.
min_battery_drain = (
    df.groupby(by='Operating System')['Battery Drain (mAh/day)']
    .min()
)
print("Minimum Battery Drain:")
print(min_battery_drain)


Minimum Battery Drain:
Operating System
Android    302
iOS        308
Name: Battery Drain (mAh/day), dtype: int64


#### This results in the minimum battery drain for each operating system. 

In [61]:
# Largest 'App Usage Time (min/day)' value within each operating system group.
max_app_usage_time = (
    df.groupby(by='Operating System')['App Usage Time (min/day)']
    .max()
)
print("\nMaximum App Usage Time:")
print(max_app_usage_time)



Maximum App Usage Time:
Operating System
Android    598
iOS        597
Name: App Usage Time (min/day), dtype: int64


#### This is the maximum app usage time on each operating system

In [62]:
# Average 'Battery Drain (mAh/day)' within each operating system group.
mean_battery_drain = (
    df.groupby(by='Operating System')['Battery Drain (mAh/day)']
    .mean()
)
print("\nMean Battery Drain:")
print(mean_battery_drain)



Mean Battery Drain:
Operating System
Android    1508.198556
iOS        1589.513699
Name: Battery Drain (mAh/day), dtype: float64


#### This output shows the mean battery drain for each operating system.

In [63]:

# Spread (standard deviation) of 'App Usage Time (min/day)' within each operating system group.
std_app_usage_time = (
    df.groupby(by='Operating System')['App Usage Time (min/day)']
    .std()
)
print("\nStandard Deviation of App Usage Time:")
print(std_app_usage_time)



Standard Deviation of App Usage Time:
Operating System
Android    179.188678
iOS        169.592390
Name: App Usage Time (min/day), dtype: float64


#### This is the standard deviation of the app usage time for Android and for iOS.