In [1]:
# Read the raw survey file; character columns are converted to factors
data_path <- "snsdata.csv"
teens <- read.csv(file = data_path, stringsAsFactors = TRUE)

# Inspect column names, types, and the first few values of each column
str(teens)

'data.frame':	30000 obs. of  40 variables:
 $ gradyear    : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
 $ gender      : Factor w/ 2 levels "F","M": 2 1 2 1 NA 1 1 2 1 1 ...
 $ age         : num  19 18.8 18.3 18.9 19 ...
 $ friends     : int  7 0 69 0 10 142 72 17 52 39 ...
 $ basketball  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ football    : int  0 1 1 0 0 0 0 0 0 0 ...
 $ soccer      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ softball    : int  0 0 0 0 0 0 0 1 0 0 ...
 $ volleyball  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ swimming    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ cheerleading: int  0 0 0 0 0 0 0 0 0 0 ...
 $ baseball    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ tennis      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ sports      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ cute        : int  0 1 0 1 0 0 0 0 0 1 ...
 $ sex         : int  0 0 0 0 1 1 0 2 0 0 ...
 $ sexy        : int  0 0 0 0 0 0 0 1 0 0 ...
 $ hot         : int  0 0 0 0 0 0 0 0 0 1 ...
 $ kissed      : int  0 0 0 0 5 0 0 0 0 0 ...
 $ dance       : int

In [2]:
# Keep only ages in [13, 20); anything under 13, or 20 and over, becomes NA
# (ages that are already NA stay NA)
age_in_range <- teens$age >= 13 & teens$age < 20
teens$age <- ifelse(age_in_range, teens$age, NA)

# Inspect the cleaned distribution, including the NA count
summary(teens$age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  13.03   16.30   17.27   17.25   18.22   20.00    5523 

In [3]:
# Dummy variable flagging rows whose gender is missing (1 = missing, 0 = known)
teens$no_gender <- as.numeric(is.na(teens$gender))

# Sanity check: counts per value, showing NAs if any slipped through
table(teens$no_gender, useNA = "ifany")

# Dummy variable for females; %in% yields FALSE (never NA) for a missing
# gender, so those rows land in the "0" class
teens$female <- as.numeric(teens$gender %in% "F")

# Sanity check: counts per value, showing NAs if any slipped through
table(teens$female, useNA = "ifany")


    0     1 
27276  2724 


    0     1 
 7946 22054 

In [4]:
# Naive mean over all students: missing ages are kept, so any NA propagates
mean(teens$age, na.rm = FALSE)

# Mean over the students whose age is known (NAs dropped first)
mean(teens$age, na.rm = TRUE)

In [5]:
# Helper: mean of a vector that skips missing values
mean_without_na <- function(x) mean(x, na.rm = TRUE)

# For every row, the NA-free mean age of that row's "gradyear" group
ave_age <- ave(teens$age, teens$gradyear, FUN = mean_without_na)

# Check the distribution: one distinct average per graduation year,
# each repeated 7500 times
table(ave_age)

ave_age
15.8195733445096 16.7677007371007 17.7061723749799 18.6558579508727 
            7500             7500             7500             7500 

In [6]:
# Fill each missing age with the average age of the same graduation year;
# known ages are left untouched
age_is_missing <- is.na(teens$age)
teens$age[age_is_missing] <- ave_age[age_is_missing]

# Verify the imputation: the summary should no longer report any NA's
summary(teens$age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  13.03   16.28   17.24   17.24   18.21   20.00 

In [7]:
# The "interest" features occupy columns 5 through 40 of "teens"
interest_cols <- 5:40
interests <- teens[, interest_cols]

# Optional inspection: uncomment to print a summary of every interest column
# summary(interests)

In [8]:
# z-score every interest column with "scale" (mean 0, standard deviation 1),
# then reassemble the scaled columns into a data frame
scaled_columns <- lapply(interests, scale)
interests_z <- as.data.frame(scaled_columns)

# Compare the first feature before and after standardization
summary(interests$basketball)
summary(interests_z$basketball)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.2673  0.0000 24.0000 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-0.3322 -0.3322 -0.3322  0.0000 -0.3322 29.4923 

In [9]:
# Attach "stats", which provides kmeans()
library(stats)

# Fix the random number generator state so the clustering is reproducible
set.seed(2345)

# Fit k-means on the standardized interests with five clusters
n_clusters <- 5
teen_clusters <- kmeans(x = interests_z, centers = n_clusters)

# Inspect the components of the fitted object
str(teen_clusters)

List of 9
 $ cluster     : int [1:30000] 5 3 5 5 1 5 3 5 5 3 ...
 $ centers     : num [1:5, 1:36] 0.36216 -0.09443 0.00398 1.37233 -0.18682 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:5] "1" "2" "3" "4" ...
  .. ..$ : chr [1:36] "basketball" "football" "soccer" "softball" ...
 $ totss       : num 1079964
 $ withinss    : num [1:5] 184943 36141 298540 161486 265012
 $ tot.withinss: num 946123
 $ betweenss   : num 133841
 $ size        : int [1:5] 1038 601 4066 2696 21599
 $ iter        : int 7
 $ ifault      : int 0
 - attr(*, "class")= chr "kmeans"


In [10]:
# Number of individuals assigned to each cluster
teen_clusters[["size"]]

In [11]:
# Cluster centers, transposed so features are rows and clusters are columns
t(teen_clusters[["centers"]])

Unnamed: 0,1,2,3,4,5
basketball,0.36216073,-0.094426312,0.003980104,1.372334818,-0.18682209
football,0.37985213,0.066917685,0.095240622,1.19570343,-0.18729427
soccer,0.13734997,-0.099560092,0.053421088,0.556210971,-0.08331351
softball,0.12721074,-0.037972504,-0.049686403,1.130452734,-0.13680721
volleyball,0.09247518,-0.072862022,-0.014596479,1.071772105,-0.13344819
swimming,0.26180286,0.045784007,0.329449341,0.0851321,-0.08650052
cheerleading,0.21599455,-0.107037005,0.514245096,0.040036702,-0.10920564
baseball,0.25312305,-0.111829408,-0.049336281,1.092797366,-0.13616893
tennis,0.11991682,0.040273354,0.06703386,0.138871838,-0.03683671
sports,0.77040675,-0.106386126,-0.054350928,1.083160966,-0.15903307


In [12]:
# Attach each individual's cluster assignment to "teens" as column "cluster"
teens[["cluster"]] <- teen_clusters[["cluster"]]

In [13]:
# Show cluster and personal information for the first 10 individuals
profile_cols <- c("cluster", "gender", "age", "friends")
teens[seq_len(10), profile_cols]

Unnamed: 0_level_0,cluster,gender,age,friends
Unnamed: 0_level_1,<int>,<fct>,<dbl>,<int>
1,5,M,18.982,7
2,3,F,18.801,0
3,5,M,18.335,69
4,5,F,18.875,0
5,1,,18.995,10
6,5,F,18.65586,142
7,3,F,18.93,72
8,5,M,18.322,17
9,5,F,19.055,52
10,3,F,18.708,39


In [14]:
# Average age within each cluster
aggregate(age ~ cluster, data = teens, FUN = mean)

cluster,age
<int>,<dbl>
1,17.09319
2,17.38488
3,17.03773
4,17.03759
5,17.30265


In [15]:
# Overall gender split in percent (one decimal), with missing values counted
gender_counts <- table(teens$gender, useNA = "ifany")
gender_share <- proportions(gender_counts)
round(100 * gender_share, digits = 1)


   F    M <NA> 
73.5 17.4  9.1 

In [16]:
# Per-cluster share of females
aggregate(female ~ cluster, data = teens, FUN = mean)
# Per-cluster share of males: whoever is neither female nor missing a gender
aggregate((1 - female - no_gender) ~ cluster, data = teens, FUN = mean)
# Per-cluster share of individuals with missing gender
aggregate(no_gender ~ cluster, data = teens, FUN = mean)

cluster,female
<int>,<dbl>
1,0.8025048
2,0.7237937
3,0.8866208
4,0.6984421
5,0.7082735


cluster,(1 - female - no_gender)
<int>,<dbl>
1,0.13872832
2,0.21797005
3,0.04254796
4,0.25
5,0.1898236


cluster,no_gender
<int>,<dbl>
1,0.05876686
2,0.05823627
3,0.07083128
4,0.05155786
5,0.10190287


In [17]:
# Average number of friends within each cluster
aggregate(friends ~ cluster, data = teens, FUN = mean)

cluster,friends
<int>,<dbl>
1,30.6657
2,32.79368
3,38.54575
4,35.91728
5,27.79221
