-
Notifications
You must be signed in to change notification settings - Fork 0
/
pedFAM.R
121 lines (97 loc) · 4.09 KB
/
pedFAM.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env RScript
#' 1. This script will generate the FAM for fastFAM using provided pedigree information
#' (like the ukb_inferredRelationship.txt by UKB cohort).
#' 2. Two input files are needed:
#' a. A .fam file from the PLINK format genotype. It contains the individuals' FIDs and IIDs and some other information.
#' see (https://www.cog-genomics.org/plink/1.9/formats#fam) for details.
#' b. A inferredRelationship file. We will use the UKB provided one.
#' It should contains: "IID1", "IID2", and a "relationship" columns. The "relationship" column should contain
#' the following elements (these are used by UK Biobank to indicate the inferred relationship between
#' two individuals, see https://www.biorxiv.org/content/early/2017/07/20/166298) :
#' "MZtwin",
#' "parentOffspring",
#' "fullSib",
#' "second",
#' "third"
#'
#' Author: Longda Jiang, longda.jiang@uq.edu.au
#' Date: June 28, 2018
#' Version: 1.0.1
#' Modified by Gibran Hemani to accept King output (2024-02-07)
args <- commandArgs(trailingOnly = TRUE)
if(length(args) != 3){
message("Input: path of fam file")
message("Input: path of pedigree information")
message("Output: path of output without extension")
stop("The FAM from pedigree needs 3 parameters")
}
path_to_fam <- as.character(args[1])
path_to_rel <- as.character(args[2])
output_file <- as.character(args[3])
message("Transform the pedigree information...")
message("From: ", path_to_rel)
message("To: ", output_file)
message("Align order: ", path_to_fam)
if(!file.exists(path_to_fam)){
stop("FAM file didn't exist")
}
if(!file.exists(path_to_rel)){
stop("Relationship file didn't exist")
}
###############
# Load the input data
###############
# the .fam file
fam_file <- read.table(file=path_to_fam, stringsAsFactors = F, head = F )
fam_file <- fam_file[, 1:2]
names(fam_file) <- c("FID", "IID")
fam_file$order <- 1:nrow(fam_file)
rel <- read.table(file=path_to_rel, head=F, stringsAsFactors = F)
# the rel file we only need the 1, 2, and last columns:
rel <- rel[, c(1, 2, ncol(rel))]
names(rel) <- c("IID_1", "IID_2", "relationship")
# check relation labels
relation_labels = table(rel$relationship)
print(relation_labels)
setting_labels = c("Dup/MZ", "PO", "FS", "2nd", "3rd")
rel <- subset(rel, relationship %in% setting_labels)
rel2 <- rel[,c(2,1,3)]
names(rel2) <- c("IID_1", "IID_2", "relationship")
rel <- rbind(rel, rel2)
###############
# format the data
###############
rel <- merge(rel, fam_file[,c("IID", "order")], by.x="IID_1", by.y="IID")
rel <- merge(rel, fam_file[,c("IID", "order")], by.x="IID_2", by.y="IID")
rel <- rel[, c("IID_1", "IID_2", "relationship", "order.x", "order.y")]
# The order of IID_1 is important:
# The order of IID_1 should always be larger than the order of IID_2.
# (as the sparse FAM is saved as a lower triangular matrix)
rel <- rel[order(rel$order.x, rel$order.y), ]
sum(rel$order.x >= rel$order.y)
rel <- rel[rel$order.x > rel$order.y, ]
###############
# generate the FAM
###############
rel$coef <- 0
rel[rel$relationship == "Dup/MZ", "coef"] <- 1
rel[rel$relationship == "PO", "coef"] <- 0.5
rel[rel$relationship == "FS", "coef"] <- 0.5
rel[rel$relationship == "2nd", "coef"] <- 0.25
rel[rel$relationship == "3rd", "coef"] <- 0.125
# the diagonal elements of FAM is simply 1
rel_part2 <- fam_file[,c("IID", "IID")]
rel_part2$order.x <- 1:nrow(rel_part2)
rel_part2$order.y <- 1:nrow(rel_part2)
names(rel_part2) <- c("IID_1", "IID_2", "order.x", "order.y")
rel_part2$coef <- 1
####
FAM_sp <- rbind(rel[, c("IID_1", "IID_2", "order.x", "order.y", "coef")], rel_part2)
FAM_sp <- FAM_sp[order(FAM_sp$order.x, FAM_sp$order.y), ]
### the actual order starts from 0. need to minus 1.
FAM_sp$order.x <- FAM_sp$order.x - 1
FAM_sp$order.y <- FAM_sp$order.y - 1
write.table(fam_file, file=paste0(output_file, ".grm.id"), quote=F, row.names = F, col.names = F)
write.table(FAM_sp[,c("order.x", "order.y", "coef")], file=paste0(output_file, ".grm.sp"), quote=F, row.names = F, col.names = F)
message("Sparse FAM generated: ", output_file, ".")
#### Done