In [None]:
# Input VCF read by the R and bash cells of this notebook.
vcffile = "final.recode.vcf"
# Number of lines skipped before the "#CHROM" column-header line, i.e. the
# count of "##" meta lines at the top of the VCF. It must match the file named
# in vcffile; recount on the command line with: grep -c "^##" final.recode.vcf
skipr = 5211
# Number of leading non-sample columns dropped from the VCF table
# (the fixed VCF columns CHROM through FORMAT).
skipc = 9

# Sample "database" CSV files.
samplefile = "/home/megan/megan/research/eucalyptus/eucalyptus_data/Emelliodora_PlantsSamples.csv"
accessionfile = "/home/megan/megan/research/eucalyptus/eucalyptus_data/Emelliodora_Accessions.csv"

In [None]:
%load_ext rpy2.ipython

# check for outliers

In [None]:
%%R -i vcffile,skipr,skipc -o genos.gi,samplenames

library(adegenet)

# Read the VCF body as a plain tab-separated table. The first `skipr` lines
# (the "##" meta lines) are skipped so the "#CHROM ..." line is the header.
# check.names = FALSE keeps the sample IDs exactly as written in the VCF, so
# the labels shown in the outlier plot are the same IDs that must be given to
# `vcftools --remove-indv`.
vcf <- read.delim(vcffile, sep = "\t", header = TRUE, skip = skipr,
                  check.names = FALSE)

# Drop the first `skipc` non-sample columns. A logical mask is used because a
# negative index built from skipc would misbehave when skipc is 0.
vcf <- vcf[seq_len(ncol(vcf)) > skipc]

# Keep only the genotype call (first three characters, e.g. "0/1") of every
# sample field, then transpose to samples (rows) x loci (columns).
# substr() keeps the matrix dimensions, so this also works for a single locus.
genos <- t(substr(as.matrix(vcf), 1, 3))
#row.names(genos)=substr(row.names(genos),9,14)  #get just sample ID
samplenames <- rownames(genos)

# Sanity checks: a corner of the genotype matrix and its dimensions
# (number of samples, number of loci). print() is explicit because only the
# last expression of an %%R cell is auto-printed.
print(genos[seq_len(min(5, nrow(genos))),
            seq_len(min(10, ncol(genos))),
            drop = FALSE])
print(dim(genos))

# Format for input into a genind object: missing calls become NA.
genos[genos == "./."] <- NA
# Convert to a genind object; alleles within a genotype are separated by "/".
genos.gi <- df2genind(genos, sep = "/", NA.char = NA)
genos.gi

In [None]:
%%R -i genos.gi,samplenames

# Preliminary PCoA of genetic distance (to find additional outliers).
# Samples are drawn as text labels so outliers can be identified by name.
genos.dist <- dist(genos.gi)
pcoa.genos <- dudi.pco(genos.dist, scannf = FALSE, nf = 3)

# Percentage of the summed eigenvalues carried by each axis, for axis labels.
percent_var <- round(100 * pcoa.genos$eig / sum(pcoa.genos$eig), 1)

#pdf("emellb_pca_280.pdf")
# type = "n" sets up the axes without drawing points; text() adds the labels.
plot(pcoa.genos$li[, 1:2], type = "n",
     xlab = paste("PCOA axis 1 (", percent_var[1], "%)"),
     ylab = paste("PCOA axis 2 (", percent_var[2], "%)"))
  #  xlim=c(-10,10),ylim=c(-15,10))
text(pcoa.genos$li[, 1:2], labels = samplenames)
#     label=samples$SampleName, cex = .6) #add names
#dev.off()

# summarize data (with outliers removed)

In [None]:
%%script env vcffile="$vcffile" bash

# Filter the input VCF (drop outlier samples, bad loci, high-missingness and
# fixed sites), write vcftools summary tables (out.*) for the filtered file,
# then rename the filtered file to final.vcf.

# Stop at the first failing command so that a failed filtering step is not
# followed by statistics and a rename based on stale or missing files.
set -e

### PUT SAMPLE ID's HERE FOR OUTLIERS FOR REMOVAL ###
remove_ids=(S45093 S45095 S45193 S45194 S45195)

### PUT LOCI ID'S HERE FOR REMOVAL ###
# bad.loci lists loci to remove, one tab-separated "chromosome position" pair
# per line (the format described in the vcftools manual). The leading "#" line
# is a comment for vcftools and doubles as a header row.
printf '#CHROM\tPOS\n' > bad.loci
printf 'Chr08\t1031291\n' >> bad.loci

# Build one "--remove-indv ID" pair per sample; an array keeps each ID a
# single argument and works when the list is empty.
remove_args=()
for id in "${remove_ids[@]}"; do
    remove_args+=(--remove-indv "$id")
done

# --max-missing 0.4 : keep sites genotyped in at least 40% of individuals
#                     (the value is 1 - the allowed missing proportion)
# --exclude-positions bad.loci : drop the loci listed above
# --maf .000001 : removes fixed loci
vcftools --vcf "$vcffile" --recode --out final.core "${remove_args[@]}" \
    --max-missing 0.4 --exclude-positions bad.loci --maf .000001

# Per-individual and per-site summaries; each call writes an out.* file.
core=final.core.recode.vcf
for stat in --depth --site-depth --site-mean-depth --missing-indv \
            --missing-site --het --hardy --freq; do
    vcftools --vcf "$core" "$stat"
done

# Generate the maf file: the frequency part of column 6 ("allele:freq") of
# out.frq, one value per locus. NR > 1 skips the header line.
awk 'NR > 1 { split($6, parts, ":"); print parts[2] }' out.frq > out.maf

mv "$core" final.vcf

In [None]:
%%R

# Summary plots and tables built from the vcftools "out.*" files in the
# working directory (depth, missingness, heterozygosity, allele frequency,
# HWE), followed by a table of genotype-class counts per locus.

# Split "a/b/c" count strings (hom1/het/hom2) into a numeric data frame with
# columns homo1, hets, homo2.
split_counts <- function(x) {
  parts <- do.call(rbind, strsplit(as.character(x), "/", fixed = TRUE))
  storage.mode(parts) <- "numeric"
  counts <- as.data.frame(parts)
  colnames(counts) <- c("homo1", "hets", "homo2")
  counts
}

#look at samples
idepth <- read.delim("out.idepth")
hist(idepth$MEAN_DEPTH, main = "mean locus depth per individual",
     xlab = "mean depth")
imiss <- read.delim("out.imiss")
hist(imiss$F_MISS, main = "missingness per individual",
     xlab = "proportion missing")
# Pairs rows of out.imiss and out.idepth by position, so both files must list
# the individuals in the same order.
plot(1 - imiss$F_MISS, idepth$MEAN_DEPTH, main = "by individual",
     xlab = "proportion loci genotyped", ylab = "mean depth per locus")

#sample heterozygosity
heteroz <- read.delim("out.het")
hist(heteroz$F, main = "sample inbreeding coefficient (F)", xlab = "F")

#look at loci
ldepthtot <- read.delim("out.ldepth")
ldepth <- read.delim("out.ldepth.mean")
hist(ldepth$MEAN_DEPTH, main = "mean individual depth per locus",
     xlab = "mean depth")
lmiss <- read.delim("out.lmiss")
hist(lmiss$F_MISS, main = "missingness per locus", xlab = "proportion missing")
plot(1 - lmiss$F_MISS, ldepthtot$SUM_DEPTH, main = "by locus",
     xlab = "proportion individuals genotyped", ylab = "total locus depth")

#maf
maf <- scan("out.maf")
hist(maf, breaks = 100, main = "alternate allele frequency",
     xlab = "allele frequency")

#heterozygosity, hwe
hwe <- read.delim("out.hwe")
# maf and hwe are paired by position in the plot below; fail with a clear
# message if the two files do not describe the same number of loci.
if (length(maf) != nrow(hwe)) {
  stop("out.maf has ", length(maf), " values but out.hwe has ", nrow(hwe),
       " loci; the two files cannot be plotted against each other",
       call. = FALSE)
}
plot(maf, hwe$P_HWE, xlab = "alternate allele frequency",
     ylab = "HWE (p-value)")
hist(hwe$P_HWE, main = "HWE", xlab = "p-value")
hist(hwe$P_HET_DEFICIT, main = "heterozygote deficit", xlab = "p-value")
hist(hwe$P_HET_EXCESS, main = "heterozygote excess", xlab = "p-value")

# Observed genotype counts per locus and their proportions.
obs_hwe <- data.frame(CHR = hwe$CHR, POS = hwe$POS,
                      split_counts(hwe$OBS.HOM1.HET.HOM2.))
n_genotyped <- obs_hwe$homo1 + obs_hwe$hets + obs_hwe$homo2
obs_hwe$prop_homo1 <- obs_hwe$homo1 / n_genotyped
obs_hwe$prop_hets <- obs_hwe$hets / n_genotyped
obs_hwe$prop_homo2 <- obs_hwe$homo2 / n_genotyped
hist(obs_hwe$prop_homo1,
     main = "proportion of homozygous for reference allele",
     xlab = "proportion")
hist(obs_hwe$prop_hets, main = "proportion of heterozygotes",
     xlab = "proportion")
hist(obs_hwe$prop_homo2,
     main = "proportion of homozygous for alternate allele",
     xlab = "proportion")

# Expected genotype counts and the per-locus F = (expected - observed
# heterozygotes) / expected heterozygotes.
exp_hwe <- split_counts(hwe$E.HOM1.HET.HOM2.)
f <- (exp_hwe$hets - obs_hwe$hets) / exp_hwe$hets
hist(f, main = "locus F ((exp-obs)/exp)")

#matrix of counts of homozygotes (ref/alt) and heterozygotes
# Each cell is the number of loci whose observed count for that genotype
# class is 0, is 1, is <= 2, or is > 2.
geno_counts <- as.matrix(obs_hwe[, c("homo1", "hets", "homo2")])
homohet_sum <- rbind(
  "none" = colSums(geno_counts == 0),
  "one" = colSums(geno_counts == 1),
  "two or less" = colSums(geno_counts <= 2),
  "more than two" = colSums(geno_counts > 2)
)
colnames(homohet_sum) <- c("homo_ref", "het", "homo_alt")
print(homohet_sum)