### initial analysis for species confirmation and identification of bad samples

In [1]:
# ---- input ----
# VCF file to analyse
vcffile <- "/home/megan/Desktop/emel_lb1234/new/batch_63.recode.vcf"

# ---- "databases" ----
# CSV tables describing the samples and the accessions they belong to
samplefile <- "/home/megan/megan/research/eucalyptus/eucalyptus_data/Emelliodora_PlantsSamples.csv"
accessionfile <- "/home/megan/megan/research/eucalyptus/eucalyptus_data/Emelliodora_Accessions.csv"

# ---- VCF layout ----
# number of "##" header lines that precede the column header row
# (count them on the command line, e.g. grep -c "^##" batch_1.vcf)
skipr <- 8
# number of leading columns that do not hold sample genotypes
skipc <- 9

# ---- missing-data thresholds, in percent ----
# a higher number allows more missing data
thresh.missing.loci <- 90     # ...and therefore keeps more loci
thresh.missing.samples <- 90  # ...and therefore keeps more samples

In [2]:
library(adegenet)
library(ape)

Loading required package: ade4

   /// adegenet 2.0.1 is loaded ////////////

   > overview: '?adegenet'
   > tutorials/doc/questions: 'adegenetWeb()' 
   > bug reports/feature requests: adegenetIssues()




In [3]:
#read in vcf, skipping the "##" header rows
#(use the configured skipr rather than a hard-coded 8 so the two cannot drift apart)
vcf <- read.delim(vcffile, sep="\t", header=TRUE, skip=skipr)
#remove the leading non-sample columns
#(a logical mask is used so that skipc = 0 keeps every column instead of dropping one)
stopifnot(ncol(vcf) > skipc)
vcf <- vcf[, seq_len(ncol(vcf)) > skipc, drop=FALSE]
#get just genotypes: the first three characters of every field (e.g. "0/1")
#substr() keeps the matrix dimensions, so a VCF with a single locus stays a matrix
genos <- substr(as.matrix(vcf), 1, 3)
#transpose so that samples are rows and loci are columns
genos <- t(genos)
#clean sample names: keep characters 9 to 14 of each VCF column name
#NOTE(review): assumes the sample ID sits at that fixed position in every name - confirm
row.names(genos) <- substr(row.names(genos), 9, 14)
#genos[1:5,1:10]

In [37]:
#add sample metadata
sampleinfo <- read.csv(samplefile, header=TRUE)  #sample information table
#row of the genotype matrix each catalogued sample corresponds to (NA if not in this analysis)
samplematches <- match(sampleinfo$SampleID, row.names(genos))
#keep only the samples in this analysis...
samples <- sampleinfo[!is.na(samplematches), ]
#...and sort them by their position in the genotype matrix
samples <- samples[order(na.omit(samplematches)), ]

#incorporate latitude and longitude from accession
accessioninfo <- read.csv(accessionfile, header=TRUE)  #accession information table
accessionmatches <- match(samples$AccessionID, accessioninfo$AccessionID)
#append the coordinates as two new columns named Latitude and Longitude
samples <- cbind(samples,
                 Latitude=accessioninfo$Latitude[accessionmatches],
                 Longitude=accessioninfo$Longitude[accessionmatches])



Unnamed: 0,PlantID,PlantName,AccessionID,AccessionName,ExperimentID,ExperimentName,ExperimentCondition,GrowthLocation,TrayPosition,SampleID,SampleName,SampleType,PlateName,PlateCoordinate,NameInWell,TubeBarcode,Note,PopulationName,latitude,longitude,Latitude,Longitude
2,P35001,LB-U-1,A25001,LB-U-1,none,none,nature,wild,none,S45001,U-1,leaf,Emelliodora_LB_Block1,A1,U1,none,,LB-U,-36.63267,144.3502,-36.63267,144.3502
3,P35002,LB-U-2,A25002,LB-U-2,none,none,nature,wild,none,S45002,U-2,leaf,Emelliodora_LB_Block1,B1,U2,none,,LB-U,-36.63267,144.3502,-36.63267,144.3502
4,P35003,LB-U-3,A25003,LB-U-3,none,none,nature,wild,none,S45003,U-3,leaf,Emelliodora_LB_Block1,C1,U3,none,,LB-U,-36.63267,144.3502,-36.63267,144.3502
5,P35004,LB-U-4,A25004,LB-U-4,none,none,nature,wild,none,S45004,U-4,leaf,Emelliodora_LB_Block1,D1,U4,none,,LB-U,-36.63267,144.3502,-36.63267,144.3502
6,P35005,LB-U-5,A25005,LB-U-5,none,none,nature,wild,none,S45005,U-5,leaf,Emelliodora_LB_Block1,E1,U5,none,,LB-U,-36.63267,144.3502,-36.63267,144.3502
7,P35006,LB-U-6,A25006,LB-U-6,none,none,nature,wild,none,S45006,U-6,leaf,Emelliodora_LB_Block1,F1,U6,none,,LB-U,-36.63267,144.3502,-36.63267,144.3502
8,P35007,LB-U-7,A25007,LB-U-7,none,none,nature,wild,none,S45007,U-7,leaf,Emelliodora_LB_Block1,G1,U7,none,,LB-U,-36.63267,144.3502,-36.63267,144.3502
10,P35009,LB-U-9,A25009,LB-U-9,none,none,nature,wild,none,S45009,U-9,leaf,Emelliodora_LB_Block1,A2,U9,none,,LB-U,-36.63267,144.3502,-36.63267,144.3502
11,P35010,LB-U-10,A25010,LB-U-10,none,none,nature,wild,none,S45010,U-10,leaf,Emelliodora_LB_Block1,B2,U10,none,,LB-U,-36.63267,144.3502,-36.63267,144.3502
12,P35011,LB-P-1,A25011,LB-P-1,none,none,nature,wild,none,S45011,P-1,leaf,Emelliodora_LB_Block1,C2,P1,none,,LB-P,-35.81963,145.3119,-35.81963,145.3119


In [None]:
#size of the unfiltered genotype matrix (samples x loci)
dim(genos)

#identify missing data: TRUE wherever the genotype call is "./."
missing.filt <- genos == "./."

#identify bad loci
missPerLocus.filt <- colSums(missing.filt)   #count missing data per locus
#keep loci that are missing in fewer than thresh.missing.loci percent of the samples
locus.filt <- which(missPerLocus.filt < thresh.missing.loci / 100 * nrow(genos))

#identify bad samples
missPerSample.filt <- rowSums(missing.filt)  #count missing data per sample
#keep samples that are missing fewer than thresh.missing.samples percent of the loci
ind.filt <- which(missPerSample.filt < thresh.missing.samples / 100 * ncol(genos))

#filter; drop=FALSE keeps a matrix even when only one sample or one locus survives
genos.filt <- genos[ind.filt, locus.filt, drop=FALSE]
dim(genos.filt)

#list the samples that were removed
#(>= makes this the exact complement of ind.filt; with > a sample sitting exactly
# on the threshold was dropped but never reported)
which(missPerSample.filt >= thresh.missing.samples / 100 * ncol(genos))

In [None]:
#put in matrix format: recode each genotype call as a number
#  1/1 -> 0, 0/1 or 1/0 -> 1, 0/0 -> 2, ./. -> NA
#The result is a numeric matrix with real NA values. Writing the string "NA" into a
#character matrix leaves the numeric conversion to whatever consumes it later.
#The is.character() guard makes re-running this cell harmless once the matrix is numeric.
if (is.character(genos.filt)) {
    dosage.codes <- c("./." = NA, "1/1" = 0, "0/1" = 1, "1/0" = 1, "0/0" = 2)
    #report calls that are not in the table (e.g. other alleles) instead of
    #silently carrying them along; they become NA below
    unrecognised <- !(genos.filt %in% names(dosage.codes))
    if (any(unrecognised)) {
        warning(sum(unrecognised), " genotype calls were not recognised and were set to NA")
    }
    #look every call up by name, then restore the matrix shape and the sample/locus names
    genos.filt <- matrix(unname(dosage.codes[genos.filt]),
                         nrow=nrow(genos.filt),
                         dimnames=dimnames(genos.filt))
}

In [None]:
#calculate PCA (classical multidimensional scaling of the between-sample distances)
geno.dist <- dist(genos.filt)  #generate distance matrix
sum(is.na(geno.dist))  #count NAs in it
#cmdscale() only accepts k up to (number of samples - 1), so cap the 20 requested
#axes; a fixed 20 stops with an error when there are 20 or fewer samples
pca <- cmdscale(geno.dist, min(20, attr(geno.dist, "Size") - 1), eig=TRUE) #PCA

In [None]:
#plot PCA
#uses the pca object computed in the previous cell; the identical cmdscale() call
#that used to be repeated here has been removed
#pdf(file="pca.pdf")
#share of the summed eigenvalues carried by each axis, in percent
per_expl <- round(pca$eig/sum(pca$eig)*100, digits=1)
plot(pca$points[,1:2], pch=".",
    #xlim=c(-20,-100),ylim=c(8,18),                #change axes to zoom to get names
    xlab=paste("PCA axis 1 (",per_expl[1],"%)"),
    ylab=paste("PCA axis 2 (",per_expl[2],"%)"))
#label every point with its sample name
text(pca$points[,1:2], labels=rownames(pca$points), cex = .6)
#dev.off()

In [None]:
##plot other PCA dimensions
#par(mfrow=c(3,3), mar=c(4,4,2,1))
#plot(pca$points[,c(1,2)], pch="")
#text(pca$points[,c(1,2)],label=rownames(pca$points), cex = .6)
#plot(pca$points[,c(1,3)], pch="")
#text(pca$points[,c(1,3)], label=rownames(pca$points), cex = .6)
#plot(pca$points[,c(1,4)], pch="")
#text(pca$points[,c(1,4)], label=rownames(pca$points), cex = .6)
#plot(pca$points[,c(2,1)], pch="")
#text(pca$points[,c(2,1)], label=rownames(pca$points), cex = .6)
#plot(pca$points[,c(2,3)], pch="")
#text(pca$points[,c(2,3)], label=rownames(pca$points), cex = .6)
#plot(pca$points[,c(2,4)], pch="")
#text(pca$points[,c(2,4)], label=rownames(pca$points), cex = .6)
#plot(pca$points[,c(3,1)], pch="")
#text(pca$points[,c(3,1)], label=rownames(pca$points), cex = .6)
#plot(pca$points[,c(3,2)], pch="")
#text(pca$points[,c(3,2)], label=rownames(pca$points), cex = .6)
#plot(pca$points[,c(3,4)], pch="")
#text(pca$points[,c(3,4)], label=rownames(pca$points), cex = .6)

##plot dendrogram
#hc <- hclust(geno.dist)
#plot(hc, cex=.5)

##determine outliers
#assigns=cutree(hc, k=3)
#outliers2=subset(assigns, assigns==2)
#outliers3=subset(assigns, assigns==3)
#emels=subset(assigns, assigns==1)
#length(outliers2)
#outliers2
#length(outliers3)
#outliers3
#length(emels)
#assigns