In [3]:
import random
import numpy as np

In [4]:
random.seed(42)
def subsample_sequences(alignment_file, n):
    """Draw ``n`` distinct (header, sequence) records at random from an A3M file.

    The file is parsed record by record: a line starting with ``>`` opens a
    record and the following non-header line(s) form its sequence. Lines that
    start with ``#`` (e.g. a leading ``#<length>\\t<count>`` line), blank lines
    and anything that precedes the first ``>`` header are ignored, so the
    header/sequence pairing cannot be shifted by a missing main header or by a
    stray empty line. A header that has no sequence is dropped.

    Parameters
    ----------
    alignment_file : str
        Path of the alignment file to read.
    n : int
        Number of records to draw (sampling is without replacement).

    Returns
    -------
    list of tuple
        ``n`` ``(header, sequence)`` tuples in the order chosen by
        ``random.sample``.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when ``n`` is negative or larger
        than the number of records in the file.

    Notes
    -----
    Sampling uses the global ``random`` generator, so the result depends on
    how that generator was seeded and on every draw made since then.
    """
    records = []
    current_header = None
    sequence_parts = []
    with open(alignment_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if line.startswith('>'):
                # Close the previous record before opening a new one.
                if current_header is not None and sequence_parts:
                    records.append((current_header, ''.join(sequence_parts)))
                current_header = line
                sequence_parts = []
            elif current_header is not None:
                sequence_parts.append(line)
    if current_header is not None and sequence_parts:
        records.append((current_header, ''.join(sequence_parts)))

    # Choose n random records without replacement.
    return random.sample(records, n)

In [23]:
def write_subsampled_alignment(subsampled_sequences, output_file,
                               query_header='>101',
                               query_sequence='KVFERCELARTLKRLGMDGYRGISLANWMCLAKWESGYNTRATNYNAGDRSTDYGIFQINSRYWCNDGKTPGAVNACHLSCSALLQDNIADAVACAKRVVRDPQGIRAWIAWRNRCQNRDVRQYVQGCGV',
                               main_header=None):
    """Write a query record followed by the subsampled records to an A3M file.

    The output starts with a main header line, then the query header and the
    query sequence, then one header line and one sequence line per entry of
    ``subsampled_sequences``.

    Parameters
    ----------
    subsampled_sequences : iterable of (str, str)
        ``(header, sequence)`` pairs to append after the query record.
    output_file : str
        Path of the file to create (an existing file is overwritten).
    query_header : str, optional
        Header line of the query record. Defaults to ``'>101'``.
    query_sequence : str, optional
        Sequence of the query record. The default is the 130-residue sequence
        this notebook originally hard-coded; pass the matching query when
        writing an alignment for a different protein.
    main_header : str or None, optional
        First line of the file. When ``None`` it is built as
        ``'#<len(query_sequence)>\\t1'``, which yields ``'#130\\t1'`` for the
        default query.

    Returns
    -------
    None
    """
    if main_header is None:
        # Derive the leading '#<length>\t1' line from the query instead of
        # hard-coding it, so the two can never disagree.
        main_header = '#{}\t1'.format(len(query_sequence))

    with open(output_file, 'w') as f:
        # Main header and query record come first.
        f.write(main_header + '\n')
        f.write(query_header + '\n')
        f.write(query_sequence + '\n')
        # Then every subsampled header/sequence pair, one line each.
        for header, sequence in subsampled_sequences:
            f.write(header + '\n')
            f.write(sequence + '\n')


In [24]:
# Draw 64 random records from the P61626 / 1gbw alignment and save them as a
# reduced MSA. write_subsampled_alignment prepends the main header and the
# query record on its own, so only the sampled records are passed in.
subsampled_sequences = subsample_sequences(
    '/Users/holger/Desktop/master_thesis/data/results_af2/P61626/P61626_default/P61626_1gbw.result/P61626_1gbw_c9d43.a3m',
    64,
)
write_subsampled_alignment(
    subsampled_sequences,
    '/Users/holger/Desktop/master_thesis/data/results_af2/P61626/MSA_64/P61626_1gbw_msa_64.a3m',
)

In [11]:
def subsample_sequences_uniform(alignment_file, intervals):
    """Pick one record per index interval from an A3M alignment file.

    The file is parsed record by record: a line starting with ``>`` opens a
    record and the following non-header line(s) form its sequence. Lines that
    start with ``#``, blank lines and anything before the first ``>`` header
    are ignored; a header without a sequence is dropped. Records are numbered
    from 0 in file order.

    For every entry of ``intervals`` one index is drawn with
    ``random.randint(interval[0], interval[1])``. Both bounds are inclusive,
    so two intervals that share an endpoint can yield the same record twice.
    The drawn indices are printed before the records are returned.

    Parameters
    ----------
    alignment_file : str
        Path of the alignment file to read.
    intervals : sequence of [int, int]
        Inclusive ``[low, high]`` index bounds, one draw per entry, processed
        in order.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(len(intervals), 2)`` whose rows are
        ``[header, sequence]``. Empty ``intervals`` give a ``(0, 2)`` array.

    Raises
    ------
    IndexError
        If a drawn index does not address an existing record.

    Notes
    -----
    Draws use the global ``random`` generator; seed it beforehand to make the
    selection reproducible.
    """
    records = []
    current_header = None
    sequence_parts = []
    with open(alignment_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if line.startswith('>'):
                # Close the previous record before opening a new one.
                if current_header is not None and sequence_parts:
                    records.append((current_header, ''.join(sequence_parts)))
                current_header = line
                sequence_parts = []
            elif current_header is not None:
                sequence_parts.append(line)
    if current_header is not None and sequence_parts:
        records.append((current_header, ''.join(sequence_parts)))

    # One draw per interval, in order, so a given seed always reproduces the
    # same set of indices.
    idx = np.asarray([random.randint(interval[0], interval[1]) for interval in intervals])
    print(idx)

    n_records = len(records)
    # Negative indices count from the end, as with NumPy/list indexing.
    out_of_range = [int(i) for i in idx if not -n_records <= i < n_records]
    if out_of_range:
        raise IndexError(
            'drawn indices {} are out of range for an alignment with {} records'.format(
                out_of_range, n_records))

    # Index the plain list and convert only the picked rows: turning the whole
    # alignment into a fixed-width unicode array pads every string to the
    # longest line in the file.
    selected = [records[int(i)] for i in idx]
    if not selected:
        return np.empty((0, 2), dtype=str)
    return np.asarray(selected)

In [23]:
# Uniform subsample for P0A7Y4 (1kvb): cut the record indices into 32
# consecutive windows and draw one record from each window.
# 16098.0/32 ≈ 503 indices per window (16098 is presumably the alignment's
# record count - TODO confirm).
# The window bounds go to random.randint, which includes both ends, so
# neighbouring windows share their boundary index.
# NOTE(review): write_subsampled_alignment starts every file with a fixed
# '>101' query record - confirm that this is the intended query for P0A7Y4.

intervals = [[start, start + 503] for start in range(0, 32 * 503, 503)]
# INDIR is the input A3M file (not a directory); OUTDIR is the output folder.
INDIR = '/Users/holger/Desktop/master_thesis/data/results_af2/P0A7Y4/P0A7Y4_1kvb.result/P0A7Y4_1kvb_90ba3.a3m'
OUTDIR = '/Users/holger/Desktop/master_thesis/data/results_msa/P0A7Y4_1kvb_MSA_32/'
# Ten replicates; each one is seeded with its own number so it can be redrawn.
for i in range(1, 11):
    random.seed(i)
    subsampled_sequences_u = subsample_sequences_uniform(INDIR, intervals).tolist()
    write_subsampled_alignment(
        subsampled_sequences_u,
        OUTDIR + 'P0A7Y4_1kvb_MSA_32_{}.a3m'.format(i),
    )
    

[   68   794  1439  1919  2403  2547  3148  3581  4277  4916  5260  5774
  6369  6733  7445  7652  8096  8800  9068 10014 10487 10762 11287 11880
 12462 12967 13079 13937 14312 14723 15459 16003]
[  489   944  1491  1943  2040  2561  3061  3705  4451  4613  5406  5947
  6378  6976  7199  7673  8358  8659  9364  9575 10357 10911 11147 11789
 12398 12776 13489 13951 14524 14847 15576 15783]
[  121   806  1284  1575  2201  2983  3327  3763  4344  4824  5063  5843
  6042  7004  7470  7785  8180  8833  9173  9655 10427 10803 11342 11997
 12353 12818 13281 13908 14524 14664 15208 15918]
[  120   658  1058  1878  2214  2760  3097  3567  4058  4537  5235  5814
  6505  6687  7451  7936  8078  8664  9320  9831 10244 10704 11465 11657
 12495 12629 13212 13690 14566 15061 15103 16017]
[  318   633  1385  1692  2419  2868  3500  3951  4402  4860  5502  5804
  6050  6969  7280  7942  8530  8678  9386  9583 10521 10643 11123 11759
 12312 13019 13204 13775 14362 14639 15383 15720]
[  406   796  1426  

In [47]:
# Uniform subsample for P61626 (1gbz): cut the record indices into 32
# consecutive windows and draw one record from each window.
# 4942.0/32 ≈ 154 indices per window (4942 is presumably the alignment's
# record count - TODO confirm).
# The window bounds go to random.randint, which includes both ends, so
# neighbouring windows share their boundary index.

intervals = [[start, start + 154] for start in range(0, 32 * 154, 154)]
# INDIR is the input A3M file (not a directory); OUTDIR is the output folder.
INDIR = '/Users/holger/Desktop/master_thesis/data/results_af2/P61626/P61626_default/P61626_1gbz.result/P61626_1gbz_8966e.a3m'
OUTDIR = '/Users/holger/Desktop/master_thesis/data/results_msa/P61626_1gbz_MSA_32/'
# Ten replicates; each one is seeded with its own number so it can be redrawn.
for i in range(1, 11):
    random.seed(i)
    subsampled_sequences_u = subsample_sequences_uniform(INDIR, intervals).tolist()
    write_subsampled_alignment(
        subsampled_sequences_u,
        OUTDIR + 'P61626_1gbz_MSA_32_{}.a3m'.format(i),
    )
    

[  34  299  324  527  646  896 1039 1198 1329 1439 1564 1818 1855 2101
 2266 2310 2578 2686 2830 3077 3106 3315 3395 3547 3702 3988 4006 4255
 4367 4574 4627 4909]
[  14  177  329  554  659  848  988 1132 1241 1534 1580 1804 1948 2132
 2251 2449 2577 2746 2840 2935 3087 3327 3507 3623 3793 3958 4138 4200
 4455 4511 4680 4833]
[  60  305  447  495  710  924 1045 1226 1248 1389 1660 1760 1989 2061
 2205 2430 2602 2758 2893 3027 3118 3293 3426 3675 3795 3853 4020 4198
 4463 4476 4697 4781]
[  60  231  334  563  738  809  947 1095 1237 1488 1680 1768 1863 2058
 2289 2447 2556 2688 2816 2953 3147 3288 3394 3608 3765 3899 4046 4237
 4386 4561 4642 4860]
[  65  245  443  469  735  833  937 1118 1260 1481 1660 1757 1945 2141
 2182 2456 2527 2621 2827 3030 3151 3280 3487 3582 3714 3885 4117 4190
 4345 4466 4621 4827]
[ 146  174  432  528  625  770  961 1228 1352 1481 1621 1699 1917 2127
 2206 2415 2601 2756 2796 2975 3224 3375 3455 3564 3804 3935 4027 4250
 4416 4530 4733 4798]
[  82  192  409 

In [48]:
# Uniform subsample for P61626 (1gft): cut the record indices into 32
# consecutive windows and draw one record from each window.
# 4914.0/32 ≈ 153.6, floored to 153 indices per window (4914 is presumably
# the alignment's record count - TODO confirm).
# The window bounds go to random.randint, which includes both ends, so
# neighbouring windows share their boundary index.

intervals = [[start, start + 153] for start in range(0, 32 * 153, 153)]
# INDIR is the input A3M file (not a directory); OUTDIR is the output folder.
INDIR = '/Users/holger/Desktop/master_thesis/data/results_af2/P61626/P61626_default/P61626_1gft.result/P61626_1gft_b1c92.a3m'
OUTDIR = '/Users/holger/Desktop/master_thesis/data/results_msa/P61626_1gft_MSA_32/'
# Ten replicates; each one is seeded with its own number so it can be redrawn.
for i in range(1, 11):
    random.seed(i)
    subsampled_sequences_u = subsample_sequences_uniform(INDIR, intervals).tolist()
    write_subsampled_alignment(
        subsampled_sequences_u,
        OUTDIR + 'P61626_1gft_MSA_32_{}.a3m'.format(i),
    )
    

[  34  298  322  524  642  891 1033 1191 1321 1430 1554 1807 1843 2088
 2252 2295 2562 2669 2812 3058 3086 3294 3373 3524 3678 3963 3980 4228
 4339 4545 4597 4878]
[  14  176  327  551  655  843  982 1125 1233 1525 1570 1793 1936 2119
 2237 2434 2561 2729 2822 2916 3067 3306 3485 3600 3769 3933 4112 4173
 4427 4482 4650 4802]
[  60  304  445  492  706  886 1066 1087 1227 1497 1596 1824 1895 2038
 2262 2433 2588 2722 2855 2945 3119 3251 3499 3618 3675 3841 4018 4282
 4294 4514 4597 4811]
[  60  230  332  560  734  804  941 1088 1229 1479 1670 1757 1851 2045
 2275 2432 2540 2671 2798 2934 3127 3267 3372 3585 3741 3874 4020 4210
 4358 4532 4612 4829]
[  65  244  441  466  731  828  931 1111 1252 1472 1650 1746 1933 2128
 2168 2441 2511 2604 2809 3011 3131 3259 3465 3559 3690 3860 4091 4163
 4317 4437 4591 4796]
[ 146  173  430  525  621  765  955 1221 1344 1472 1611 1688 1905 2114
 2192 2400 2585 2739 2778 2956 3204 3354 3433 3541 3780 3910 4001 4223
 4388 4501 4703 4767]
[  82  191  407 

In [49]:
# Sanity check: character count of this sequence literal (presumably a protein
# query sequence - TODO confirm which one); the recorded cell output is 164.
len('MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNCNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRCALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFETGTWDAYKNL')

164