In [1]:
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

In [7]:
# N-gram sizes to analyse. Each name is used both as a component of the CSV
# file names and as the n-gram column name inside those files.
ngrams = ['bigram', 'trigram', 'fourgram']
# Identifiers of the two versions being compared (CSV file-name prefixes).
version1 = '1584_1883'
version2 = 'na1947'
# N-gram granularity; the middle component of the CSV file names.
ngramtypes = ['word', 'char']
# Rows are kept when frequency > freq_cutoff - 1, so 0 keeps every row whose
# frequency is greater than -1.
freq_cutoff = 0
# Column(s) compared by the Wilcoxon test; after the merge they appear as
# <metric>_x and <metric>_y.
metrics = ['frequency']

## Similarity

### Between sets

In [8]:
def prepare_data(ngramtype, ngram, version1, version2, freq_cutoff):
    """Load the n-gram tables of two versions and outer-merge them.

    Each table is read from ``Data/<version>_<ngramtype>_<ngram>.csv`` (first
    CSV column used as the index) and restricted to rows whose ``frequency``
    is greater than ``freq_cutoff - 1``. The two tables are then merged on the
    ``ngram`` column with an outer join; values missing on either side are
    replaced by 0.

    Returns the merged DataFrame, whose overlapping columns carry the pandas
    default ``_x`` (first version) and ``_y`` (second version) suffixes.
    """
    lower_bound = freq_cutoff - 1

    def _load_filtered(csv_path):
        # Read one table and drop the rows at or below the exclusive bound.
        table = pd.read_csv(csv_path, index_col=0)
        return table[table['frequency'] > lower_bound]

    left = _load_filtered(f"Data/{version1}_{ngramtype}_{ngram}.csv")
    right = _load_filtered(f"Data/{version2}_{ngramtype}_{ngram}.csv")
    return left.merge(right, on=ngram, how='outer').fillna(0)

In [9]:
def wilcoxon(data, metrics, alpha=0.05, *, ngramtype=None, ngram=None,
             version1=None, version2=None):
    """Run a Wilcoxon signed-rank test for each metric of a merged table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding, for every metric, the paired columns ``<metric>_x``
        and ``<metric>_y``.
    metrics : iterable of str
        Metric names to test.
    alpha : float, default 0.05
        Significance level; a row is flagged ``'yes'`` when its p-value is
        strictly below this value.
    ngramtype, ngram, version1, version2 : optional, keyword-only
        Labels written into every result row. When a label is omitted it is
        looked up in the notebook-level variable of the same name, which is
        what the function did before these parameters existed.

    Returns
    -------
    list of list
        One row per metric:
        ``[ngramtype, "<version1> vs <version2>", ngram, metric, pvalue, 'yes' | 'no']``.

    Raises
    ------
    NameError
        If a label is neither passed explicitly nor defined at notebook level.
    """
    notebook_scope = globals()

    def _label(name, explicit):
        # Explicit arguments win; otherwise fall back to the notebook global.
        if explicit is not None:
            return explicit
        try:
            return notebook_scope[name]
        except KeyError:
            raise NameError(f"name '{name}' is not defined") from None

    outcome = []
    for metric in metrics:
        _statistic, pvalue = stats.wilcoxon(data[f"{metric}_x"], data[f"{metric}_y"])
        sign = 'yes' if pvalue < alpha else 'no'
        outcome.append([
            _label('ngramtype', ngramtype),
            _label('version1', version1) + " vs " + _label('version2', version2),
            _label('ngram', ngram),
            metric,
            pvalue,
            sign,
        ])
    return outcome

In [17]:
# Collect one Wilcoxon result row per (n-gram type, n-gram size) combination.
outcomes = []

# NOTE: the loop variables must keep the names `ngramtype` and `ngram`.
# wilcoxon() is called without label arguments, so it picks these names
# (together with version1 / version2) up from the notebook namespace when it
# labels each result row.
for ngramtype in ngramtypes:
    for ngram in ngrams:
        print(ngram)
        res = wilcoxon(prepare_data(ngramtype, ngram, version1, version2, freq_cutoff), metrics)
        outcomes.extend(res)

bigram
trigram
fourgram
bigram
trigram
fourgram


In [18]:
# Tabulate the collected rows; the column order mirrors the lists built by wilcoxon().
outcomes_df = pd.DataFrame(outcomes, columns = ['ngram type','version', 'ngram', 'metric', 'pvalue', 'significant'])

In [19]:
# Display the between-sets result table.
outcomes_df

Unnamed: 0,ngram type,version,ngram,metric,pvalue,significant
0,word,1584_1883 vs na1947,bigram,frequency,0.0,yes
1,word,1584_1883 vs na1947,trigram,frequency,2.887161e-292,yes
2,word,1584_1883 vs na1947,fourgram,frequency,0.06692609,no
3,char,1584_1883 vs na1947,bigram,frequency,0.1092703,no
4,char,1584_1883 vs na1947,trigram,frequency,0.7942572,no
5,char,1584_1883 vs na1947,fourgram,frequency,0.04912891,yes


In [26]:
# Emit the same table as Markdown text (DataFrame.to_markdown needs the
# optional `tabulate` package to be installed).
print(outcomes_df.to_markdown())

|    | ngram type   | version             | ngram    | metric    |       pvalue | significant   |
|---:|:-------------|:--------------------|:---------|:----------|-------------:|:--------------|
|  0 | word         | 1584_1883 vs na1947 | bigram   | frequency | 0            | yes           |
|  1 | word         | 1584_1883 vs na1947 | trigram  | frequency | 2.88716e-292 | yes           |
|  2 | word         | 1584_1883 vs na1947 | fourgram | frequency | 0.0669261    | no            |
|  3 | char         | 1584_1883 vs na1947 | bigram   | frequency | 0.10927      | no            |
|  4 | char         | 1584_1883 vs na1947 | trigram  | frequency | 0.794257     | no            |
|  5 | char         | 1584_1883 vs na1947 | fourgram | frequency | 0.0491289    | yes           |


### Between samples

In [27]:
def prepare_data(ngram, version, freq_cutoff, ngramtype=None):
    """Load the two samples of one version and outer-merge them.

    The samples are read from
    ``Data/<version>_<ngramtype>_<ngram>_sample1.csv`` and ``..._sample2.csv``
    (first CSV column used as the index), restricted to rows whose
    ``frequency`` is greater than ``freq_cutoff - 1``, merged on the ``ngram``
    column with an outer join, and missing values are replaced by 0.

    Parameters
    ----------
    ngram : str
        N-gram size; part of the file name and name of the merge column.
    version : str
        Version whose two samples are compared. Earlier revisions ignored this
        argument and built the paths from the notebook globals ``version1`` /
        ``version2`` instead.
    freq_cutoff : number
        Rows with ``frequency <= freq_cutoff - 1`` are dropped.
    ngramtype : str, optional
        N-gram granularity used in the file name. When omitted, the
        notebook-level variable ``ngramtype`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Merged table; overlapping columns get the ``_x`` (sample 1) and
        ``_y`` (sample 2) suffixes.

    Raises
    ------
    NameError
        If ``ngramtype`` is neither passed nor defined at notebook level.

    Notes
    -----
    This function shares its name with the five-argument ``prepare_data``
    defined earlier in the notebook; once this cell has run, the earlier
    definition is no longer reachable under that name.
    """
    if ngramtype is None:
        # Backward compatibility: fall back to the notebook loop variable.
        try:
            ngramtype = globals()["ngramtype"]
        except KeyError:
            raise NameError("name 'ngramtype' is not defined") from None

    lower_bound = freq_cutoff - 1
    samples = []
    for sample_no in (1, 2):
        table = pd.read_csv(
            f"Data/{version}_{ngramtype}_{ngram}_sample{sample_no}.csv", index_col=0
        )
        samples.append(table[table['frequency'] > lower_bound])

    first, second = samples
    return first.merge(second, on=ngram, how='outer').fillna(0)

In [29]:
# Within-version comparison: both names refer to the same version. Keep them
# equal to the version string passed to prepare_data() below so that the
# labels written by wilcoxon() and the files that are loaded agree.
version1 = '1584_1883'
version2 = '1584_1883'

outcomes = []

# The loop variables `ngramtype` and `ngram` are picked up from the notebook
# namespace by the helpers called here, so their names must not change.
for ngramtype in ngramtypes:
    for ngram in ngrams:
        print(ngram)
        res = wilcoxon(prepare_data(ngram, '1584_1883', freq_cutoff), metrics)
        outcomes.extend(res)

bigram
trigram
fourgram
bigram
trigram
fourgram


In [31]:
# Tabulate and display the sample-vs-sample results collected in `outcomes`.
outcomes_df = pd.DataFrame(outcomes, columns = ['ngram type','version', 'ngram', 'metric', 'pvalue', 'significant'])
outcomes_df

Unnamed: 0,ngram type,version,ngram,metric,pvalue,significant
0,word,1584_1883 vs 1584_1883,bigram,frequency,3.264244e-12,yes
1,word,1584_1883 vs 1584_1883,trigram,frequency,1.3727060000000001e-27,yes
2,word,1584_1883 vs 1584_1883,fourgram,frequency,2.322993e-34,yes
3,char,1584_1883 vs 1584_1883,bigram,frequency,1.4472110000000001e-22,yes
4,char,1584_1883 vs 1584_1883,trigram,frequency,3.862959e-28,yes
5,char,1584_1883 vs 1584_1883,fourgram,frequency,2.72401e-23,yes


In [32]:
# Within-version comparison for 'na1947': both names refer to the same
# version. Keep them equal to the version string passed to prepare_data()
# below so that the labels written by wilcoxon() and the loaded files agree.
version1 = 'na1947'
version2 = 'na1947'
outcomes = []

# The loop variables `ngramtype` and `ngram` are picked up from the notebook
# namespace by the helpers called here, so their names must not change.
for ngramtype in ngramtypes:
    for ngram in ngrams:
        print(ngram)
        res = wilcoxon(prepare_data(ngram, 'na1947', freq_cutoff), metrics)
        outcomes.extend(res)

bigram
trigram
fourgram
bigram
trigram
fourgram


In [34]:
# Tabulate and display the sample-vs-sample results collected in `outcomes`.
outcomes_df = pd.DataFrame(outcomes, columns = ['ngram type','version', 'ngram', 'metric', 'pvalue', 'significant'])
outcomes_df

Unnamed: 0,ngram type,version,ngram,metric,pvalue,significant
0,word,na1947 vs na1947,bigram,frequency,0.101108,no
1,word,na1947 vs na1947,trigram,frequency,0.176628,no
2,word,na1947 vs na1947,fourgram,frequency,0.225586,no
3,char,na1947 vs na1947,bigram,frequency,0.006923,yes
4,char,na1947 vs na1947,trigram,frequency,0.006156,yes
5,char,na1947 vs na1947,fourgram,frequency,0.00252,yes


## Overlap

In [35]:
def prepare_data_overlap(ngram, version1, version2, freq_cutoff, ngramtype=None):
    """Return the n-gram lists of two versions after frequency filtering.

    Each table is read from ``Data/<version>_<ngramtype>_<ngram>.csv`` (first
    CSV column used as the index) and restricted to rows whose ``frequency``
    is greater than ``freq_cutoff - 1``.

    Parameters
    ----------
    ngram : str
        N-gram size; part of the file name and name of the column returned.
    version1, version2 : str
        Version identifiers used as file-name prefixes.
    freq_cutoff : number
        Rows with ``frequency <= freq_cutoff - 1`` are dropped.
    ngramtype : str, optional
        N-gram granularity used in the file name. When omitted, the
        notebook-level variable ``ngramtype`` is used, as before.

    Returns
    -------
    tuple of (list, list)
        Values of the ``ngram`` column for ``version1`` and ``version2``,
        in file order.

    Raises
    ------
    NameError
        If ``ngramtype`` is neither passed nor defined at notebook level.
    """
    if ngramtype is None:
        # Backward compatibility: fall back to the notebook loop variable.
        try:
            ngramtype = globals()["ngramtype"]
        except KeyError:
            raise NameError("name 'ngramtype' is not defined") from None

    lower_bound = freq_cutoff - 1

    def _load_grams(csv_path):
        # Read one table and keep only the n-grams above the exclusive bound.
        table = pd.read_csv(csv_path, index_col=0)
        return table.loc[table['frequency'] > lower_bound, ngram].to_list()

    grams1 = _load_grams(f"Data/{version1}_{ngramtype}_{ngram}.csv")
    grams2 = _load_grams(f"Data/{version2}_{ngramtype}_{ngram}.csv")
    return grams1, grams2

In [36]:
# Restore the between-version comparison (earlier cells set both names to the
# same version).
version1 = '1584_1883'
version2 = 'na1947'

# For every (n-gram type, n-gram size) pair, print the share of unique n-grams
# of each version that also occurs in the other version, as a percentage.
# NOTE: `ngramtype` is not passed to prepare_data_overlap(); the helper picks
# it up from the notebook namespace, so the loop variable name must not change.
for ngramtype in ngramtypes:
    for ngram in ngrams:
        #print(ngram)
        data1, data2 = prepare_data_overlap(ngram, version1, version2, freq_cutoff)
        # Number of distinct n-grams in version1.
        len1 = len(set(data1))
        #print(len1)
        # Number of distinct n-grams in version2.
        len2 = (len(set(data2)))
        #print(len2)
        # Distinct n-grams present in both versions.
        overlap = set(data1).intersection(set(data2))
        #print(len(overlap))
        # A version with no n-grams left after filtering would make the
        # division below raise ZeroDivisionError.
        print(f'Overlap {ngramtype}, {ngram}, {version1} : {((len(overlap)/len1)*100)}')
        print(f'Overlap {ngramtype}, {ngram},{version2} : {((len(overlap)/len2)*100)}')

Overlap word, bigram, 1584_1883 : 14.301498420054127
Overlap word, bigram,na1947 : 16.759149516934237
Overlap word, trigram, 1584_1883 : 5.894678520160173
Overlap word, trigram,na1947 : 6.4417760432864455
Overlap word, fourgram, 1584_1883 : 2.1338118438626736
Overlap word, fourgram,na1947 : 2.197427000768216
Overlap char, bigram, 1584_1883 : 70.5
Overlap char, bigram,na1947 : 80.31645569620252
Overlap char, trigram, 1584_1883 : 71.66527323858021
Overlap char, trigram,na1947 : 76.5713821666396
Overlap char, fourgram, 1584_1883 : 63.12237945492662
Overlap char, fourgram,na1947 : 68.30305715551617


In [1]:
def venn(ngramtype, ngram, version1, version2):
    """Draw a two-set Venn diagram of the n-grams of two versions.

    Each table is read from ``Data/<version>_<ngramtype>_<ngram>.csv`` (first
    CSV column used as the index) and restricted to rows whose ``frequency``
    is greater than ``freq_cutoff - 1``. ``freq_cutoff`` is read from the
    notebook namespace and is also shown in the plot title.

    Parameters
    ----------
    ngramtype : str
        N-gram granularity; part of the file names and the set labels.
    ngram : str
        N-gram size; part of the file names and name of the n-gram column.
    version1, version2 : str
        Version identifiers used as file-name prefixes and in the set labels.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so callers can chain
        ``savefig`` / ``show`` on the figure that was just drawn.
    """
    lower_bound = freq_cutoff - 1

    def _load_gram_set(csv_path):
        # Read one table and return its distinct n-grams above the bound.
        table = pd.read_csv(csv_path, index_col=0)
        return set(table.loc[table['frequency'] > lower_bound, ngram].to_list())

    # Build each set once and reuse it for both the patches and the outlines.
    grams1 = _load_gram_set(f"Data/{version1}_{ngramtype}_{ngram}.csv")
    grams2 = _load_gram_set(f"Data/{version2}_{ngramtype}_{ngram}.csv")

    venn2(
        [grams1, grams2],
        set_colors=('#3E64AF', '#3EAF5D'),
        set_labels=(f'{ngramtype} {ngram} {version1}',
                    f'{ngramtype} {ngram} {version2}'),
        alpha=0.75,
    )
    # The title reports the cut-off that was actually applied, not a fixed "0".
    plt.title(f"Cut-off frequencies at {freq_cutoff}")
    venn2_circles([grams1, grams2], lw=0.7)
    return plt

In [2]:
# Draw, save and show one Venn diagram per (n-gram type, n-gram size) pair.
# This cell needs `ngramtypes`, `ngrams`, `version1`, `version2` and
# `freq_cutoff` from the earlier cells; the recorded NameError below comes
# from running it without them.
for ngramtype in ngramtypes:
    for ngram in ngrams:
        print(ngram)
        # venn() draws on the current pyplot figure. Its return value is not
        # assigned, so the module alias `plt` is no longer rebound.
        venn(ngramtype, ngram, version1, version2)
        plt.savefig(f"Images/{ngramtype}{ngram}.jpg")
        plt.show()
        # Release the figure so the diagrams do not pile up across iterations.
        plt.close()

NameError: name 'ngramtypes' is not defined