## Run Experiment

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score

def calculate_similarity_cosine(item, df):
    result = 0
    for i in df.sample(10)["Text"]:
        vectorizer = CountVectorizer().fit_transform([item, i])
        vectors = vectorizer.toarray()
        cosine_sim = cosine_similarity(vectors)
        result += cosine_sim[0][1]
    return result / 10

def calculate_similarity_jaccard(item, df):
    result = 0
    vectorizer = CountVectorizer(binary=True).fit_transform([item] + df.sample(10)["Text"].tolist())
    item_vector = vectorizer.toarray()[0]
    
    for i in range(1, 11):
        i_vector = vectorizer.toarray()[i]
        jaccard_sim = jaccard_score(item_vector, i_vector)
        result += jaccard_sim
    return result / 10

def run_analysis(lang, method, save_file):
    web_data_route = f'Experiment Data/Web/{lang}/{lang}_Web.gz'
    legal_data_route = f'Experiment Data/Legal/{lang}/{lang}_Law.gz'
    tv_data_route = f'Experiment Data/TV/{lang}/{lang}_TV.gz'

    result_df = pd.DataFrame(columns=['Domain', 'Similarity to TV', 'Similarity to Legal', 'Similarity to Web'])

    web_df = pd.read_csv(web_data_route, compression='gzip')
    legal_df = pd.read_csv(legal_data_route, compression='gzip')
    tv_df = pd.read_csv(tv_data_route, compression='gzip')

    for df, domain in zip([web_df, legal_df, tv_df], ['Web', 'Legal', 'TV']):
        sample = df.sample(40)
        for i in sample["Text"]:
            new_row = pd.DataFrame({'Domain': [domain], 'Similarity to TV': [method(i, tv_df)], 'Similarity to Legal': [method(i, legal_df)], 'Similarity to Web': [method(i, web_df)]})
            result_df = pd.concat([result_df, new_row], ignore_index=True)

    result_df.to_csv(save_file)

In [11]:
for lang in ['EN', 'FI', 'EL', 'PT', 'PL']:
    run_analysis(lang, calculate_similarity_jaccard, f'Results/{lang}_jaccard_results.csv')
    run_analysis(lang, calculate_similarity_cosine, f'Results/{lang}_cosine_results.csv')

  result_df = pd.concat([result_df, new_row], ignore_index=True)
  result_df = pd.concat([result_df, new_row], ignore_index=True)
  result_df = pd.concat([result_df, new_row], ignore_index=True)
  result_df = pd.concat([result_df, new_row], ignore_index=True)
  result_df = pd.concat([result_df, new_row], ignore_index=True)
  result_df = pd.concat([result_df, new_row], ignore_index=True)
  result_df = pd.concat([result_df, new_row], ignore_index=True)
  result_df = pd.concat([result_df, new_row], ignore_index=True)
  result_df = pd.concat([result_df, new_row], ignore_index=True)
  result_df = pd.concat([result_df, new_row], ignore_index=True)
