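### Purpose of this notebook

Explore duplicated keyword/link pairs in the cleaned search-ranking data, join that data with the keyword exports, and plot how `percent_human` relates to search `rank`.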

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import matplotlib.cm as cm
#from IPython.display import JSON
import math
import inspect
from typing import List, Dict, Optional

### Functions

In [2]:
def show_df_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise a DataFrame's columns and print a one-line size report.

    The variable name shown in the printed report is looked up in the caller's
    namespace by identity. When the frame is not bound to any name there
    (for example ``show_df_info(df.head())``) the placeholder ``<unnamed>``
    is printed instead of raising.

    Args:
        df (pandas.DataFrame): The DataFrame to process.

    Returns:
        pandas.DataFrame: One row per column of ``df`` with the fields
        ``column_name``, ``dtype``, ``non_null_count`` and ``null_count``.
    """
    info_df = pd.DataFrame({
        'column_name': df.columns.to_list(),
        'dtype': df.dtypes.to_list(),
        'non_null_count': df.count().to_list(),
        'null_count': df.isnull().sum().to_list(),
    })

    # inspect.currentframe() may return None on interpreters without frame support.
    current_frame = inspect.currentframe()
    caller_frame = current_frame.f_back if current_frame is not None else None
    try:
        caller_vars = caller_frame.f_locals if caller_frame is not None else {}
        matching_names = [name for name, value in list(caller_vars.items()) if value is df]
    finally:
        # Drop frame references promptly to avoid reference cycles.
        del caller_frame, current_frame

    # Prefer a user-facing name over interpreter bookkeeping such as IPython's `_`.
    public_names = [name for name in matching_names if not name.startswith('_')]
    if public_names:
        df_name = public_names[0]
    elif matching_names:
        df_name = matching_names[0]
    else:
        df_name = '<unnamed>'

    print(f"DataFrame '{df_name}' has {len(df)} rows and {len(df.columns)} columns.")
    print("Here is a summary of the column names, data types and null counts:")
    return info_df

In [3]:
def get_unique_links(df):
    """Return the distinct values of the 'link' column as a list, in order of first appearance."""
    link_column = df['link']
    return link_column.unique().tolist()

In [4]:
def get_nonunique_links(df):
    """
    Return the 'link' value of every row that repeats an earlier link.

    The first occurrence of each link is not included, so a link that appears
    three times contributes two entries to the returned list.
    """
    is_repeat = df.duplicated(subset=['link'])
    return df.loc[is_repeat, 'link'].tolist()

In [5]:
def multiplotv_dark(df: pd.DataFrame, filter_col: Optional[str] = None, fmin: Optional[float] = None, fmax: Optional[float] = None) -> None:
    """
    Draw a dark-themed 2x2 figure relating 'percent_human' to 'rank', save it and show it.

    Panels: Q-Q plot of percent_human, histogram of percent_human, violin plot of
    percent_human per rank (with mean lines and median markers), and mean
    percent_human per rank with a regression line plus a small correlation table.

    :param df: DataFrame containing at least the 'rank' and 'percent_human' columns
    :param filter_col: Name of the column to filter on (optional)
    :param fmin: Lower bound for filter_col (optional, inclusive)
    :param fmax: Upper bound for filter_col (optional, inclusive)

    Filtering rules when filter_col is given:
      * fmin < fmax: keep rows with fmin <= value <= fmax.
      * fmin >= fmax: the bounds are swapped and rows at or outside the two
        bounds are kept (value <= lower or value >= upper).
      * only one bound given: keep rows on the matching side of that bound.
      * no bound given: a warning is printed and no filtering happens.

    Side effects: writes 'figures/Fig_1-4.png' (creating the 'figures' directory
    if needed) and calls plt.show().
    """

    # define subplots
    face_color = '0.1'
    fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16, 12), facecolor=face_color)

    # apply filter and subtitle
    if filter_col:
        if fmin is not None and fmax is not None:
            if fmin < fmax:
                fig.text(0.5, 0.94, f'Filtered on {fmin} < {filter_col} < {fmax}', ha='center', fontsize=16, color='white')
                df = df[(df[filter_col] <= fmax) & (df[filter_col] >= fmin)]
            else:
                # Reversed bounds select the rows at or outside the interval.
                fmin, fmax = fmax, fmin
                fig.text(0.5, 0.94, f'Filtered on {fmin} > {filter_col} > {fmax}', ha='center', fontsize=16, color='white')
                df = df[(df[filter_col] >= fmax) | (df[filter_col] <= fmin)]
        elif fmin is not None and fmax is None:
            fig.text(0.5, 0.94, f'Filtered on {filter_col} > {fmin}', ha='center', fontsize=16, color='white')
            df = df[df[filter_col] >= fmin]
        elif fmin is None and fmax is not None:
            fig.text(0.5, 0.94, f'Filtered on {filter_col} < {fmax}', ha='center', fontsize=16, color='white')
            df = df[df[filter_col] <= fmax]
        else:
            print(f'Warning: you must provide a min and/or max on which to filter on {filter_col} or no filtering will occur')

    # apply main title
    main_title = f'Correlation Study: Rank vs Percent Human ({len(df)} data points)'
    fig.suptitle(main_title, fontsize=20, fontweight='bold', y=.98, color='white')

    # Fig 1: Q-Q plot [0, 0]
    st.probplot(df['percent_human'], dist='norm', plot=axs[0, 0])
    axs[0, 0].set_title('Figure 1: Q-Q Plot of Percent Human')
    axs[0, 0].set_xlabel('Theoretical quantiles')
    axs[0, 0].set_ylabel('Sample quantiles')
    axs[0, 0].text(0.02, 0.95, 'Data is not normally distributed\nand cannot be evaluated using Pearson\'s method', transform=axs[0, 0].transAxes, fontsize=12, verticalalignment='top', color='white')

    # Fig 2: Histogram [0, 1]
    sns.histplot(data=df['percent_human'], kde=False, binwidth=1, color='green', ax=axs[0, 1], edgecolor='grey')
    axs[0, 1].set_title('Figure 2: Histogram of Percent Human')
    axs[0, 1].set_xlabel('Percent Human')
    axs[0, 1].set_ylabel('Count')
    axs[0, 1].set_xlim(-5, 105)
    axs[0, 1].set_ylim(0, None)
    skewness = round(st.skew(df['percent_human']), 2)
    axs[0, 1].text(0.02, 0.95, f'Data Skewness {skewness}', transform=axs[0, 1].transAxes, fontsize=12, verticalalignment='top', color='white')

    # Fig 3: Violin plot [1, 0]
    sns.violinplot(ax=axs[1, 0], data=df, x="rank", y="percent_human", scale="count", inner="box", color="#00aa00", saturation=0.5, cut=0, linewidth=.9)

    # plot mean lines; violins sit at positional slots 0..k-1 in sorted-rank order
    for i, mean in enumerate(df.groupby("rank")["percent_human"].mean()):
        axs[1, 0].hlines(mean, i-0.25, i+0.25, linewidth=1, color='#bbbbbb', zorder=100)

    # plot median markers at the same positional slots as the mean lines, so they
    # stay aligned even when a rank is missing or ranks do not start at 1
    median_markers = df.groupby('rank')['percent_human'].median()
    median_positions = np.arange(len(median_markers))
    sns.scatterplot(ax=axs[1, 0], x=median_positions, y=median_markers.values, marker='o', s=20, color='white', edgecolor='black', zorder=100)

    axs[1, 0].set_title('Figure 3: Percent Human vs Rank')
    axs[1, 0].set_xlabel('Rank')
    axs[1, 0].set_ylabel('Percent Human')

    # Fig 4: Scatter plot [1, 1]
    grouped_data = df.groupby('rank')
    mean_percent_human = grouped_data['percent_human'].mean().reset_index()
    sns.scatterplot(ax=axs[1, 1], data=mean_percent_human, x='rank', y='percent_human',  color='green', edgecolor=None)
    sns.regplot(ax=axs[1, 1], data=mean_percent_human, x='rank', y='percent_human', color='green', scatter=False, line_kws={'linestyle':'--'})
    axs[1, 1].set_title('Figure 4: Mean Percent Human vs Rank')
    axs[1, 1].set_xlabel('Rank')
    axs[1, 1].set_ylabel('Mean Percent Human')
    axs[1, 1].set_xticks(np.arange(1, 21))
    axs[1, 1].legend(handles=axs[1, 1].lines[::len(mean_percent_human)], labels=['Best fit line'], facecolor=face_color, labelcolor='white')
    axs[1, 1].text(0.02, 0.05, f'Percent Human is weakly correlated to Rank with a high degree of certainty', transform=axs[1, 1].transAxes, fontsize=10, verticalalignment='top', color='white')

    # Calculate correlation coefficients and p-values
    pb_corr, pb_pval = st.pointbiserialr(df['rank'], df['percent_human'])
    spearman_corr, spearman_pval = st.spearmanr(df['rank'], df['percent_human'])
    kendall_tau, kendall_pval = st.kendalltau(df['rank'], df['percent_human'])

    # Create a dictionary to store the results
    corr = {
        'Method': ['Point Biserial', 'Spearman', 'Kendall'],
        'Corr Coef': [pb_corr, spearman_corr, kendall_tau],
        'P-value': [pb_pval, spearman_pval, kendall_pval]
    }

    # Create a pandas dataframe from the corr dict
    df_corr = pd.DataFrame(corr)

    # Create a table to display the correlation coefficients and p-values
    table_data = [list(df_corr.columns)] + df_corr.values.tolist()
    table = axs[1, 1].table(cellText=table_data,
                    colLabels=None,
                    cellLoc='center',
                    bbox=[0.02, 0.08, 0.5, 0.15],  # x, y, width, height
                    cellColours=[[face_color]*3] + [[face_color]*3]*len(df_corr))
    table.set_fontsize(10)
    for cell in table.get_celld().values():
        cell.set_linewidth(0)
        cell.set_edgecolor('none')
        cell.set_text_props(color='white')

    # Format the numeric table cells to 4 significant figures
    # (row 0 is the header, column 0 is the method name)
    for i in range(1, len(table_data)):
        for j in range(1, 3):
            cell = table.get_celld()[i, j]
            cell_text = cell.get_text().get_text()
            cell_text = float(cell_text)
            cell_text = f'{cell_text:.4g}'
            cell.get_text().set_text(cell_text)

    font_color = '#cccccc'
    for ax in axs.flat:
        ax.tick_params(axis='x', labelsize=12, colors=font_color)
        ax.tick_params(axis='y', labelsize=12, colors=font_color)
        ax.set_xlabel(ax.get_xlabel(), fontdict={'fontsize': 14, 'color': font_color})
        # Re-apply each axis' own y-label; using get_xlabel() here would overwrite it.
        ax.set_ylabel(ax.get_ylabel(), fontdict={'fontsize': 14, 'color': font_color})
        ax.set_title(ax.get_title(), fontdict={'fontsize': 16, 'color': 'white'})
        ax.set_facecolor(face_color)
        for spine in ax.spines.values():
            spine.set_edgecolor('white')

    # shrink outer margin
    fig.subplots_adjust(left=0.05, right=0.95, top=.90, bottom=0.05)

    # savefig does not create missing directories
    os.makedirs('figures', exist_ok=True)
    fig.savefig('figures/Fig_1-4.png')
    plt.show()

In [6]:
def get_highest_kw_per_link(df, links):
    """
    For each entry in `links`, return the 'kw' value that occurs most often
    among the rows of `df` with that link.

    The result has one element per entry of `links`, in the same order; the
    element is None when no 'kw' values exist for that link.
    """
    def _most_frequent_kw(link):
        # value_counts() orders by frequency, so the first index label is the top keyword.
        frequencies = df.loc[df['link'] == link, 'kw'].value_counts()
        if frequencies.empty:
            return None
        return frequencies.index[0]

    return [_most_frequent_kw(link) for link in links]

### Execution

In [16]:
df = pd.read_csv('data/data_clean.old.csv')
df.head(1)

Unnamed: 0,kw,rank,link,success,word_count,percent_human,percent_ai
0,how to finish concrete,1,https://www.familyhandyman.com/project/how-to-...,True,1689.0,99.926081,0.073917


In [8]:
len(df)

14637

In [9]:
# Identify each row by its keyword/link pair: "<kw>_<link>", with surrounding
# whitespace removed. Built column-wise (str() of each value, then joined)
# rather than with a row-wise apply, which yields the same text far faster.
df['uid'] = (df['kw'].map(str) + '_' + df['link'].map(str)).str.strip()

In [10]:
df['uid'].nunique()

14371

In [11]:
result = df[df['uid'].duplicated(keep=False)]

In [12]:
result

Unnamed: 0,kw,rank,link,success,word_count,percent_human,percent_ai,uid
36,early stage blood clot in foot pictures,9,https://stock.adobe.com/search?k=blood%20clots...,True,231.0,3.062132,96.937871,early stage blood clot in foot pictures_https:...
43,early stage blood clot in foot pictures,17,https://stock.adobe.com/search?k=blood%20clots...,True,231.0,3.062132,96.937871,early stage blood clot in foot pictures_https:...
91,diastasis,15,https://www.medicalnewstoday.com/articles/dias...,True,923.0,96.124017,3.875982,diastasis_https://www.medicalnewstoday.com/art...
93,diastasis,17,https://dianeleephysio.com/education/diastasis...,True,4885.0,97.989366,2.010634,diastasis_https://dianeleephysio.com/education...
94,diastasis,18,https://www.medicalnewstoday.com/articles/dias...,True,923.0,96.124017,3.875982,diastasis_https://www.medicalnewstoday.com/art...
...,...,...,...,...,...,...,...,...
14590,bilateral pneumonia,18,https://www.cidrap.umn.edu/argentine-officials...,True,636.0,96.696997,3.303000,bilateral pneumonia_https://www.cidrap.umn.edu...
14591,bilateral pneumonia,19,https://medlineplus.gov/pneumonia.html,True,515.0,1.570927,98.429072,bilateral pneumonia_https://medlineplus.gov/pn...
14592,bilateral pneumonia,20,https://www.cidrap.umn.edu/argentine-officials...,True,636.0,96.696997,3.303000,bilateral pneumonia_https://www.cidrap.umn.edu...
14635,ruched midi dress,19,https://www.loft.com/petites/petite-dresses/ca...,True,1602.0,92.680357,7.319643,ruched midi dress_https://www.loft.com/petites...


In [13]:
df['uid'][0]

'how to finish concrete_https://www.familyhandyman.com/project/how-to-finish-concrete/'

In [14]:
df.head(1)

Unnamed: 0,kw,rank,link,success,word_count,percent_human,percent_ai,uid
0,how to finish concrete,1,https://www.familyhandyman.com/project/how-to-...,True,1689.0,99.926081,0.073917,how to finish concrete_https://www.familyhandy...


In [15]:
# Load the combined keyword/link export and give it the same 'link' and 'uid'
# columns as `df` so the two frames can be compared.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for this
# path; later cells that reference `combined_df` cannot run until it is resolved.
combined_df = pd.read_csv('data/linked_kw_finalcombined.csv', header=0)
# Copy 'url' into 'link', then drop the original 'url' column.
combined_df['link'] = combined_df['url']
combined_df = combined_df.drop(['url'], axis=1)
# uid is "<kw>_<link>" (no rank component), formatted per row.
combined_df['uid'] = combined_df.apply(lambda row: f"{row['kw']}_{row['link']}", axis=1)
# Remove leading/trailing whitespace from the identifier.
combined_df['uid'] = combined_df['uid'].str.strip()

FileNotFoundError: [Errno 2] No such file or directory: 'data/linked_kw_finalcombined.csv'

In [None]:
kwf = pd.read_csv('data/linked_kw_final.csv', header=0)
len(kwf)
kwf.head()

In [None]:
combined_df[combined_df['link']=='https://www.amazon.com/Side-Table-Lamps/s?k=Side+Table+Lamps']

In [None]:
# Read the three per-site keyword exports.
df1 = pd.read_csv('data/webmd.csv')
df2 = pd.read_csv('data/thespruce.csv')
df3 = pd.read_csv('data/amazon.csv')

# Stack them with a fresh 0..n-1 index, drop the unused columns and rename the
# remaining ones to the names used for the join. Every step returns a new frame,
# so no explicit deep copies are needed.
drop_columns = ['Current position', 'Current URL inside', 'Updated']
new = (
    pd.concat([df1, df2, df3], ignore_index=True)
    .drop(columns=drop_columns)
    .rename(columns={'Current URL': 'url', 'Keyword': 'kw'})
)

# Identify each row by its keyword/url pair: "<kw>_<url>", whitespace-stripped.
new['uid'] = (new['kw'].map(str) + '_' + new['url'].map(str)).str.strip()

In [None]:
new.head(1)

In [None]:
new['uid'].nunique()

In [None]:
len(new)

In [None]:
show_df_info(new)

In [None]:
new.head(1)

In [None]:
new['uid'][0]

In [None]:
new[new['url']=='https://www.amazon.com/Side-Table-Lamps/s?k=Side+Table+Lamps']

In [None]:
new[new['uid']=='side table lamps_https://www.target.com/c/table-lamps-lighting-home-decor/-/N-56d7t']

In [None]:
new[new['kw']=='side table lamps']

In [None]:
# Outer-join the ranking data with the keyword exports on the shared uid, keeping
# rows that appear on only one side. Columns present in both frames get the
# default _x (from df) / _y (from new) suffixes.
df1 = df.merge(new, how='outer', on='uid')

In [None]:
df1.head(1)

In [None]:
df1.head(1)

In [None]:
show_df_info(df1)

In [None]:
# Keep the merged rows whose keyword is the same on both sides of the join,
# showing only the identifying columns.
result = df1.loc[
    df1['kw_x'].eq(df1['kw_y']),
    ['uid', 'url', 'link', 'kw_x', 'kw_y'],
]

In [None]:
df1.head()

In [None]:
df1['uid'][0]

In [None]:
result.head()

In [None]:
def count_unique(df, col):
    """
    Count how many rows carry each distinct value of `df[col]`.

    Args:
        df (pandas.DataFrame): The DataFrame to inspect.
        col (str): Name of the column whose values are counted.

    Returns:
        dict: value -> row count, ordered by count descending; values with the
        same count keep their order of first appearance. Missing values
        (NaN/None) are not included.
    """
    # One grouped pass instead of re-scanning the whole column per distinct value.
    # sort=False keeps the groups in order of first appearance.
    group_sizes = df.groupby(col, sort=False).size()
    # sorted() is stable, so ties stay in first-appearance order.
    ranked = sorted(group_sizes.items(), key=lambda item: item[1], reverse=True)
    return {value: int(count) for value, count in ranked}

In [None]:
uids = count_unique(df, 'uid')

In [None]:
#uids['trash bin*https://www.vocabulary.com/dictionary/trash%20bin']

In [None]:
len(df)

In [None]:
unique_links = get_unique_links(df)
len(unique_links)

In [None]:
nonunique_links = get_nonunique_links(df)
len(nonunique_links)

In [None]:
df_unique_links = df[~df['link'].isin(nonunique_links)]
len(df_unique_links)

In [None]:
df_unique_links.head(10).sort_values(by='link')

In [None]:
df_nonunique_links = df[df['link'].isin(nonunique_links)]
len(df_nonunique_links)


In [None]:
df_nonunique_links.head(10).sort_values(by='link')

In [None]:
most_kw = get_highest_kw_per_link(df_nonunique_links, nonunique_links)
len(most_kw)

In [None]:
# `most_kw` holds keywords, one per entry of `nonunique_links`, so testing the
# 'link' column against it can never match. Pair each duplicated link with its
# most frequent keyword and keep only the rows carrying that keyword.
top_kw_by_link = dict(zip(nonunique_links, most_kw))
df_nonunique_links_most_kw = df_nonunique_links[
    df_nonunique_links['kw'] == df_nonunique_links['link'].map(top_kw_by_link)
]
len(df_nonunique_links_most_kw)

In [None]:
len(df_unique_links) + len(df_nonunique_links)

In [None]:
len(df_unique_links) + 580

In [None]:
hi_kw = get_highest_kw_per_link(df, unique_links)

In [None]:
len(hi_kw)

In [None]:
show_df_info(df)

In [None]:
multiplotv_dark(df, 'percent_human', 0, 74)

In [None]:
multiplotv_dark(df_unique_links, 'percent_human', 0, 74)

In [None]:
multiplotv_dark(df_nonunique_links, 'percent_human', 0, 74)

In [None]:
multiplotv_dark(df_nonunique_links)

In [None]:
show_df_info(combined_df)

In [None]:
show_df_info(df)

In [None]:
#for row in df, combined_df:
#    if df['link']==combined_df['link']:
#        print('true')

In [None]:
cdf1 = combined_df[combined_df['link'].isin(df['link'])]
cdf1.head(1)

In [None]:
# Narrow the link-matched rows to those whose keyword also occurs in df, and
# report the size of that narrowed frame (not of cdf1, which the next cell shows).
cdf2 = cdf1[cdf1['kw'].isin(df['kw'])]
len(cdf2)

In [None]:
len(cdf1)

In [None]:
cdf3 = combined_df[combined_df['kw'].isin(df['kw'])]
len(cdf3)

In [None]:
cdf4 = cdf3[cdf3['link'].isin(df['link'])]
len(cdf4)

In [None]:
count_unique(df, 'link')

In [None]:
df[df['link']=='https://www.homepest.com/blog/are-wolf-spiders-poisonous']

In [None]:
print(df[df['link']=='https://www.homepest.com/blog/are-wolf-spiders-poisonous']['link'])

In [None]:
combined_df[combined_df['link']=='https://www.homepest.com/blog/are-wolf-spiders-poisonous\n']

In [None]:
new[new['url']=='https://www.homepest.com/blog/are-wolf-spiders-poisonous']