In [1]:
import inspect
import re
from typing import List, Dict, Optional

import pandas as pd

In [2]:
# Limit how much of a DataFrame the notebook renders: at most 10 rows and 100 columns.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 100)

# This notebook is for cleaning the input dataset
* Removal of keywords (kw) that contain non-ASCII characters
* Removal of offensive keywords and blocklisted URLs
* Dropping of columns that are not used

In [3]:
def remove_non_ascii_kw(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows from a DataFrame where the "kw" column contains non-ASCII characters.

    Values in "kw" that are not strings (for example NaN for a missing keyword) have no
    characters to test; they are kept rather than raising an error.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must contain a "kw" column.

    Returns:
        pandas.DataFrame: The rows of the input whose keyword passed the check.
            The input DataFrame itself is not modified.

    Raises:
        KeyError: If the DataFrame has no "kw" column.
    """
    df_start_len = len(df)

    # Only string values are tested; anything else counts as "no non-ASCII characters".
    # The explicit astype(bool) keeps the mask boolean even for an empty frame, where
    # an object-dtype mask is not reliably interpreted as a row filter.
    ascii_mask = df['kw'].map(
        lambda kw: not isinstance(kw, str) or kw.isascii()
    ).astype(bool)
    df = df[ascii_mask]

    # Print some statistics
    print(f"df start length: {df_start_len}")
    print(f"df final length: {len(df)}")
    print(f"records removed: {df_start_len - len(df)}")

    # Return the filtered DataFrame
    return df


In [4]:
def fix_nan(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replaces NaN values in the "SERP features" and "CPC" columns of a pandas DataFrame with default values.

    "SERP features" NaNs become the string 'None'; "CPC" NaNs become 0. The result is a
    new DataFrame: the input is left untouched, so the function can safely be called on
    a frame that is itself a filtered slice of another frame.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must contain the
            "SERP features" and "CPC" columns.

    Returns:
        pandas.DataFrame: A copy of the input with the NaN values replaced.

    Raises:
        KeyError: If either column is missing.
    """
    # Count NaN values up front so the report reflects what is about to be replaced
    serp_nan = df['SERP features'].isna().sum()
    cpc_nan = df['CPC'].isna().sum()

    # Per-column fill values; DataFrame.fillna returns a new frame instead of
    # assigning into the caller's object.
    df = df.fillna({'SERP features': 'None', 'CPC': 0})

    # Print some statistics
    print(f"SERP features NaN records replaced with 'None': {serp_nan}")
    print(f"CPC NaN records replaced with 0: {cpc_nan}")

    # Return the new DataFrame
    return df

In [5]:
def remove_blocklist_kw(df: pd.DataFrame, kw_blocklist: List[str]) -> pd.DataFrame:
    """
    Removes rows from a pandas DataFrame where the "kw" column contains any of the terms in a blocklist.

    Matching is a case-insensitive substring search: a term matches whenever it occurs
    anywhere inside the keyword, including inside a longer word (a term "cat" also
    matches "concatenate"). Terms are matched literally; regex metacharacters in a
    term have no special meaning.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must contain a "kw" column.
        kw_blocklist (List[str]): Terms to search for. Empty strings are ignored, and
            an empty list removes nothing.

    Returns:
        pandas.DataFrame: The rows whose keyword matched no blocklist term. Rows with a
            missing keyword are kept. The input DataFrame is not modified.

    Raises:
        KeyError: If the DataFrame has no "kw" column.
    """
    # Get the length of the input DataFrame
    df_start_len = len(df)

    # Escape every term so it is matched literally. Empty terms are skipped because an
    # empty alternative matches every string and would wipe out the whole frame.
    escaped_terms = [re.escape(term) for term in kw_blocklist if term]

    if escaped_terms:
        kw_pattern = '|'.join(escaped_terms)

        # na=False: a missing keyword counts as "no match" instead of producing NaN in
        # the mask, which could not be inverted below.
        kw_mask = df['kw'].str.contains(kw_pattern, case=False, regex=True, na=False).astype(bool)

        # Keep only the rows that did not match
        df = df[~kw_mask]

    # Print some statistics
    print(f"df start length: {df_start_len}")
    print(f"df final length: {len(df)}")
    print(f"records removed: {df_start_len - len(df)}")

    # Return the filtered DataFrame
    return df

In [6]:
def remove_blocklist_url(df: pd.DataFrame, url_blocklist: List[str]) -> pd.DataFrame:
    """
    Removes rows from a pandas DataFrame where the "url" column contains any of the fragments in a blocklist.

    Matching is a case-insensitive substring search. Fragments are matched literally,
    so the "." in an entry such as "aws.amazon" only matches a real dot.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must contain a "url" column.
        url_blocklist (List[str]): URL fragments to search for. Empty strings are
            ignored, and an empty list removes nothing.

    Returns:
        pandas.DataFrame: The rows whose URL matched no blocklist fragment. Rows with a
            missing URL are kept. The input DataFrame is not modified.

    Raises:
        KeyError: If the DataFrame has no "url" column.
    """
    # Get the length of the input DataFrame
    df_start_len = len(df)

    # Escape every fragment so regex metacharacters (".", "?", "+", ...) are literal.
    # Empty fragments are skipped: an empty alternative matches every string.
    escaped_fragments = [re.escape(fragment) for fragment in url_blocklist if fragment]

    if escaped_fragments:
        url_pattern = '|'.join(escaped_fragments)

        # na=False: a missing URL counts as "no match" instead of producing NaN in the
        # mask, which could not be inverted below.
        url_mask = df['url'].str.contains(url_pattern, case=False, regex=True, na=False).astype(bool)

        # Keep only the rows that did not match
        df = df[~url_mask]

    # Print some statistics
    print(f"df start length: {df_start_len}")
    print(f"df final length: {len(df)}")
    print(f"records removed: {df_start_len - len(df)}")

    # Return the filtered DataFrame
    return df

In [7]:
def show_df_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prints a one-line size summary of a DataFrame and returns a per-column overview.

    The printed summary names the DataFrame by looking for a variable in the caller's
    scope that refers to the same object. When no such variable exists (for example
    the frame was passed as an expression like ``df.head()``), the placeholder
    ``<unnamed>`` is printed instead.

    Args:
        df (pandas.DataFrame): The DataFrame to describe.

    Returns:
        pandas.DataFrame: One row per input column with the columns "column_name",
            "dtype", "non_null_count" and "null_count".
    """
    info_df = pd.DataFrame({
        'column_name': df.columns.to_list(),
        'dtype': df.dtypes.to_list(),
        'non_null_count': df.count().to_list(),
        'null_count': df.isnull().sum().to_list(),
    })

    # Best-effort lookup of the caller's variable name for this object. Both the
    # current frame and its parent can be unavailable, and the object may not be bound
    # to any name in the caller, so every step falls back to the placeholder.
    df_name = '<unnamed>'
    current_frame = inspect.currentframe()
    caller_frame = current_frame.f_back if current_frame is not None else None
    if caller_frame is not None:
        df_name = next(
            (var_name for var_name, var_val in caller_frame.f_locals.items() if var_val is df),
            df_name,
        )
    # Drop the frame references explicitly to avoid keeping reference cycles alive.
    del current_frame, caller_frame

    print(f"DataFrame '{df_name}' has {len(df)} rows and {len(df.columns)} columns.")
    print("Here is a summary of the column names, data types and null counts:")
    return info_df

In [8]:
def find_duplicate_uid(df):
    """
    Returns the rows of a DataFrame whose "uid" value occurs more than once.

    Every member of a duplicate group is included, the first occurrence as well as
    the repeats.

    Args:
        df (pandas.DataFrame): The DataFrame to inspect. Must contain a "uid" column.

    Returns:
        pandas.DataFrame: The subset of rows that share their uid with another row.
    """
    # keep=False flags all occurrences of a repeated uid, not only the later ones
    return df.loc[df.duplicated(subset='uid', keep=False)]

In [9]:
def check_for_duplicate_uid(df: pd.DataFrame) -> bool:
    """
    Reports whether any "uid" value occurs more than once in a DataFrame.

    Prints a one-line verdict and returns the same result as a plain Python bool.

    Args:
        df (pandas.DataFrame): The DataFrame to inspect. Must contain a "uid" column.

    Returns:
        bool: True if at least one uid is duplicated, False otherwise.
    """
    # Convert to a built-in bool: pandas returns a NumPy boolean here, and an identity
    # comparison of that value against True never succeeds, which would hide duplicates.
    has_duplicates = bool(df.duplicated(subset=['uid']).any())
    if has_duplicates:
        print('*** Dataset has duplicate uid ***')
    else:
        print('Dataset has no duplicate uid')
    return has_duplicates

In [10]:
def group_by_url_count(df):
    """
    Counts how many rows each URL has and lists the URLs from most to least frequent.

    Args:
        df (pandas.DataFrame): The DataFrame to summarise. Must contain a "url" column.

    Returns:
        pandas.DataFrame: Two columns, "url" and "url_count", sorted by "url_count" in
            descending order and indexed 0..n-1.
    """
    return (
        df.groupby('url')
        .size()                                  # rows per URL
        .reset_index(name='url_count')           # back to a two-column frame
        .sort_values(by='url_count', ascending=False)
        .reset_index(drop=True)                  # renumber rows after sorting
    )

In [11]:
def add_url_counts(df, url_counts):
    """
    Adds a "url_count" column to a DataFrame by looking each URL up in a counts table.

    The column is written onto the DataFrame that was passed in, and that same object
    is returned. URLs that do not appear in the counts table get NaN.

    Args:
        df (pandas.DataFrame): The DataFrame to annotate. Must contain a "url" column.
        url_counts (pandas.DataFrame): Lookup table with "url" and "url_count" columns.

    Returns:
        pandas.DataFrame: The input DataFrame, now carrying a "url_count" column.
    """
    # url -> count lookup built from the two columns of the counts table
    count_by_url = {
        url: count
        for url, count in zip(url_counts['url'], url_counts['url_count'])
    }

    df['url_count'] = df['url'].map(count_by_url)
    return df

In [12]:
def group_by_kw_count(df):
    """
    Counts how many rows each keyword has and lists the keywords from most to least frequent.

    Args:
        df (pandas.DataFrame): The DataFrame to summarise. Must contain a "kw" column.

    Returns:
        pandas.DataFrame: Two columns, "kw" and "kw_count", sorted by "kw_count" in
            descending order and indexed 0..n-1.
    """
    return (
        df.groupby('kw')
        .size()                                  # rows per keyword
        .reset_index(name='kw_count')            # back to a two-column frame
        .sort_values(by='kw_count', ascending=False)
        .reset_index(drop=True)                  # renumber rows after sorting
    )

In [13]:
# Load the three per-site keyword exports
df1 = pd.read_csv('data/webmd.csv')
df2 = pd.read_csv('data/thespruce.csv')
df3 = pd.read_csv('data/amazon.csv')

# Columns that are not used further on
drop_columns = ['Current position', 'Current URL inside', 'Updated']

# Stack the exports into one frame with a fresh 0..n-1 index, drop the unused
# columns and shorten the two column names the rest of the notebook relies on.
df = (
    pd.concat([df1, df2, df3], ignore_index=True)
    .drop(columns=drop_columns)
    .rename(columns={'Current URL': 'url', 'Keyword': 'kw'})
)

# Composite identifier: keyword and URL joined by '*'
df['uid'] = df['kw'] + '*' + df['url']

In [46]:
len(df)

88704

In [41]:
len(df['url'].unique())

40750

In [49]:
df['url'].nunique()

40750

In [45]:
nonunique_links = df['url'].duplicated().sum()
print(nonunique_links)

47954


In [43]:
# Number of rows per distinct URL
link_counts = df['url'].value_counts()
# NOTE(review): .unique() collapses repeated count values before summing, so this is
# the sum of the *distinct* per-URL counts, not the total number of links (that would
# be link_counts.sum()). Confirm which of the two was intended.
total_link_count = link_counts.unique().sum()
print(total_link_count)

1779


In [15]:
check_for_duplicate_uid(df)

Dataset has no duplicate uid


False

In [16]:
df.head(1)

Unnamed: 0,kw,SERP features,Volume,KD,CPC,Traffic,url,uid
0,horny goat weed,Knowledge panel,76000,61,0.37,70591,https://www.webmd.com/vitamins/ai/ingredientmo...,horny goat weed*https://www.webmd.com/vitamins...


In [17]:
url_counts = group_by_url_count(df)

In [18]:
url_counts.head()

Unnamed: 0,url,url_count
0,https://www.webmd.com/covid/coronavirus-incuba...,118
1,https://www.thespruce.com/budget-friendly-kids...,84
2,https://www.webmd.com/skin-problems-and-treatm...,79
3,https://www.thespruce.com/best-outdoor-solar-l...,79
4,https://www.webmd.com/allergies/ss/slideshow-p...,68


In [19]:
df = add_url_counts(df, url_counts)

In [20]:
df.sort_values(by='url_count', ascending=False).head(2)

Unnamed: 0,kw,SERP features,Volume,KD,CPC,Traffic,url,uid,url_count
16514,what's the incubation period for covid,"Featured snippet, People also ask, Sitelinks, ...",1100,55,,387,https://www.webmd.com/covid/coronavirus-incuba...,what's the incubation period for covid*https:/...,118
28728,contagious period covid,"Featured snippet, People also ask, Sitelinks, ...",2100,51,,200,https://www.webmd.com/covid/coronavirus-incuba...,contagious period covid*https://www.webmd.com/...,118


In [21]:
show_df_info(df)

DataFrame 'df' has 90000 rows and 9 columns.
Here is a summary of the column names, data types and null counts:


Unnamed: 0,column_name,dtype,non_null_count,null_count
0,kw,object,90000,0
1,SERP features,object,89313,687
2,Volume,int64,90000,0
3,KD,int64,90000,0
4,CPC,float64,76574,13426
5,Traffic,int64,90000,0
6,url,object,90000,0
7,uid,object,90000,0
8,url_count,int64,90000,0


In [22]:
df = remove_non_ascii_kw(df)

df start length: 90000
df final length: 89917
records removed: 83


In [23]:
kw_block_list = ["penis", "vagina", "sex", "slut", "dick", "fuck", "milf", "cum ", "cunt", "rape", "porn", "boner", "cock", "whore", "bitch", "futa", "wet dreams", "cowgirl position",  "reverse cowgirl", "girl squirt", "girls squirt", "squirting-orgasm", " squirt pee", "girl's body", "dildo", "butt-plug", "good head", "smells bad down there", "xxx", "wormwood", "fetish",  "vibrator", "sandalias de mujer"]

In [24]:
df = remove_blocklist_kw(df, kw_block_list)

df start length: 89917
df final length: 88842
records removed: 1075


In [25]:
url_blocklist = ["aws.amazon", "www.amazon.com/kindle-dbs", "squirting-orgasm"]

In [26]:
df = remove_blocklist_url(df, url_blocklist)

df start length: 88842
df final length: 88704
records removed: 138


In [27]:
df = fix_nan(df)

SERP features NaN records replaced with 'None': 602
CPC NaN records replaced with 0: 13180


In [28]:
df.head(1)

Unnamed: 0,kw,SERP features,Volume,KD,CPC,Traffic,url,uid,url_count
0,horny goat weed,Knowledge panel,76000,61,0.37,70591,https://www.webmd.com/vitamins/ai/ingredientmo...,horny goat weed*https://www.webmd.com/vitamins...,10


In [29]:
show_df_info(df)

DataFrame 'df' has 88704 rows and 9 columns.
Here is a summary of the column names, data types and null counts:


Unnamed: 0,column_name,dtype,non_null_count,null_count
0,kw,object,88704,0
1,SERP features,object,88704,0
2,Volume,int64,88704,0
3,KD,int64,88704,0
4,CPC,float64,88704,0
5,Traffic,int64,88704,0
6,url,object,88704,0
7,uid,object,88704,0
8,url_count,int64,88704,0


In [30]:
df.to_csv('data/combined.csv', index=False)

In [31]:
# Draw 10,000 rows at random; the fixed random_state makes the sample repeatable
df_10_000 = df.sample(n=10000, random_state=42)

In [32]:
df_10_000.to_csv('data/kw_10_000.csv', index=False)

In [33]:
# Sub-sample 1,000 rows from the 10,000-row sample (not from the full frame), so the
# small set is a subset of the larger one
df_1_000 = df_10_000.sample(n=1000, random_state=42)

In [34]:
df_1_000.to_csv('data/kw_1_000.csv', index=False)

In [35]:
len(df_1_000['url'].unique())

966

In [36]:
len(df_10_000['url'].unique())

7989

In [52]:
nonunique_links = df_1_000['url'].duplicated().sum()
print(nonunique_links)

34


In [55]:
unique_links = df_1_000['url'].nunique()
unique_links

966