In [1]:
import inspect
import re
from typing import Dict, List, Optional

import pandas as pd

In [2]:
# Keep rendered frames short (10 rows) but allow up to 100 columns to be shown
pd.options.display.max_rows = 10
pd.options.display.max_columns = 100

# This notebook cleans the input dataset
* Removal of keywords containing non-ASCII characters
* Removal of offensive keywords and blocklisted URLs
* Dropping of unused columns

In [3]:
def remove_non_ascii_kw(df: pd.DataFrame) -> pd.DataFrame:
    """
    Removes rows from a DataFrame where the "Keyword" column contains non-ASCII characters.

    Values that are not strings (for example NaN for a missing keyword) hold no
    characters to inspect, so those rows are kept instead of raising a TypeError.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must have a "Keyword" column.

    Returns:
        pandas.DataFrame: A filtered DataFrame containing only the rows whose
        keyword is pure ASCII (or not a string). The input is not modified.
    """
    def _is_ascii(value) -> bool:
        # Only strings can contain non-ASCII characters; let anything else through
        if not isinstance(value, str):
            return True
        return all(ord(c) < 128 for c in value)

    df_start_len = len(df)

    # astype(bool) keeps the mask boolean even for an empty frame, where the
    # mapped result would otherwise be an object Series and not act as a row mask
    ascii_mask = df['Keyword'].map(_is_ascii).astype(bool)
    df = df[ascii_mask]

    # Print some statistics
    print(f"df start length: {df_start_len}")
    print(f"df final length: {len(df)}")
    print(f"records removed: {df_start_len - len(df)}")

    # Return the filtered DataFrame
    return df


In [4]:
def fix_nan(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replaces NaN values in the "SERP features" and "CPC" columns of a pandas DataFrame with appropriate default values.

    Missing "SERP features" become the string 'None' and missing "CPC" becomes 0.
    The replacement is done on a copy, so the caller's DataFrame is left untouched
    and no SettingWithCopyWarning is raised when a filtered slice is passed in.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must have "SERP features"
            and "CPC" columns.

    Returns:
        pandas.DataFrame: A new DataFrame with NaN values replaced.
    """
    # Count NaN values in "SERP features" and "CPC" columns
    serp_nan = df['SERP features'].isna().sum()
    cpc_nan = df['CPC'].isna().sum()

    # Work on a copy so the argument is never mutated
    df = df.copy()
    df['SERP features'] = df['SERP features'].fillna('None')
    df['CPC'] = df['CPC'].fillna(0)

    # Print some statistics
    print(f"SERP features NaN records replaced with 'None': {serp_nan}")
    print(f"CPC NaN records replaced with 0: {cpc_nan}")

    # Return the new DataFrame
    return df

In [5]:
def remove_blocklist_kw(df: pd.DataFrame, kw_blocklist: List[str]) -> pd.DataFrame:
    """
    Removes rows from a pandas DataFrame where the "Keyword" column contains any of the keywords in a blocklist.

    Matching is a case-insensitive *substring* test, so a short entry also
    removes longer words that contain it. Entries are treated as literal text:
    regex metacharacters in them are escaped. Rows with a missing keyword never
    match and are kept. An empty blocklist removes nothing.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must have a "Keyword" column.
        kw_blocklist (List[str]): A list of keywords to remove from the DataFrame.

    Returns:
        pandas.DataFrame: The DataFrame with the matching rows removed.
    """
    # Get the length of the input DataFrame
    df_start_len = len(df)

    # Escape each entry so it is matched literally; skip empty strings because
    # an empty alternative in the pattern would match every row
    escaped_terms = [re.escape(kw) for kw in kw_blocklist if kw]

    if escaped_terms:
        kw_pattern = '|'.join(escaped_terms)

        # na=False keeps the mask strictly boolean when a keyword is missing
        kw_mask = df['Keyword'].str.contains(kw_pattern, case=False, na=False, regex=True)

        # Use the mask to remove the specified keywords from the DataFrame
        df = df[~kw_mask]

    # Print some statistics
    print(f"df start length: {df_start_len}")
    print(f"df final length: {len(df)}")
    print(f"records removed: {df_start_len - len(df)}")

    # Return the filtered DataFrame
    return df

In [6]:
def remove_blocklist_url(df: pd.DataFrame, url_blocklist: List[str]) -> pd.DataFrame:
    """
    Removes rows from a pandas DataFrame where the "url" column contains any of the URL fragments in a blocklist.

    Matching is a case-insensitive substring test. Entries are treated as
    literal text, so the dots in something like "aws.amazon" match only a real
    dot rather than any character. Rows with a missing URL never match and are
    kept. An empty blocklist removes nothing.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must have a "url" column.
        url_blocklist (List[str]): A list of URL fragments to remove from the DataFrame.

    Returns:
        pandas.DataFrame: The DataFrame with the matching rows removed.
    """
    # Get the length of the input DataFrame
    df_start_len = len(df)

    # Escape each entry so it is matched literally; skip empty strings because
    # an empty alternative in the pattern would match every row
    escaped_fragments = [re.escape(fragment) for fragment in url_blocklist if fragment]

    if escaped_fragments:
        url_pattern = '|'.join(escaped_fragments)

        # na=False keeps the mask strictly boolean when a URL is missing
        url_mask = df['url'].str.contains(url_pattern, case=False, na=False, regex=True)

        # Use the mask to remove the specified URLs from the DataFrame
        df = df[~url_mask]

    # Print some statistics
    print(f"df start length: {df_start_len}")
    print(f"df final length: {len(df)}")
    print(f"records removed: {df_start_len - len(df)}")

    # Return the filtered DataFrame
    return df

In [7]:
def show_df_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prints a short size summary of a DataFrame and returns a per-column overview.

    The variable name shown in the printed summary is found by looking for a
    local variable in the caller's frame that is this exact object. If no such
    variable exists (for example ``show_df_info(df.head())``), the placeholder
    ``<unnamed>`` is printed instead of raising an IndexError.

    Args:
        df (pandas.DataFrame): The DataFrame to process.

    Returns:
        pandas.DataFrame: One row per column of the input, with the columns
        ``column_name``, ``dtype``, ``non_null_count`` and ``null_count``.
    """
    info_df = pd.DataFrame({
        'column_name': df.columns.to_list(),
        'dtype': df.dtypes.to_list(),
        'non_null_count': df.count().to_list(),
        'null_count': df.isnull().sum().to_list(),
    })

    # inspect.currentframe() may return None on interpreters without frame support
    current_frame = inspect.currentframe()
    caller_frame = current_frame.f_back if current_frame is not None else None
    caller_locals = caller_frame.f_locals if caller_frame is not None else {}

    # First caller variable bound to this very object, or a placeholder
    df_name = next(
        (var_name for var_name, var_val in list(caller_locals.items()) if var_val is df),
        '<unnamed>',
    )

    print(f"DataFrame '{df_name}' has {len(df)} rows and {len(df.columns)} columns.")
    print("Here is a summary of the column names, data types and null counts:")
    return info_df

In [8]:
def find_duplicate_uid(df):
    """
    Returns every row whose "uid" value occurs more than once in the DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must have a "uid" column.

    Returns:
        pandas.DataFrame: Only the rows that share their uid with another row.
    """
    # keep=False flags every member of a duplicate group, not just the repeats
    shares_uid = df.duplicated(subset=['uid'], keep=False)
    return df[shares_uid]

In [9]:
def check_for_duplicate_uid(df: pd.DataFrame) -> bool:
    """
    Reports whether any "uid" value appears more than once in the DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame to check. Must have a "uid" column.

    Returns:
        bool: True if at least one uid is duplicated, otherwise False.
    """
    # .any() yields a numpy boolean, for which `is True` is never true;
    # convert to a plain bool so both the branch and the return value are right
    has_duplicates = bool(df.duplicated(subset=['uid']).any())
    if has_duplicates:
        print('*** Dataset has duplicate uid ***')
    else:
        print('Dataset has no duplicate uid')
    return has_duplicates

In [11]:
def count_unique_urls(df):
    """
    Adds a "url_count" column holding, for each row, how many rows share its URL.

    The column is added to the DataFrame that was passed in (in-place), and that
    same object is returned. Rows with a missing URL get a missing count rather
    than raising a KeyError.

    Args:
        df (pandas.DataFrame): The DataFrame to process. Must have a "url" column.

    Returns:
        pandas.DataFrame: The same DataFrame object, now with a "url_count" column.
    """
    # Occurrences of each distinct URL (missing URLs are not counted)
    url_counts = df['url'].value_counts()

    # Vectorised lookup of each row's URL in the counts table
    df['url_count'] = df['url'].map(url_counts)
    return df

In [12]:
# Read the three per-site keyword exports
df1 = pd.read_csv('data/webmd.csv')
df2 = pd.read_csv('data/thespruce.csv')
df3 = pd.read_csv('data/amazon.csv')

# Columns removed up front (the intro lists "dropping of columns not used")
drop_columns = ['Current position', 'Current URL inside', 'Updated']

# Stack the exports under one fresh 0..n-1 index, drop the unneeded columns
# and shorten the URL column name
df = (
    pd.concat([df1, df2, df3], ignore_index=True)
    .drop(columns=drop_columns)
    .rename(columns={'Current URL': 'url'})
)

# uid = keyword and URL joined with '*'; used below to check for duplicate rows
df['uid'] = df['Keyword'] + '*' + df['url']

In [13]:
# Number of distinct URLs in the combined data (a missing URL would count once)
df['url'].nunique(dropna=False)

41220

In [14]:
# Sanity check: each Keyword + url pair (the uid) should appear only once
check_for_duplicate_uid(df)

Dataset has no duplicate uid


False

In [15]:
# Add the per-row 'url_count' column; bind the returned frame explicitly rather
# than relying on the function having modified df as a side effect
df = count_unique_urls(df)
len(df)

90000

In [34]:
# Show the frame ordered by URL, descending (display only; df itself is not reordered)
df.sort_values(by='url', ascending=False)

Unnamed: 0,Keyword,SERP features,Volume,KD,CPC,Traffic,url,uid,url_count
7470,urinary tract infection treatment,"Featured snippet, People also ask, Sitelinks, ...",44000,83,0.83,834,https://www.webmd.com/women/your-guide-urinary...,urinary tract infection treatment*https://www....,29
3583,what causes a uti in a woman,"Featured snippet, People also ask, Sitelinks",21000,86,0.00,1600,https://www.webmd.com/women/your-guide-urinary...,what causes a uti in a woman*https://www.webmd...,29
8647,symptoms of urinary tract infection,"Featured snippet, Thumbnail, People also ask, ...",11000,86,1.27,731,https://www.webmd.com/women/your-guide-urinary...,symptoms of urinary tract infection*https://ww...,29
17244,symptoms of uti,"Featured snippet, Thumbnail, People also ask, ...",29000,86,0.69,369,https://www.webmd.com/women/your-guide-urinary...,symptoms of uti*https://www.webmd.com/women/yo...,29
6817,bladder infection symptoms,"Featured snippet, People also ask, Sitelinks",71000,85,0.77,905,https://www.webmd.com/women/your-guide-urinary...,bladder infection symptoms*https://www.webmd.c...,29
...,...,...,...,...,...,...,...,...,...
88874,dco,"Thumbnail, People also ask, Knowledge panel, S...",5600,14,8.75,1160,https://advertising.amazon.com/library/guides/...,dco*https://advertising.amazon.com/library/gui...,1
72415,marketing the brand,"Featured snippet, People also ask, Sitelinks",5700,34,0.00,2081,https://advertising.amazon.com/library/guides/...,marketing the brand*https://advertising.amazon...,2
78218,brand marketing,"People also ask, Sitelinks",3800,22,7.73,1606,https://advertising.amazon.com/library/guides/...,brand marketing*https://advertising.amazon.com...,2
74935,monadesa,"Sitelinks, Image pack",34000,0,0.00,1838,http://www.amazon.com/Cyril-Steffan/e/B01IBYO6...,monadesa*http://www.amazon.com/Cyril-Steffan/e...,1


In [16]:
# Inspect the last 10 rows of the combined frame
df.tail(10)

Unnamed: 0,Keyword,SERP features,Volume,KD,CPC,Traffic,url,uid,url_count
89990,did matthew perry had stroke,"Featured snippet, People also ask, Image pack",3100,6,,1129,https://alexaanswers.amazon.com/question/0zrQ3...,did matthew perry had stroke*https://alexaansw...,1
89991,jodorowsky's dune,"People also ask, Knowledge panel, Videos",21000,54,0.42,1129,https://www.amazon.com/Jodorowskys-Dune-Chris-...,jodorowsky's dune*https://www.amazon.com/Jodor...,1
89992,avatar 4k,"People also ask, Sitelinks, Video preview",3600,5,0.27,1129,https://www.amazon.com/avatar-4k-ultra-hd-blu-...,avatar 4k*https://www.amazon.com/avatar-4k-ult...,1
89993,napkin,"Image pack, People also ask, Shopping results,...",23000,36,2.44,1128,https://www.amazon.com/napkins/s?k=napkins,napkin*https://www.amazon.com/napkins/s?k=napkins,1
89994,sharp portable air conditioner,"People also ask, Image pack, Video preview, Si...",4200,1,0.77,1128,https://www.amazon.com/sharp-portable-air-cond...,sharp portable air conditioner*https://www.ama...,1
89995,replacement tent poles,"People also ask, Sitelinks, Shopping results, ...",3700,11,0.46,1128,https://www.amazon.com/replacement-tent-poles-...,replacement tent poles*https://www.amazon.com/...,1
89996,adele 25,"Videos, Knowledge panel, People also ask, Site...",17000,42,1.91,1128,https://www.amazon.com/25-Adele/dp/B00L98V4UW,adele 25*https://www.amazon.com/25-Adele/dp/B0...,1
89997,nd grip camera strap,"Shopping results, Thumbnail, Image pack, Sitel...",3000,6,,1128,https://www.amazon.com/Camera-Hand-Strap-Stabi...,nd grip camera strap*https://www.amazon.com/Ca...,3
89998,silk bandana,"Shopping results, Thumbnail, Image pack, Peopl...",4100,6,1.12,1128,https://www.amazon.com/silk-bandana/s?k=silk+b...,silk bandana*https://www.amazon.com/silk-banda...,1
89999,bar furniture,"Shopping results, Thumbnail, Sitelinks",7800,16,1.39,1128,https://www.amazon.com/Home-Bar-Furniture/b?ie...,bar furniture*https://www.amazon.com/Home-Bar-...,3


In [17]:
# Peek at the first row before cleaning
df.head(1)

Unnamed: 0,Keyword,SERP features,Volume,KD,CPC,Traffic,url,uid,url_count
0,horny goat weed,Knowledge panel,76000,61,0.37,70591,https://www.webmd.com/vitamins/ai/ingredientmo...,horny goat weed*https://www.webmd.com/vitamins...,10


In [18]:
# Column dtypes and null counts before cleaning
show_df_info(df)

DataFrame 'df' has 90000 rows and 9 columns.
Here is a summary of the column names, data types and null counts:


Unnamed: 0,column_name,dtype,non_null_count,null_count
0,Keyword,object,90000,0
1,SERP features,object,89313,687
2,Volume,int64,90000,0
3,KD,int64,90000,0
4,CPC,float64,76574,13426
5,Traffic,int64,90000,0
6,url,object,90000,0
7,uid,object,90000,0
8,url_count,int64,90000,0


In [19]:
# Cleaning step: drop rows whose Keyword contains non-ASCII characters
df = remove_non_ascii_kw(df)

df start length: 90000
df final length: 89917
records removed: 83


In [20]:
# Keyword fragments to filter out. They are matched as case-insensitive
# substrings, so leading/trailing spaces inside an entry are significant.
kw_block_list = [
    "penis",
    "vagina",
    "sex",
    "slut",
    "dick",
    "fuck",
    "milf",
    "cum ",
    "cunt",
    "rape",
    "porn",
    "boner",
    "cock",
    "whore",
    "bitch",
    "futa",
    "wet dreams",
    "cowgirl position",
    "reverse cowgirl",
    "girl squirt",
    "girls squirt",
    "squirting-orgasm",
    " squirt pee",
    "girl's body",
    "dildo",
    "butt-plug",
    "good head",
    "smells bad down there",
    "xxx",
    "wormwood",
    "fetish",
    "vibrator",
    "sandalias de mujer",
]

In [21]:
# Cleaning step: drop rows whose Keyword contains any blocklist entry
df = remove_blocklist_kw(df, kw_block_list)

df start length: 89917
df final length: 88842
records removed: 1075


In [22]:
# URL fragments to filter out (matched as case-insensitive substrings of 'url')
url_blocklist = [
    "aws.amazon",
    "www.amazon.com/kindle-dbs",
    "squirting-orgasm",
]

In [23]:
# Cleaning step: drop rows whose url contains any blocklist entry
df = remove_blocklist_url(df, url_blocklist)

df start length: 88842
df final length: 88704
records removed: 138


In [24]:
# Cleaning step: fill missing 'SERP features' with 'None' and missing CPC with 0
df = fix_nan(df)

SERP features NaN records replaced with 'None': 602
CPC NaN records replaced with 0: 13180


In [25]:
# Peek at the first row after cleaning
df.head(1)

Unnamed: 0,Keyword,SERP features,Volume,KD,CPC,Traffic,url,uid,url_count
0,horny goat weed,Knowledge panel,76000,61,0.37,70591,https://www.webmd.com/vitamins/ai/ingredientmo...,horny goat weed*https://www.webmd.com/vitamins...,10


In [26]:
# Re-check dtypes and null counts after cleaning
show_df_info(df)

DataFrame 'df' has 88704 rows and 9 columns.
Here is a summary of the column names, data types and null counts:


Unnamed: 0,column_name,dtype,non_null_count,null_count
0,Keyword,object,88704,0
1,SERP features,object,88704,0
2,Volume,int64,88704,0
3,KD,int64,88704,0
4,CPC,float64,88704,0
5,Traffic,int64,88704,0
6,url,object,88704,0
7,uid,object,88704,0
8,url_count,int64,88704,0


In [27]:
# Persist the full cleaned dataset; index=False leaves the row index out of the file
df.to_csv('data/combined.csv', index=False)

In [28]:
# Draw a 10,000-row random sample; the fixed random_state makes the draw repeatable
df_10_000 = df.sample(n=10000, random_state=42)

In [29]:
# Save the 10,000-row sample without the row index
df_10_000.to_csv('data/kw_10_000.csv', index=False)

In [30]:
# Sub-sample 1,000 rows from the 10,000-row sample, so the small set is a subset of it
df_1_000 = df_10_000.sample(n=1000, random_state=42)

In [31]:
# Save the 1,000-row sample without the row index
df_1_000.to_csv('data/kw_1_000.csv', index=False)

In [32]:
# Distinct URLs present in the 1,000-row sample
df_1_000['url'].nunique(dropna=False)

966

In [33]:
# Distinct URLs present in the 10,000-row sample
df_10_000['url'].nunique(dropna=False)

7989