In [1]:
import pandas as pd
from urllib.parse import urlparse

In [2]:
# Load the raw ("messy") dataset; the file is semicolon-separated.
df = pd.read_csv('../../../data/dark_patterns_messy.csv', sep=";")

## Preprocessing

In [3]:
# Derive the host name of every page URL; str() lets urlparse accept
# non-string cells as well.
df['Domain'] = df['Website Page'].apply(lambda url: urlparse(str(url)).hostname)

# Order rows by domain, renumber the index, and replace embedded newlines
# in string cells with a single space.
df = (
    df.sort_values(by=['Domain'])
      .reset_index(drop=True)
      .replace(r'\n', ' ', regex=True)
)

In [4]:
# Inspect the raw page-location labels to spot inconsistent spellings/casing.
df['Where in website'].value_counts()

Product detail         1037
Product Detail          195
Checkout Process        106
Cart Detail              35
Product Page             35
Cart Page                28
Other page               28
Cart page                15
Product page              4
Checkout page             2
Terms of trade page       1
Registration Page         1
Name: Where in website, dtype: int64

In [5]:
# Trim leading and trailing whitespace from the pattern text.
df['Pattern String'] = df['Pattern String'].str.strip()

In [6]:
# Collapse spelling/casing variants of the page location into one canonical
# label each (exact-match replacement, applied in a single pass).
where_aliases = {
    "Product detail": "Product Detail",
    "Product Page": "Product Detail",
    "Product page": "Product Detail",
    "Checkout page": "Checkout Process",
    "Cart Page": "Cart Detail",
    "Cart page": "Cart Detail",
}
df['Where in website'] = df['Where in website'].replace(where_aliases)

In [7]:
# Re-count the page-location labels to confirm the variants were merged.
df['Where in website'].value_counts()

Product Detail         1271
Checkout Process        108
Cart Detail              78
Other page               28
Terms of trade page       1
Registration Page         1
Name: Where in website, dtype: int64

In [8]:
# Keep only rows recorded on one of the three main page types.
pages = ["Product Detail", "Checkout Process", "Cart Detail"]
on_kept_page = df['Where in website'].isin(pages)
df = df.loc[on_kept_page]

## Check for missing data
Some dark patterns have no textual data and are described only in the Comment column.

In [9]:
# Rows where both the pattern text and the comment are missing.
no_text = df['Pattern String'].isnull()
no_comment = df['Comment'].isnull()
df[no_text & no_comment]

Unnamed: 0,Pattern String,Comment,Pattern Category,Pattern Type,Where in website,Deceptive,Website Page,Domain


## Replace missing pattern strings with an empty string

In [10]:
# Fill missing pattern text with '' (only that column is touched), then
# show the resulting frame dimensions.
df = df.fillna({'Pattern String': ''})
df.shape

(1457, 8)

In [11]:
# List the distinct pattern categories to check for inconsistent labels.
df['Pattern Category'].unique()

array(['Social Proof', 'Misdirection', 'Forced Action', 'Urgency',
       'Scarcity', 'Obstruction', 'Sneaking'], dtype=object)

In [12]:
# List the distinct pattern types to check for inconsistent labels.
df['Pattern Type'].unique()

array(['Activity Message', 'Pressured selling', 'Forced Enrollment',
       'Limited-time Message', 'Visual Interference', 'Low-stock Message',
       'Trick Questions', 'Pressured Selling', 'Countdown Timer',
       'Testimonials of Uncertain Origin', 'Hard to Cancel',
       'High-demand Message', 'Limited-Time Message', 'Sneak into Basket'],
      dtype=object)

In [13]:
# Unify casing variants of the pattern type labels (exact-match replacement).
pattern_type_aliases = {
    "Limited-Time Message": "Limited-time Message",
    "Hard to cancel": "Hard to Cancel",
    "Pressured selling": "Pressured Selling",
}
df['Pattern Type'] = df['Pattern Type'].replace(pattern_type_aliases)

## Check if all Pattern Types are assigned to correct Pattern Categories

In [14]:
# Pattern types recorded under the 'Sneaking' category.
df.loc[df['Pattern Category'] == 'Sneaking', 'Pattern Type'].unique()

array(['Sneak into Basket'], dtype=object)

In [15]:
# Pattern types recorded under the 'Urgency' category.
df.loc[df['Pattern Category'] == 'Urgency', 'Pattern Type'].unique()

array(['Limited-time Message', 'Countdown Timer'], dtype=object)

In [16]:
# Pattern types recorded under the 'Misdirection' category.
df.loc[df['Pattern Category'] == 'Misdirection', 'Pattern Type'].unique()

array(['Pressured Selling', 'Visual Interference', 'Trick Questions'],
      dtype=object)

In [17]:
# Pattern types recorded under the 'Social Proof' category.
df.loc[df['Pattern Category'] == 'Social Proof', 'Pattern Type'].unique()

array(['Activity Message', 'Testimonials of Uncertain Origin'],
      dtype=object)

In [18]:
# Pattern types recorded under the 'Urgency' category.
# NOTE(review): this repeats the 'Urgency' check already run a few cells earlier.
df.loc[df['Pattern Category'] == 'Urgency', 'Pattern Type'].unique()

array(['Limited-time Message', 'Countdown Timer'], dtype=object)

In [19]:
# Pattern types recorded under the 'Scarcity' category.
df.loc[df['Pattern Category'] == 'Scarcity', 'Pattern Type'].unique()

array(['Low-stock Message', 'High-demand Message'], dtype=object)

In [20]:
# Pattern types recorded under the 'Obstruction' category.
df.loc[df['Pattern Category'] == 'Obstruction', 'Pattern Type'].unique()

array(['Hard to Cancel'], dtype=object)

In [21]:
# Pattern types recorded under the 'Forced Action' category.
df.loc[df['Pattern Category'] == 'Forced Action', 'Pattern Type'].unique()

array(['Forced Enrollment'], dtype=object)

## Remove duplicate entries

In [22]:
# Build a normalised comparison key: lower-case the pattern text and collapse
# every run of digits (optionally followed by separators: space , . / : -)
# into the token NUMBER, so strings that differ only in numbers compare equal.
#
# The hyphen is placed last in the character class on purpose. Written as
# `[ ,.-/:]`, the `.-/` part is parsed as the range '.'..'/' and a literal
# '-' is never matched as a separator.
# NOTE: with '-' now matched, a few more rows may be treated as duplicates,
# so row counts recorded from earlier runs may change on re-run.
df['pattern_string_lower_case'] = df['Pattern String'].astype(str).str.lower()
df['pattern_string_lower_case'] = df['pattern_string_lower_case'].replace(
    r"([0-9]+[ ,./:-]*)+", "NUMBER", regex=True)

# Sort by domain and key, then keep the last row of every duplicate group.
# Plain assignment is used instead of inplace=True.
df = df.sort_values(['Domain', 'pattern_string_lower_case'], ascending=[True, True])
df = df.drop_duplicates(
    subset=['pattern_string_lower_case',
            'Comment',
            'Pattern Category',
            'Pattern Type',
            'Where in website',
            'Domain'],
    keep='last')

In [23]:
# Frame dimensions after dropping duplicates (includes the helper key column).
df.shape

(1430, 9)

## Manual lookup for duplicates

In [24]:
# Lift pandas' display limits so whole tables are printed untruncated.
for option in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(option, None)

### Duplicates only by pattern string in lower case

Manually checked and deleted from the messy dataset.

In [25]:
# Lookup used for the manual duplicate check: rows sharing the same normalised
# pattern string, domain and page location. Left disabled; un-comment to
# repeat the manual review.
#df[df.duplicated(subset=['pattern_string_lower_case','Domain','Where in website'], keep=False)]

In [26]:
# Frame dimensions at this point (no rows are removed by the lookup above).
df.shape

(1430, 9)

### Duplicates caused by multiple records of Notifikuj

In [27]:
# Rows whose domain occurs more than once AND whose normalised pattern text
# mentions 'notifikuj'; show how many there are.
shares_domain = df.duplicated(subset=['Domain'], keep=False)
mentions_notifikuj = df['pattern_string_lower_case'].str.contains('notifikuj')
df_dupl = df[shares_domain & mentions_notifikuj]
df_dupl.shape

(33, 9)

## Save CSV

In [28]:
# Remove the helper key column so it is not written to the output file.
df = df.drop(columns=['pattern_string_lower_case'])

In [29]:
# Frame dimensions of the data about to be saved.
df.shape

(1430, 8)

In [30]:
# Order rows by pattern text, then domain (both ascending), and write the
# cleaned dataset as a semicolon-separated CSV without the index.
df = df.sort_values(by=['Pattern String', 'Domain'])
df.to_csv('../../../data/dark_patterns.csv', sep=";", index=False)

In [31]:
# Preview only the first rows of the saved data. 'display.max_rows' was set
# to None earlier in this notebook, so a bare `df` would render every row.
df.head(10)

Unnamed: 0,Pattern String,Comment,Pattern Category,Pattern Type,Where in website,Deceptive,Website Page,Domain
6,,"Registration is not necessary, but heavily pressured",Misdirection,Visual Interference,Checkout Process,No,https://apexis.sk/objednavka,apexis.sk
42,,Shows their reaction for the Testimonials of Uncertain Origin on Heureka.cz in a form like they were the actual testimonial,Misdirection,Visual Interference,Product Detail,No,https://eshop.alfafit.cz/extrifit-cfm-instant-whey-80-1000-g/,eshop.alfafit.cz
164,,Use email or sign up with social network profile.,Misdirection,Visual Interference,Checkout Process,No,https://postovnezdarma.cz/kosik/3/,postovnezdarma.cz
191,,Use email or sign up with social network profile.,Misdirection,Visual Interference,Checkout Process,No,https://skladovky.cz/kosik/3/,skladovky.cz
644,,Preselected the most expensive variation,Misdirection,Pressured Selling,Product Detail,No,https://www.fotokeramika.cz/ozdoby-na-hroby/sochy-jezise-krista/383-1004-kristus-na-krizi,www.fotokeramika.cz
806,,Cannot buy without creating an account,Forced Action,Forced Enrollment,Checkout Process,No,http://www.kovarna.amstuning.cz/authentication.php?back=order.php?step=1,www.kovarna.amstuning.cz
839,,In a popup,Misdirection,Pressured Selling,Product Detail,No,https://www.kutiliste.cz/sklenik-gutta-gardentec-classic-rozmery-2-x-3-m-zaskleni-polykarbonat-4-mm#detail-3,www.kutiliste.cz
887,,No information how the tertimonials were gathered,Social Proof,Testimonials of Uncertain Origin,Product Detail,??,https://www.lopra.cz/p/5175801-francouzske-povleceni-saten-220x200-70x90cm-sonet-grey-vyber-zapinani-zipovy-uzaver,www.lopra.cz
912,,Forced registration,Forced Action,Forced Enrollment,Checkout Process,No,https://www.martinus.cz/login?source=cart&redirect=%2Fkosik%2Fdorucenie-platba,www.martinus.cz
1356,,User is coercered to register to continue in checkout process,Misdirection,Visual Interference,Checkout Process,No,https://www.topvet.cz/index.php?&desktop=eshop&action=objednavka&id=0,www.topvet.cz
