In [1]:
import pandas as pd
import re

In [7]:
df = pd.read_csv('Data.csv')

In [3]:
df.head()

Unnamed: 0,Data
0,Watch or listen live weekdays at 8:30am MT at ...
1,Watch or listen live weekdays at 8:30am MT at ...
2,"Chubby And Hot, Always Stir The Pot!"
3,"Chubby And Hot, Always Stir The Pot!"
4,"Journalist, publisher of Rebel News — telling ..."


# Q9: Code to substitute all non-alphanumeric characters with a new line

In [4]:
non_alnum_pattern = r'[^a-zA-Z0-9\s]'  
non_alnum_records = df[df['Data'].str.contains(non_alnum_pattern, regex=True, na=False)]
non_alnum_count = len(non_alnum_records)

print(f"Number of records with non-alphanumeric characters: {non_alnum_count}\n")

if not non_alnum_records.empty:
    # Create a copy of the affected records for display purposes
    display_records = non_alnum_records.copy()
    
    # Count duplicates
    duplicate_counts = display_records['Data'].value_counts()
    
    print("=" * 80)
    print("RECORDS WITH NON-ALPHANUMERIC CHARACTERS (UNIQUE RECORDS WITH COUNTS)")
    print("=" * 80)
    
    # Track printed records to avoid repeats
    printed_records = set()
    
    for record in display_records['Data']:
        if record not in printed_records:
            printed_records.add(record)
            
            count = duplicate_counts[record]
            print("\n" + "=" * 80)
            print(f"[ORIGINAL RECORD] (Appears {count} time{'s' if count > 1 else ''})")
            
            # Replace non-alphanumeric chars with newline
            modified_record = re.sub(
                r'[^a-zA-Z0-9\s]', 
                '\n', 
                record
            )
            
            print("\n[MODIFIED RECORD (Non-alphanumeric → Newline ]:")
            print("-" * 80)
            print(repr(modified_record))
    
    # ACTUALLY MODIFY THE DATAFRAME by replacing non-alphanumeric characters with newlines
    df['Data'] = df['Data'].str.replace(non_alnum_pattern, '\n', regex=True)
    print("\nAll non-alphanumeric characters in the 'Data' column have been replaced with newlines.")
else:
    print("No records contain non-alphanumeric characters.")

Number of records with non-alphanumeric characters: 7069

RECORDS WITH NON-ALPHANUMERIC CHARACTERS (UNIQUE RECORDS WITH COUNTS)

[ORIGINAL RECORD] (Appears 2 times)

[MODIFIED RECORD (Non-alphanumeric → Newline ]:
--------------------------------------------------------------------------------
'Watch or listen live weekdays at 8\n30am MT at ryanjespersen\ncom\n Subscribe via YouTube or your favourite podcast app\n \nRealTalkRJ'

[ORIGINAL RECORD] (Appears 2 times)

[MODIFIED RECORD (Non-alphanumeric → Newline ]:
--------------------------------------------------------------------------------
'Chubby And Hot\n Always Stir The Pot\n'

[ORIGINAL RECORD] (Appears 1 time)

[MODIFIED RECORD (Non-alphanumeric → Newline ]:
--------------------------------------------------------------------------------
'Journalist\n publisher of Rebel News \n telling the other side of the story\n Awarded the Queen\ns Diamond Jubilee Medal for advancing freedom of expression\n'

[ORIGINAL RECORD] (Appears 1 tim

## Code to substitute all non-alphanumeric characters with a new line excluding time formats and .com formats in URLs and emails

In [8]:
non_alnum_pattern = r'(?!(?<=\d)[:\/](?=\d))[^a-zA-Z0-9\s]'  # Excludes time formats
non_alnum_records = df[df['Data'].str.contains(non_alnum_pattern, regex=True, na=False)]
non_alnum_count = len(non_alnum_records)

print(f"Number of records with non-alphanumeric characters: {non_alnum_count}\n")

if not non_alnum_records.empty:
    # Count duplicates
    duplicate_counts = non_alnum_records['Data'].value_counts()
    
    print("=" * 80)
    print("RECORDS WITH NON-ALPHANUMERIC CHARACTERS (UNIQUE RECORDS WITH COUNTS)")
    print("=" * 80)
    
    # Track printed records to avoid repeats
    printed_records = set()
    
    for record in non_alnum_records['Data']:
        if record not in printed_records:
            printed_records.add(record)
            
            count = duplicate_counts[record]
            print("\n" + "=" * 80)
            print(f"[ORIGINAL RECORD] (Appears {count} time{'s' if count > 1 else ''})")
            print("-" * 80)
            print(repr(record))
            
            # Replace non-alphanumeric chars with newline, preserving time/email/URL patterns
            modified_record = re.sub(
                r'(?!(?<=\d)[:\/](?=\d))'  # Don't replace : or / between digits (time/date)
                r'(?!(?:@|\.(?:com|org|net|edu|gov)\b))'  # Don't replace in emails/URLs
                r'[^a-zA-Z0-9\s]', 
                '\n', 
                record
            )
            
            print("\n[MODIFIED RECORD (Non-alphanumeric → Newline, preserving times/emails/URLs)]:")
            print("-" * 80)
            print(repr(modified_record))
            
            print("\n[MODIFIED RECORD with new lines]:")
            print("-" * 80)
            print(modified_record)
            print("=" * 80 + "\n")

    # ACTUALLY MODIFY THE DATAFRAME by replacing non-alphanumeric characters with newlines
    df['Data'] = df['Data'].str.replace(non_alnum_pattern, '\n', regex=True)
    print("\nAll non-alphanumeric characters in the 'Data' column have been replaced with newlines.")
else:
    print("No records contain non-alphanumeric characters.")

Number of records with non-alphanumeric characters: 7069

RECORDS WITH NON-ALPHANUMERIC CHARACTERS (UNIQUE RECORDS WITH COUNTS)

[ORIGINAL RECORD] (Appears 2 times)
--------------------------------------------------------------------------------
'Watch or listen live weekdays at 8:30am MT at ryanjespersen.com. Subscribe via YouTube or your favourite podcast app. #RealTalkRJ'

[MODIFIED RECORD (Non-alphanumeric → Newline, preserving times/emails/URLs)]:
--------------------------------------------------------------------------------
'Watch or listen live weekdays at 8:30am MT at ryanjespersen.com\n Subscribe via YouTube or your favourite podcast app\n \nRealTalkRJ'

[MODIFIED RECORD with new lines]:
--------------------------------------------------------------------------------
Watch or listen live weekdays at 8:30am MT at ryanjespersen.com
 Subscribe via YouTube or your favourite podcast app
 
RealTalkRJ


[ORIGINAL RECORD] (Appears 2 times)
-------------------------------------------